11 years ago · d69ef83fcc
--- a/ccan/ntdb/ABI/ntdb-0.9.sigs
+++ b/ccan/ntdb/ABI/ntdb-0.9.sigs
@@ -0,0 +1,38 @@
 
				+ntdb_add_flag: void (struct ntdb_context *, unsigned int)
			
 
				+ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
			
 
				+ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_close: int (struct ntdb_context *)
			
 
				+ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_errorstr: const char *(enum NTDB_ERROR)
			
 
				+ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_fd: int (const struct ntdb_context *)
			
 
				+ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
			
 
				+ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
			
 
				+ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
			
 
				+ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
			
 
				+ntdb_get_flags: unsigned int (struct ntdb_context *)
			
 
				+ntdb_get_seqnum: int64_t (struct ntdb_context *)
			
 
				+ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_name: const char *(const struct ntdb_context *)
			
 
				+ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
			
 
				+ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
			
 
				+ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
			
 
				+ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
			
 
				+ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
			
 
				+ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
			
 
				+ntdb_transaction_cancel: void (struct ntdb_context *)
			
 
				+ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_unlockall: void (struct ntdb_context *)
			
 
				+ntdb_unlockall_read: void (struct ntdb_context *)
			
 
				+ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
			
 
				+ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
			
--- a/ccan/ntdb/ABI/ntdb-1.0.sigs
+++ b/ccan/ntdb/ABI/ntdb-1.0.sigs
@@ -0,0 +1,38 @@
 
				+ntdb_add_flag: void (struct ntdb_context *, unsigned int)
			
 
				+ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
			
 
				+ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_close: int (struct ntdb_context *)
			
 
				+ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_errorstr: const char *(enum NTDB_ERROR)
			
 
				+ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
			
 
				+ntdb_fd: int (const struct ntdb_context *)
			
 
				+ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
			
 
				+ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
			
 
				+ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
			
 
				+ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
			
 
				+ntdb_get_flags: unsigned int (struct ntdb_context *)
			
 
				+ntdb_get_seqnum: int64_t (struct ntdb_context *)
			
 
				+ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_name: const char *(const struct ntdb_context *)
			
 
				+ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
			
 
				+ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
			
 
				+ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
			
 
				+ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
			
 
				+ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
			
 
				+ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
			
 
				+ntdb_transaction_cancel: void (struct ntdb_context *)
			
 
				+ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
			
 
				+ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
			
 
				+ntdb_unlockall: void (struct ntdb_context *)
			
 
				+ntdb_unlockall_read: void (struct ntdb_context *)
			
 
				+ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
			
 
				+ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
			
--- a/ccan/ntdb/LICENSE
+++ b/ccan/ntdb/LICENSE
@@ -0,0 +1,165 @@
 
				+		   GNU LESSER GENERAL PUBLIC LICENSE
			
 
				+                       Version 3, 29 June 2007
			
 
				+
			
 
				+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
			
 
				+ Everyone is permitted to copy and distribute verbatim copies
			
 
				+ of this license document, but changing it is not allowed.
			
 
				+
			
 
				+
			
 
				+  This version of the GNU Lesser General Public License incorporates
			
 
				+the terms and conditions of version 3 of the GNU General Public
			
 
				+License, supplemented by the additional permissions listed below.
			
 
				+
			
 
				+  0. Additional Definitions.
			
 
				+
			
 
				+  As used herein, "this License" refers to version 3 of the GNU Lesser
			
 
				+General Public License, and the "GNU GPL" refers to version 3 of the GNU
			
 
				+General Public License.
			
 
				+
			
 
				+  "The Library" refers to a covered work governed by this License,
			
 
				+other than an Application or a Combined Work as defined below.
			
 
				+
			
 
				+  An "Application" is any work that makes use of an interface provided
			
 
				+by the Library, but which is not otherwise based on the Library.
			
 
				+Defining a subclass of a class defined by the Library is deemed a mode
			
 
				+of using an interface provided by the Library.
			
 
				+
			
 
				+  A "Combined Work" is a work produced by combining or linking an
			
 
				+Application with the Library.  The particular version of the Library
			
 
				+with which the Combined Work was made is also called the "Linked
			
 
				+Version".
			
 
				+
			
 
				+  The "Minimal Corresponding Source" for a Combined Work means the
			
 
				+Corresponding Source for the Combined Work, excluding any source code
			
 
				+for portions of the Combined Work that, considered in isolation, are
			
 
				+based on the Application, and not on the Linked Version.
			
 
				+
			
 
				+  The "Corresponding Application Code" for a Combined Work means the
			
 
				+object code and/or source code for the Application, including any data
			
 
				+and utility programs needed for reproducing the Combined Work from the
			
 
				+Application, but excluding the System Libraries of the Combined Work.
			
 
				+
			
 
				+  1. Exception to Section 3 of the GNU GPL.
			
 
				+
			
 
				+  You may convey a covered work under sections 3 and 4 of this License
			
 
				+without being bound by section 3 of the GNU GPL.
			
 
				+
			
 
				+  2. Conveying Modified Versions.
			
 
				+
			
 
				+  If you modify a copy of the Library, and, in your modifications, a
			
 
				+facility refers to a function or data to be supplied by an Application
			
 
				+that uses the facility (other than as an argument passed when the
			
 
				+facility is invoked), then you may convey a copy of the modified
			
 
				+version:
			
 
				+
			
 
				+   a) under this License, provided that you make a good faith effort to
			
 
				+   ensure that, in the event an Application does not supply the
			
 
				+   function or data, the facility still operates, and performs
			
 
				+   whatever part of its purpose remains meaningful, or
			
 
				+
			
 
				+   b) under the GNU GPL, with none of the additional permissions of
			
 
				+   this License applicable to that copy.
			
 
				+
			
 
				+  3. Object Code Incorporating Material from Library Header Files.
			
 
				+
			
 
				+  The object code form of an Application may incorporate material from
			
 
				+a header file that is part of the Library.  You may convey such object
			
 
				+code under terms of your choice, provided that, if the incorporated
			
 
				+material is not limited to numerical parameters, data structure
			
 
				+layouts and accessors, or small macros, inline functions and templates
			
 
				+(ten or fewer lines in length), you do both of the following:
			
 
				+
			
 
				+   a) Give prominent notice with each copy of the object code that the
			
 
				+   Library is used in it and that the Library and its use are
			
 
				+   covered by this License.
			
 
				+
			
 
				+   b) Accompany the object code with a copy of the GNU GPL and this license
			
 
				+   document.
			
 
				+
			
 
				+  4. Combined Works.
			
 
				+
			
 
				+  You may convey a Combined Work under terms of your choice that,
			
 
				+taken together, effectively do not restrict modification of the
			
 
				+portions of the Library contained in the Combined Work and reverse
			
 
				+engineering for debugging such modifications, if you also do each of
			
 
				+the following:
			
 
				+
			
 
				+   a) Give prominent notice with each copy of the Combined Work that
			
 
				+   the Library is used in it and that the Library and its use are
			
 
				+   covered by this License.
			
 
				+
			
 
				+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
			
 
				+   document.
			
 
				+
			
 
				+   c) For a Combined Work that displays copyright notices during
			
 
				+   execution, include the copyright notice for the Library among
			
 
				+   these notices, as well as a reference directing the user to the
			
 
				+   copies of the GNU GPL and this license document.
			
 
				+
			
 
				+   d) Do one of the following:
			
 
				+
			
 
				+       0) Convey the Minimal Corresponding Source under the terms of this
			
 
				+       License, and the Corresponding Application Code in a form
			
 
				+       suitable for, and under terms that permit, the user to
			
 
				+       recombine or relink the Application with a modified version of
			
 
				+       the Linked Version to produce a modified Combined Work, in the
			
 
				+       manner specified by section 6 of the GNU GPL for conveying
			
 
				+       Corresponding Source.
			
 
				+
			
 
				+       1) Use a suitable shared library mechanism for linking with the
			
 
				+       Library.  A suitable mechanism is one that (a) uses at run time
			
 
				+       a copy of the Library already present on the user's computer
			
 
				+       system, and (b) will operate properly with a modified version
			
 
				+       of the Library that is interface-compatible with the Linked
			
 
				+       Version.
			
 
				+
			
 
				+   e) Provide Installation Information, but only if you would otherwise
			
 
				+   be required to provide such information under section 6 of the
			
 
				+   GNU GPL, and only to the extent that such information is
			
 
				+   necessary to install and execute a modified version of the
			
 
				+   Combined Work produced by recombining or relinking the
			
 
				+   Application with a modified version of the Linked Version. (If
			
 
				+   you use option 4d0, the Installation Information must accompany
			
 
				+   the Minimal Corresponding Source and Corresponding Application
			
 
				+   Code. If you use option 4d1, you must provide the Installation
			
 
				+   Information in the manner specified by section 6 of the GNU GPL
			
 
				+   for conveying Corresponding Source.)
			
 
				+
			
 
				+  5. Combined Libraries.
			
 
				+
			
 
				+  You may place library facilities that are a work based on the
			
 
				+Library side by side in a single library together with other library
			
 
				+facilities that are not Applications and are not covered by this
			
 
				+License, and convey such a combined library under terms of your
			
 
				+choice, if you do both of the following:
			
 
				+
			
 
				+   a) Accompany the combined library with a copy of the same work based
			
 
				+   on the Library, uncombined with any other library facilities,
			
 
				+   conveyed under the terms of this License.
			
 
				+
			
 
				+   b) Give prominent notice with the combined library that part of it
			
 
				+   is a work based on the Library, and explaining where to find the
			
 
				+   accompanying uncombined form of the same work.
			
 
				+
			
 
				+  6. Revised Versions of the GNU Lesser General Public License.
			
 
				+
			
 
				+  The Free Software Foundation may publish revised and/or new versions
			
 
				+of the GNU Lesser General Public License from time to time. Such new
			
 
				+versions will be similar in spirit to the present version, but may
			
 
				+differ in detail to address new problems or concerns.
			
 
				+
			
 
				+  Each version is given a distinguishing version number. If the
			
 
				+Library as you received it specifies that a certain numbered version
			
 
				+of the GNU Lesser General Public License "or any later version"
			
 
				+applies to it, you have the option of following the terms and
			
 
				+conditions either of that published version or of any later version
			
 
				+published by the Free Software Foundation. If the Library as you
			
 
				+received it does not specify a version number of the GNU Lesser
			
 
				+General Public License, you may choose any version of the GNU Lesser
			
 
				+General Public License ever published by the Free Software Foundation.
			
 
				+
			
 
				+  If the Library as you received it specifies that a proxy can decide
			
 
				+whether future versions of the GNU Lesser General Public License shall
			
 
				+apply, that proxy's public statement of acceptance of any version is
			
 
				+permanent authorization for you to choose that version for the
			
 
				+Library.
			
--- a/ccan/ntdb/Makefile
+++ b/ccan/ntdb/Makefile
@@ -0,0 +1,67 @@
 
				+# Simple makefile wrapper that forwards every target to waf.
				+#
				+# WAF puts the bundled buildtools directories first on PATH and sets
				+# WAF_MAKE=1 in waf's environment.
				+
				+WAF=WAF_MAKE=1 PATH=buildtools/bin:../../buildtools/bin:$$PATH waf
				+
				+# None of these targets produce a file of the same name; declare them
				+# phony so a stray file or directory called e.g. "dist", "test" or
				+# "install" cannot make the target look up to date.
				+.PHONY: all install uninstall test testenv quicktest dist distcheck \
				+	clean distclean reconfigure show_waf_options everything \
				+	testsuite check torture installcheck etags ctags pydoctor
				+
				+all:
				+	$(WAF) build
				+
				+install:
				+	$(WAF) install
				+
				+uninstall:
				+	$(WAF) uninstall
				+
				+test: FORCE
				+	$(WAF) test $(TEST_OPTIONS)
				+
				+testenv:
				+	$(WAF) test --testenv $(TEST_OPTIONS)
				+
				+quicktest:
				+	$(WAF) test --quick $(TEST_OPTIONS)
				+
				+# dist and distcheck run waf with WAFLOCK pointing at a scratch lock file.
				+dist:
				+	touch .tmplock
				+	WAFLOCK=.tmplock $(WAF) dist
				+
				+distcheck:
				+	touch .tmplock
				+	WAFLOCK=.tmplock $(WAF) distcheck
				+
				+clean:
				+	$(WAF) clean
				+
				+distclean:
				+	$(WAF) distclean
				+
				+reconfigure: configure
				+	$(WAF) reconfigure
				+
				+show_waf_options:
				+	$(WAF) --help
				+
				+# some compatibility make targets
				+everything: all
				+
				+testsuite: all
				+
				+check: test
				+
				+torture: all
				+
				+# this should do an install as well, once install is finished
				+installcheck: test
				+
				+etags:
				+	$(WAF) etags
				+
				+ctags:
				+	$(WAF) ctags
				+
				+pydoctor:
				+	$(WAF) pydoctor
				+
				+# Anything under bin/ is built by asking waf for the target of that name.
				+bin/%:: FORCE
				+	$(WAF) --targets=`basename $@`
				+
				+# Empty rule used as a prerequisite to force the rules above to run.
				+FORCE:
			
--- a/ccan/ntdb/check.c
+++ b/ccan/ntdb/check.c
@@ -0,0 +1,726 @@
 
				+ /*
			
 
				+   Trivial Database 2: database consistency checking
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/likely/likely.h>
			
 
				+#include <ccan/asearch/asearch.h>
			
 
				+
			
 
				+/*
				+ * Add one offset to the end of a growable array of offsets.
				+ *
				+ * The first element is obtained from ntdb->alloc_fn; later elements
				+ * grow the existing array through ntdb->expand_fn.  On success *arr
				+ * points at the (possibly moved) array and *num is one larger.  On
				+ * allocation failure false is returned and *arr / *num are untouched.
				+ *
				+ * NOTE(review): the original comment calls this an "ordered" array;
				+ * nothing here sorts, so ordering presumably comes from the order in
				+ * which callers append -- confirm against the callers.
				+ */
				+static bool append(struct ntdb_context *ntdb,
				+		   ntdb_off_t **arr, size_t *num, ntdb_off_t off)
				+{
				+	const size_t count = *num;
				+	ntdb_off_t *grown;
				+
				+	if (count != 0) {
				+		/* Already have storage: make room for one more entry. */
				+		grown = ntdb->expand_fn(*arr,
				+					(count + 1) * sizeof(ntdb_off_t),
				+					ntdb->alloc_data);
				+	} else {
				+		/* Nothing allocated yet: start with a single slot. */
				+		grown = ntdb->alloc_fn(ntdb, sizeof(ntdb_off_t),
				+				       ntdb->alloc_data);
				+	}
				+
				+	if (!grown)
				+		return false;
				+
				+	grown[count] = off;
				+	*num = count + 1;
				+	*arr = grown;
				+	return true;
				+}
			
 
				+
			
 
				+/*
				+ * Validate the database header stored at offset 0.
				+ *
				+ * Checks, in order: the stored hash self-test value, the magic string,
				+ * that features_used is a subset of features_offered, that a non-zero
				+ * recovery offset lies between the end of the header and the mapped
				+ * size, and finally walks the capability list.
				+ *
				+ * @recovery:         receives hdr.recovery.
				+ * @features:         receives hdr.features_offered.
				+ * @num_capabilities: incremented once per capability record accepted;
				+ *                    it is not reset here, so the caller must
				+ *                    initialise it.
				+ *
				+ * Returns NTDB_SUCCESS, the error from a failing read/access helper,
				+ * the non-zero result of unknown_capability(), or whatever
				+ * ntdb_logerr() returns for NTDB_ERR_CORRUPT.
				+ */
				+static enum NTDB_ERROR check_header(struct ntdb_context *ntdb,
				+				    ntdb_off_t *recovery,
				+				    uint64_t *features,
				+				    size_t *num_capabilities)
				+{
				+	uint64_t hash_test;
				+	struct ntdb_header hdr;
				+	enum NTDB_ERROR ecode;
				+	ntdb_off_t off, next;
				+
				+	ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+	/* magic food should not be converted, so convert back. */
				+	ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));
				+
				+	hash_test = NTDB_HASH_MAGIC;
				+	hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
				+	if (hdr.hash_test != hash_test) {
				+		/* %llu takes unsigned long long, so cast accordingly. */
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "check: hash test %llu should be %llu",
				+				  (unsigned long long)hdr.hash_test,
				+				  (unsigned long long)hash_test);
				+	}
				+
				+	/*
				+	 * Bounded compare: magic_food comes straight from the file and
				+	 * need not be NUL-terminated (the log below already allows for
				+	 * that by using %.*s).
				+	 */
				+	if (strncmp(hdr.magic_food, NTDB_MAGIC_FOOD,
				+		    sizeof(hdr.magic_food)) != 0) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "check: bad magic '%.*s'",
				+				  (unsigned)sizeof(hdr.magic_food),
				+				  hdr.magic_food);
				+	}
				+
				+	/* Features which are used must be a subset of features offered. */
				+	if (hdr.features_used & ~hdr.features_offered) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "check: features used (0x%llx) which"
				+				  " are not offered (0x%llx)",
				+				  (unsigned long long)hdr.features_used,
				+				  (unsigned long long)hdr.features_offered);
				+	}
				+
				+	*features = hdr.features_offered;
				+	*recovery = hdr.recovery;
				+	if (*recovery) {
				+		/* A recovery area cannot overlap the header or lie past
				+		 * the end of the mapped file. */
				+		if (*recovery < sizeof(hdr)
				+		    || *recovery > ntdb->file->map_size) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+					  "ntdb_check:"
				+					  " invalid recovery offset %zu",
				+					  (size_t)*recovery);
				+		}
				+	}
				+
				+	/*
				+	 * Follow the linked list of capability records.  Every early
				+	 * exit inside the loop returns directly, so no extra error
				+	 * test is needed in the loop condition.
				+	 */
				+	for (off = hdr.capabilities; off; off = next) {
				+		const struct ntdb_capability *cap;
				+		enum NTDB_ERROR e;
				+
				+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
				+		if (NTDB_PTR_IS_ERR(cap)) {
				+			return NTDB_PTR_ERR(cap);
				+		}
				+
				+		/* All capabilities are unknown. */
				+		e = unknown_capability(ntdb, "ntdb_check", cap->type);
				+		/* Grab the link before the record is released. */
				+		next = cap->next;
				+		ntdb_access_release(ntdb, cap);
				+		if (e)
				+			return e;
				+		(*num_capabilities)++;
				+	}
				+
				+	/* Don't check reserved: they *can* be used later. */
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Three-way comparison of two offsets for use as a search comparator:
				+ * returns -1, 0 or 1 when *a is below, equal to or above *b.
				+ */
				+static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b)
				+{
				+	/* Compare instead of subtracting: a difference can overflow an int. */
				+	if (*a > *b)
				+		return 1;
				+	if (*a < *b)
				+		return -1;
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Check a single hash slot value that is expected to reference a
				+ * key/data record.
				+ *
				+ * @off_and_hash: raw slot contents (record offset plus extra bits).
				+ * @bucket:       hash bucket the slot was found in.
				+ * @used:         array of used-record offsets searched with off_cmp.
				+ * @num_used:     number of elements in @used.
				+ * @num_found:    incremented when the offset is located in @used.
				+ * @check/@data:  optional caller callback run on the key and data of
				+ *                a record that passed the hash checks.
				+ *
				+ * A zero slot is accepted as empty.  Otherwise the slot must not carry
				+ * the chain bit, its offset must be present in @used, and the hash of
				+ * the record's key must agree with both @bucket and the hash bits kept
				+ * in the top of the slot value.
				+ */
				+static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb,
				+				   ntdb_off_t off_and_hash,
				+				   ntdb_len_t bucket,
				+				   ntdb_off_t used[],
				+				   size_t num_used,
				+				   size_t *num_found,
				+				   enum NTDB_ERROR (*check)(NTDB_DATA,
				+							    NTDB_DATA,
				+							    void *),
				+				   void *data)
				+{
				+	ntdb_off_t rec_off = off_and_hash & NTDB_OFF_MASK;
				+	ntdb_off_t *slot;
				+	const struct ntdb_used_record *hdr;
				+	const unsigned char *payload;
				+	ntdb_len_t key_len, data_len;
				+	uint32_t key_hash;
				+	enum NTDB_ERROR result;
				+
				+	/* Empty bucket is fine. */
				+	if (off_and_hash == 0) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/* This can't point to a chain, we handled those at toplevel. */
				+	if ((off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) != 0) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				   "ntdb_check: Invalid chain bit in offset "
				+				   " %llu", (long long)off_and_hash);
				+	}
				+
				+	/* The record must be one of the used records we know about. */
				+	slot = asearch(&rec_off, used, num_used, off_cmp);
				+	if (slot == NULL) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				   "ntdb_check: Invalid offset"
				+				   " %llu in hash", (long long)rec_off);
				+	}
				+
				+	/*
				+	 * Toggle the low bit of the stored offset ("mark it invalid")
				+	 * and count the record as found.
				+	 */
				+	*slot ^= 1;
				+	*num_found += 1;
				+
				+	/* Read the record header just long enough to get the lengths. */
				+	hdr = ntdb_access_read(ntdb, rec_off, sizeof(*hdr), true);
				+	if (NTDB_PTR_IS_ERR(hdr)) {
				+		return NTDB_PTR_ERR(hdr);
				+	}
				+	key_len = rec_key_length(hdr);
				+	data_len = rec_data_length(hdr);
				+	ntdb_access_release(ntdb, hdr);
				+
				+	/* Key bytes are followed directly by the data bytes. */
				+	payload = ntdb_access_read(ntdb, rec_off + sizeof(*hdr),
				+				   key_len + data_len, false);
				+	if (NTDB_PTR_IS_ERR(payload)) {
				+		return NTDB_PTR_ERR(payload);
				+	}
				+
				+	key_hash = ntdb_hash(ntdb, payload, key_len);
				+
				+	result = NTDB_SUCCESS;
				+	if (bits_from(key_hash, 0, ntdb->hash_bits) != bucket) {
				+		/* Are we in the right chain?  Apparently not. */
				+		result = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				+				     NTDB_LOG_ERROR,
				+				     "ntdb_check: Bad bucket %u vs %llu",
				+				     bits_from(key_hash, 0, ntdb->hash_bits),
				+				     (long long)bucket);
				+	} else if (bits_from(key_hash, ntdb->hash_bits,
				+			     NTDB_OFF_UPPER_STEAL)
				+		   != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL,
				+				NTDB_OFF_UPPER_STEAL)) {
				+		/*
				+		 * The NTDB_OFF_UPPER_STEAL hash bits after the bucket
				+		 * bits must equal the bits kept at the top of the slot.
				+		 */
				+		result = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				+				     NTDB_LOG_ERROR,
				+				     "ntdb_check: Bad hash bits %llu vs %llu",
				+				     (long long)off_and_hash,
				+				     (long long)key_hash);
				+	} else if (check != NULL) {
				+		/* Hand the record to the caller's own checker. */
				+		NTDB_DATA key, val;
				+
				+		key = ntdb_mkdata(payload, key_len);
				+		val = ntdb_mkdata(payload + key_len, data_len);
				+		result = check(key, val, data);
				+	}
				+
				+	ntdb_access_release(ntdb, payload);
				+	return result;
				+}
			
 
				+
			
 
				+/*
				+ * Check a hash chain record located at @off and every slot inside it.
				+ *
				+ * The chain record must carry NTDB_CHAIN_MAGIC, have no key, and have
				+ * a data length that is a whole number of ntdb_off_t slots.  Each slot
				+ * is then validated with check_entry() against @bucket.  The chain
				+ * record itself is counted in *num_found.
				+ *
				+ * Returns the first error met, or the result of the last successful
				+ * step when everything passes.
				+ */
				+static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
				+					ntdb_off_t off,
				+					ntdb_len_t bucket,
				+					ntdb_off_t used[],
				+					size_t num_used,
				+					size_t *num_found,
				+					enum NTDB_ERROR (*check)(NTDB_DATA,
				+								 NTDB_DATA,
				+								 void *),
				+					void *data)
				+{
				+	struct ntdb_used_record chain_hdr;
				+	const ntdb_off_t *slots;
				+	ntdb_len_t idx, nslots;
				+	enum NTDB_ERROR status;
				+
				+	/* This is a used entry. */
				+	*num_found += 1;
				+
				+	status = ntdb_read_convert(ntdb, off, &chain_hdr, sizeof(chain_hdr));
				+	if (status != NTDB_SUCCESS) {
				+		return status;
				+	}
				+
				+	/* Header sanity: right magic, slot-sized payload, no key. */
				+	if (rec_magic(&chain_hdr) != NTDB_CHAIN_MAGIC) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash chain magic %llu",
				+				  (long long)rec_magic(&chain_hdr));
				+	}
				+
				+	if ((rec_data_length(&chain_hdr) % sizeof(ntdb_off_t)) != 0) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash chain data length %llu",
				+				  (long long)rec_data_length(&chain_hdr));
				+	}
				+
				+	if (rec_key_length(&chain_hdr) != 0) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash chain key length %llu",
				+				  (long long)rec_key_length(&chain_hdr));
				+	}
				+
				+	/* The slots start right after the record header. */
				+	nslots = rec_data_length(&chain_hdr) / sizeof(ntdb_off_t);
				+	off += sizeof(chain_hdr);
				+	slots = ntdb_access_read(ntdb, off, rec_data_length(&chain_hdr),
				+				 true);
				+	if (NTDB_PTR_IS_ERR(slots)) {
				+		return NTDB_PTR_ERR(slots);
				+	}
				+
				+	/* Check each slot in the chain, stopping at the first failure. */
				+	idx = 0;
				+	while (idx < nslots) {
				+		status = check_entry(ntdb, slots[idx], bucket,
				+				     used, num_used, num_found,
				+				     check, data);
				+		if (status) {
				+			break;
				+		}
				+		idx++;
				+	}
				+
				+	ntdb_access_release(ntdb, slots);
				+	return status;
				+}
			
 
				+
			
 
				+/*
				+ * Check the top-level hash table stored at NTDB_HASH_OFFSET.
				+ *
				+ * The table record must carry NTDB_HTABLE_MAGIC, have no key, and hold
				+ * exactly one ntdb_off_t per bucket (2^hash_bits of them).  Buckets
				+ * with the chain bit set are handed to check_hash_chain(); all others
				+ * go to check_entry().
				+ *
				+ * @used/@num_used: array of used-record offsets and its length.
				+ * @num_other_used: used records accounted for elsewhere (free tables
				+ *                  and capabilities, per the comment below).
				+ * @check/@data:    optional per-record callback, passed through.
				+ *
				+ * After a clean walk, the number of records found must equal
				+ * @num_used; otherwise NTDB_ERR_CORRUPT is logged.
				+ */
				+static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
				+				  ntdb_off_t used[],
				+				  size_t num_used,
				+				  size_t num_other_used,
				+				  enum NTDB_ERROR (*check)(NTDB_DATA,
				+							   NTDB_DATA,
				+							   void *),
				+				  void *data)
				+{
				+	enum NTDB_ERROR ecode;
				+	struct ntdb_used_record rec;
				+	const ntdb_off_t *entries;
				+	ntdb_len_t i, num_buckets;
				+	/* Free tables and capabilities also show up as used, as do we. */
				+	size_t num_found = num_other_used + 1;
				+
				+	ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec));
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash table magic %llu",
				+				  (unsigned long long)rec_magic(&rec));
				+	}
				+
				+	if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash table data length %llu",
				+				  (unsigned long long)rec_data_length(&rec));
				+	}
				+
				+	if (rec_key_length(&rec) != 0) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: Bad hash table key length %llu",
				+				  (unsigned long long)rec_key_length(&rec));
				+	}
				+
				+	entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec),
				+				   rec_data_length(&rec), true);
				+	if (NTDB_PTR_IS_ERR(entries)) {
				+		return NTDB_PTR_ERR(entries);
				+	}
				+
				+	/*
				+	 * Do the shift in the index type.  A bare "1 << hash_bits" is
				+	 * an int shift: it overflows for large hash_bits and, when
				+	 * negative, converts to a huge bound in the comparison with i.
				+	 */
				+	num_buckets = (ntdb_len_t)1 << ntdb->hash_bits;
				+	for (i = 0; i < num_buckets; i++) {
				+		ntdb_off_t off = entries[i] & NTDB_OFF_MASK;
				+		if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) {
				+			/* Bucket overflowed into a separate chain record. */
				+			ecode = check_hash_chain(ntdb, off, i,
				+						 used, num_used, &num_found,
				+						 check, data);
				+		} else {
				+			ecode = check_entry(ntdb, entries[i], i,
				+					    used, num_used, &num_found,
				+					    check, data);
				+		}
				+		if (ecode) {
				+			break;
				+		}
				+	}
				+	ntdb_access_release(ntdb, entries);
				+
				+	/* Every used record must have been reached through the hash. */
				+	if (ecode == NTDB_SUCCESS && num_found != num_used) {
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				    "ntdb_check: Not all entries are in hash");
				+	}
				+	return ecode;
				+}
			
 
				+
			
 
				+/*
				+ * Check one free record against the free list it was found on.
				+ *
				+ * @off:    offset of the free record (used for the range check and
				+ *          in diagnostics).
				+ * @frec:   the free record, already read by the caller.
				+ * @prev:   offset of the preceding record on the list; when non-zero
				+ *          it must equal frec_prev(frec).  Zero skips that check.
				+ * @ftable: free table number the record must belong to.
				+ * @bucket: free bucket the record must belong to, i.e.
				+ *          size_to_bucket(frec_len(frec)).
				+ *
				+ * Returns NTDB_SUCCESS, the error from ntdb_oob(), or whatever
				+ * ntdb_logerr() returns for NTDB_ERR_CORRUPT.
				+ */
				+static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
				+				 ntdb_off_t off,
				+				 const struct ntdb_free_record *frec,
				+				 ntdb_off_t prev, unsigned int ftable,
				+				 unsigned int bucket)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	if (frec_magic(frec) != NTDB_FREE_MAGIC) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: offset %llu bad magic 0x%llx",
				+				  (unsigned long long)off,
				+				  (unsigned long long)frec->magic_and_prev);
				+	}
				+	if (frec_ftable(frec) != ftable) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: offset %llu bad freetable %u",
				+				  (unsigned long long)off, frec_ftable(frec));
				+	}
				+
				+	/* The record header plus its free length must pass ntdb_oob(). */
				+	ecode = ntdb_oob(ntdb, off,
				+			 frec_len(frec) + sizeof(struct ntdb_used_record),
				+			 false);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+	if (size_to_bucket(frec_len(frec)) != bucket) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: offset %llu in wrong bucket"
				+				  " (%u vs %u)",
				+				  (unsigned long long)off,
				+				  bucket, size_to_bucket(frec_len(frec)));
				+	}
				+	if (prev && prev != frec_prev(frec)) {
				+		/* Report the two values actually compared: the expected
				+		 * predecessor and the record's own prev link. */
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "ntdb_check: offset %llu bad prev"
				+				  " (%llu vs %llu)",
				+				  (unsigned long long)off,
				+				  (unsigned long long)prev,
				+				  (unsigned long long)frec_prev(frec));
				+	}
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Check one free table and every free list hanging off it.
			
 
				+ *
			
 
				+ * ftable_off is the table's file offset and ftable_num the table number
			
 
				+ * each member record must carry.  fr[] / num_free is the array of free
			
 
				+ * record offsets searched with asearch() and off_cmp; every record
			
 
				+ * reached through a bucket must be present in it.  Each match is flipped
			
 
				+ * with XOR 1 and counted in *num_found.
			
 
				+ *
			
 
				+ * Returns NTDB_SUCCESS, a read error, or the result of ntdb_logerr()
			
 
				+ * called with NTDB_ERR_CORRUPT.
			
 
				+ */
			
 
				+static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
			
 
				+				       ntdb_off_t ftable_off,
			
 
				+				       unsigned ftable_num,
			
 
				+				       ntdb_off_t fr[],
			
 
				+				       size_t num_free,
			
 
				+				       size_t *num_found)
			
 
				+{
			
 
				+	struct ntdb_freetable ft;
			
 
				+	ntdb_off_t h;
			
 
				+	unsigned int i;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* The table header must have the ftable magic, no key, and a data
			
 
				+	 * length covering exactly the rest of struct ntdb_freetable. */
			
 
				+	if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
			
 
				+	    || rec_key_length(&ft.hdr) != 0
			
 
				+	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
			
 
				+				  "ntdb_check: Invalid header on free table");
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
			
 
				+		ntdb_off_t off, prev = 0, *p, first = 0;
			
 
				+		struct ntdb_free_record f;
			
 
				+
			
 
				+		h = bucket_off(ftable_off, i);
			
 
				+		/* Follow the list via f.next until a zero offset. */
			
 
				+		for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
			
 
				+			if (NTDB_OFF_IS_ERR(off)) {
			
 
				+				return NTDB_OFF_TO_ERR(off);
			
 
				+			}
			
 
				+			/* Only the offset read from the bucket head is
			
 
				+			 * masked.  NOTE(review): presumably the head
			
 
				+			 * carries extra bits outside NTDB_OFF_MASK -
			
 
				+			 * confirm against the free-list writer. */
			
 
				+			if (!first) {
			
 
				+				off &= NTDB_OFF_MASK;
			
 
				+				first = off;
			
 
				+			}
			
 
				+			ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+			/* prev is 0 for the first record, which makes
			
 
				+			 * check_free() skip its back-pointer test. */
			
 
				+			ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+
			
 
				+			/* FIXME: Check hash bits */
			
 
				+			p = asearch(&off, fr, num_free, off_cmp);
			
 
				+			if (!p) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: Invalid offset"
			
 
				+						  " %llu in free table",
			
 
				+						  (long long)off);
			
 
				+			}
			
 
				+			/* Mark it invalid. */
			
 
				+			*p ^= 1;
			
 
				+			(*num_found)++;
			
 
				+			prev = off;
			
 
				+		}
			
 
				+
			
 
				+		if (first) {
			
 
				+			/* Now we can check first back pointer. */
			
 
				+			/* prev now holds the last record visited in this
			
 
				+			 * bucket, so the first record is re-checked
			
 
				+			 * against it. */
			
 
				+			ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+			ecode = check_free(ntdb, first, &f, prev, ftable_num, i);
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Count the run of "dead" bytes (value 0 or 0x43) that starts at off.
			
 
				+ *
			
 
				+ * Scans one byte at a time through ntdb->io->tread() until a byte with
			
 
				+ * any other value is seen or ntdb->file->map_size is reached.  Returns
			
 
				+ * the run length, or NTDB_ERR_TO_OFF(error) if a read fails.
			
 
				+ *
			
 
				+ * Slow, but should be very rare.
			
 
				+ */
			
 
				+ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
			
 
				+{
			
 
				+	size_t len;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	for (len = 0; off + len < ntdb->file->map_size; len++) {
			
 
				+		char c;
			
 
				+		/* Read the byte at the current scan position.  Reading
			
 
				+		 * at off every time would test the same byte repeatedly
			
 
				+		 * and never find the end of the run. */
			
 
				+		ecode = ntdb->io->tread(ntdb, off + len, &c, 1);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return NTDB_ERR_TO_OFF(ecode);
			
 
				+		}
			
 
				+		if (c != 0 && c != 0x43)
			
 
				+			break;
			
 
				+	}
			
 
				+	return len;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Linear pass over every record in the file.
			
 
				+ *
			
 
				+ * Starts just past struct ntdb_header and advances by each record's
			
 
				+ * computed length until ntdb->file->map_size.  Records are classified by
			
 
				+ * magic: dead space / invalid recovery, recovery, free, or one of the
			
 
				+ * used-style magics (used, chain, htable, ftable, cap).  Any other magic
			
 
				+ * is reported as corruption.
			
 
				+ *
			
 
				+ * Offsets of used-style records are appended to *used / *num_used.
			
 
				+ * Offsets of free records whose table is not NTDB_FTABLE_NONE are
			
 
				+ * appended to *fr / *num_free.  recovery is the offset at which a
			
 
				+ * recovery area is expected (0 for none); features gates the
			
 
				+ * zero-padding check on used records.
			
 
				+ *
			
 
				+ * Returns NTDB_SUCCESS, a read error, or the result of ntdb_logerr().
			
 
				+ */
			
 
				+static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
			
 
				+				   ntdb_off_t **used, size_t *num_used,
			
 
				+				   ntdb_off_t **fr, size_t *num_free,
			
 
				+				   uint64_t features, ntdb_off_t recovery)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	ntdb_len_t len;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	bool found_recovery = false;
			
 
				+
			
 
				+	for (off = sizeof(struct ntdb_header);
			
 
				+	     off < ntdb->file->map_size;
			
 
				+	     off += len) {
			
 
				+		union {
			
 
				+			struct ntdb_used_record u;
			
 
				+			struct ntdb_free_record f;
			
 
				+			struct ntdb_recovery_record r;
			
 
				+		} rec;
			
 
				+		/* r is larger: only get that if we need to. */
			
 
				+		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return ecode;
			
 
				+		}
			
 
				+
			
 
				+		/* If we crash after ftruncate, we can get zeroes or fill. */
			
 
				+		if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
			
 
				+		    || rec.r.magic ==  0x4343434343434343ULL) {
			
 
				+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+			if (recovery == off) {
			
 
				+				/* An invalidated recovery area at the
			
 
				+				 * expected offset still counts as found. */
			
 
				+				found_recovery = true;
			
 
				+				len = sizeof(rec.r) + rec.r.max_len;
			
 
				+			} else {
			
 
				+				len = dead_space(ntdb, off);
			
 
				+				if (NTDB_OFF_IS_ERR(len)) {
			
 
				+					return NTDB_OFF_TO_ERR(len);
			
 
				+				}
			
 
				+				if (len < sizeof(rec.r)) {
			
 
				+					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+							  NTDB_LOG_ERROR,
			
 
				+							  "ntdb_check: invalid"
			
 
				+							  " dead space at %zu",
			
 
				+							  (size_t)off);
			
 
				+				}
			
 
				+
			
 
				+				/* Dead space is only logged as a warning;
			
 
				+				 * the scan continues after it. */
			
 
				+				ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
			
 
				+					   "Dead space at %zu-%zu (of %zu)",
			
 
				+					   (size_t)off, (size_t)(off + len),
			
 
				+					   (size_t)ntdb->file->map_size);
			
 
				+			}
			
 
				+		} else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
			
 
				+			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				return ecode;
			
 
				+			}
			
 
				+			if (recovery != off) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: unexpected"
			
 
				+						  " recovery record at offset"
			
 
				+						  " %zu",
			
 
				+						  (size_t)off);
			
 
				+			}
			
 
				+			if (rec.r.len > rec.r.max_len) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: invalid recovery"
			
 
				+						  " length %zu",
			
 
				+						  (size_t)rec.r.len);
			
 
				+			}
			
 
				+			if (rec.r.eof > ntdb->file->map_size) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: invalid old EOF"
			
 
				+						  " %zu", (size_t)rec.r.eof);
			
 
				+			}
			
 
				+			found_recovery = true;
			
 
				+			len = sizeof(rec.r) + rec.r.max_len;
			
 
				+		} else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
			
 
				+			len = sizeof(rec.u) + frec_len(&rec.f);
			
 
				+			if (off + len > ntdb->file->map_size) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: free overlength"
			
 
				+						  " %llu at offset %llu",
			
 
				+						  (long long)len,
			
 
				+						  (long long)off);
			
 
				+			}
			
 
				+			/* This record should be in free lists. */
			
 
				+			if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
			
 
				+			    && !append(ntdb, fr, num_free, off)) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: tracking %zu'th"
			
 
				+						  " free record.", *num_free);
			
 
				+			}
			
 
				+		} else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
			
 
				+			   || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
			
 
				+			   || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
			
 
				+			   || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
			
 
				+			   || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
			
 
				+			uint64_t klen, dlen, extra;
			
 
				+
			
 
				+			/* This record is used! */
			
 
				+			if (!append(ntdb, used, num_used, off)) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: tracking %zu'th"
			
 
				+						  " used record.", *num_used);
			
 
				+			}
			
 
				+
			
 
				+			klen = rec_key_length(&rec.u);
			
 
				+			dlen = rec_data_length(&rec.u);
			
 
				+			extra = rec_extra_padding(&rec.u);
			
 
				+
			
 
				+			len = sizeof(rec.u) + klen + dlen + extra;
			
 
				+			if (off + len > ntdb->file->map_size) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: used overlength"
			
 
				+						  " %llu at offset %llu",
			
 
				+						  (long long)len,
			
 
				+						  (long long)off);
			
 
				+			}
			
 
				+
			
 
				+			/* A used-style record may not be smaller than a
			
 
				+			 * free record header. */
			
 
				+			if (len < sizeof(rec.f)) {
			
 
				+				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+						  NTDB_LOG_ERROR,
			
 
				+						  "ntdb_check: too short record"
			
 
				+						  " %llu at %llu",
			
 
				+						  (long long)len,
			
 
				+						  (long long)off);
			
 
				+			}
			
 
				+
			
 
				+			/* Check that records have correct 0 at end (but may
			
 
				+			 * not in future). */
			
 
				+			if (extra && !features
			
 
				+			    && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
			
 
				+				const char *p;
			
 
				+				char c;
			
 
				+				/* Only the first padding byte is examined. */
			
 
				+				p = ntdb_access_read(ntdb, off + sizeof(rec.u)
			
 
				+						    + klen + dlen, 1, false);
			
 
				+				if (NTDB_PTR_IS_ERR(p))
			
 
				+					return NTDB_PTR_ERR(p);
			
 
				+				c = *p;
			
 
				+				ntdb_access_release(ntdb, p);
			
 
				+
			
 
				+				if (c != '\0') {
			
 
				+					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+							  NTDB_LOG_ERROR,
			
 
				+							  "ntdb_check:"
			
 
				+							  " non-zero extra"
			
 
				+							  " at %llu",
			
 
				+							  (long long)off);
			
 
				+				}
			
 
				+			}
			
 
				+		} else {
			
 
				+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+					  NTDB_LOG_ERROR,
			
 
				+					  "ntdb_check: Bad magic 0x%llx"
			
 
				+					  " at offset %zu",
			
 
				+					  (long long)rec_magic(&rec.u),
			
 
				+					  (size_t)off);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* We must have found recovery area if there was one. */
			
 
				+	if (recovery != 0 && !found_recovery) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
			
 
				+				  "ntdb_check: expected a recovery area at %zu",
			
 
				+				  (size_t)recovery);
			
 
				+	}
			
 
				+
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Consistency check of the whole database.
			
 
				+ *
			
 
				+ * If NTDB_CANT_CHECK is set in ntdb->flags, only a warning is logged and
			
 
				+ * the function returns without checking.  Otherwise it takes the
			
 
				+ * all-record and expand locks with F_RDLCK, then runs check_header(),
			
 
				+ * check_linear(), check_free_table() for each free table and
			
 
				+ * check_hash(), and finally requires that the number of free records
			
 
				+ * reached through the free tables equals the number seen by the linear
			
 
				+ * scan.
			
 
				+ *
			
 
				+ * check and data are handed to check_hash() unchanged.
			
 
				+ *
			
 
				+ * Returns NTDB_SUCCESS or the first error encountered.  Both locks are
			
 
				+ * dropped and both offset arrays freed on every path through "out".
			
 
				+ */
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
			
 
				+			  enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
			
 
				+			  void *data)
			
 
				+{
			
 
				+	ntdb_off_t *fr = NULL, *used = NULL;
			
 
				+	ntdb_off_t ft = 0, recovery = 0;
			
 
				+	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
			
 
				+		num_capabilities = 0;
			
 
				+	uint64_t features = 0;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	if (ntdb->flags & NTDB_CANT_CHECK) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
			
 
				+				  "ntdb_check: database has unknown capability,"
			
 
				+				  " cannot check.");
			
 
				+	}
			
 
				+
			
 
				+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		/* Only the all-record lock is held at this point. */
			
 
				+		ntdb_allrecord_unlock(ntdb, F_RDLCK);
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
			
 
				+	if (ecode != NTDB_SUCCESS)
			
 
				+		goto out;
			
 
				+
			
 
				+	/* First we do a linear scan, checking all records. */
			
 
				+	ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
			
 
				+			     recovery);
			
 
				+	if (ecode != NTDB_SUCCESS)
			
 
				+		goto out;
			
 
				+
			
 
				+	/* num_ftables doubles as the table number passed to
			
 
				+	 * check_free_table() for the table being visited. */
			
 
				+	for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
			
 
				+		if (NTDB_OFF_IS_ERR(ft)) {
			
 
				+			ecode = NTDB_OFF_TO_ERR(ft);
			
 
				+			goto out;
			
 
				+		}
			
 
				+		ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
			
 
				+					 &num_found);
			
 
				+		if (ecode != NTDB_SUCCESS)
			
 
				+			goto out;
			
 
				+		num_ftables++;
			
 
				+	}
			
 
				+
			
 
				+	/* FIXME: Check key uniqueness? */
			
 
				+	/* NOTE(review): num_ftables + num_capabilities is presumably the
			
 
				+	 * count of used-style records check_hash() should not expect to
			
 
				+	 * find in the hash - confirm against check_hash(). */
			
 
				+	ecode = check_hash(ntdb, used, num_used, num_ftables + num_capabilities,
			
 
				+			   check, data);
			
 
				+	if (ecode != NTDB_SUCCESS)
			
 
				+		goto out;
			
 
				+
			
 
				+	if (num_found != num_free) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_check: Not all entries are in"
			
 
				+				   " free table");
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	ntdb_allrecord_unlock(ntdb, F_RDLCK);
			
 
				+	ntdb_unlock_expand(ntdb, F_RDLCK);
			
 
				+	ntdb->free_fn(fr, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(used, ntdb->alloc_data);
			
 
				+	return ecode;
			
 
				+}
			
--- a/ccan/ntdb/configure
+++ b/ccan/ntdb/configure
@@ -0,0 +1,21 @@
 
				+#!/bin/sh
			
 
				+
			
 
				+PREVPATH=`dirname $0`
			
 
				+
			
 
				+if [ -f $PREVPATH/../../buildtools/bin/waf ]; then
			
 
				+	WAF=../../buildtools/bin/waf
			
 
				+elif [ -f $PREVPATH/buildtools/bin/waf ]; then
			
 
				+	WAF=./buildtools/bin/waf
			
 
				+else
			
 
				+	echo "replace: Unable to find waf"
			
 
				+	exit 1
			
 
				+fi
			
 
				+
			
 
				+# using JOBS=1 gives maximum compatibility with
			
 
				+# systems like AIX which have broken threading in python
			
 
				+JOBS=1
			
 
				+export JOBS
			
 
				+
			
 
				+cd . || exit 1
			
 
				+$WAF configure "$@" || exit 1
			
 
				+cd $PREVPATH
			
--- a/ccan/ntdb/doc/TDB_porting.txt
+++ b/ccan/ntdb/doc/TDB_porting.txt
@@ -0,0 +1,483 @@
 
				+Interface differences between TDB and NTDB.
			
 
				+
			
 
				+- ntdb shares 'struct TDB_DATA' with tdb, but TDB defines the TDB_DATA
			
 
				+  typedef, whereas ntdb defines NTDB_DATA (ie. both are compatible).
			
 
				+  If you include both ntdb.h and tdb.h, #include tdb.h first,
			
 
				+  otherwise you'll get a compile error when tdb.h re-defines struct
			
 
				+  TDB_DATA.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
			
 
				+  error on failure, whereas tdb functions returned 0 on success, and
			
 
				+  -1 on failure.  tdb then used tdb_error() to determine the error;
			
 
				+  this API is nasty if we ever want to support threads, so is not supported.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
			
 
				+	{
			
 
				+		if (tdb_store(tdb, key, d) == -1) {
			
 
				+			printf("store failed: %s\n", tdb_errorstr(tdb));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
			
 
				+	{
			
 
				+		enum NTDB_ERROR e;
			
 
				+
			
 
				+		e = ntdb_store(ntdb, key, d);
			
 
				+		if (e) {
			
 
				+			printf("store failed: %s\n", ntdb_errorstr(e));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
			
 
				+  (or tdb_null, and you were supposed to check tdb_error() to find out why).
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb, TDB_DATA key)
			
 
				+	{
			
 
				+		TDB_DATA data;
			
 
				+
			
 
				+		data = tdb_fetch(tdb, key);
			
 
				+		if (!data.dptr) {
			
 
				+			printf("fetch failed: %s\n", tdb_errorstr(tdb));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+	{
			
 
				+		NTDB_DATA data;
			
 
				+		enum NTDB_ERROR e;
			
 
				+
			
 
				+		e = ntdb_fetch(ntdb, key, &data);
			
 
				+		if (e) {
			
 
				+			printf("fetch failed: %s\n", ntdb_errorstr(e));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+- ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
			
 
				+  this manually.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb)
			
 
				+	{
			
 
				+		TDB_DATA key, next, data;
			
 
				+
			
 
				+		for (key = tdb_firstkey(tdb); key.dptr; key = next) {
			
 
				+			printf("Got key!\n");
			
 
				+			next = tdb_nextkey(tdb, key);
			
 
				+			free(key.dptr);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb)
			
 
				+	{
			
 
				+		NTDB_DATA k, data;
			
 
				+		enum NTDB_ERROR e;
			
 
				+
			
 
				+		for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
			
 
				+			printf("Got key!\n");
			
 
				+	}
			
 
				+
			
 
				+- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
			
 
				+  even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
			
 
				+	}
			
 
				+
			
 
				+- ntdb uses a linked list of attribute structures to implement logging and
			
 
				+  alternate hashes.  tdb used tdb_open_ex, which was not extensible.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	/* Custom hash function */
			
 
				+	static unsigned int my_tdb_hash_func(TDB_DATA *key)
			
 
				+	{
			
 
				+		return key->dsize;
			
 
				+	}
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
			
 
				+		                   O_CREAT|O_RDWR, 0600, NULL, my_tdb_hash_func);
			
 
				+	}
			
 
				+
			
 
				+	/* Custom hash function */
			
 
				+	static unsigned int my_ntdb_hash_func(const void *key, size_t len,
			
 
				+					      uint32_t seed, void *data)
			
 
				+	{
			
 
				+		return len;
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		union ntdb_attribute hash;
			
 
				+
			
 
				+		hash.base.attr = NTDB_ATTRIBUTE_HASH;
			
 
				+		hash.base.next = NULL;
			
 
				+		hash.hash.fn = my_ntdb_hash_func;
			
 
				+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
			
 
				+		                   O_CREAT|O_RDWR, 0600, &hash);
			
 
				+	}
			
 
				+
			
 
				+- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
			
 
				+  131.  ntdb uses an attribute for this, defaulting to 8192.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		return tdb_open("example.tdb", 10007, TDB_DEFAULT,
			
 
				+		                O_CREAT|O_RDWR, 0600);
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		union ntdb_attribute hashsize;
			
 
				+
			
 
				+		hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
			
 
				+		hashsize.base.next = NULL;
			
 
				+		hashsize.hashsize.size = 16384;
			
 
				+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
			
 
				+		                   O_CREAT|O_RDWR, 0600, &hashsize);
			
 
				+	}
			
 
				+
			
 
				+- ntdb's log function is simpler than tdb's log function.  The string
			
 
				+  is already formatted, is not terminated by a '\n', and it takes an
			
 
				+  enum ntdb_log_level (not a tdb_debug_level), which has only three
			
 
				+  values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
			
 
				+
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	static void tdb_log(struct tdb_context *tdb,
			
 
				+	                    enum tdb_debug_level level, const char *fmt, ...)
			
 
				+	{
			
 
				+		va_list ap;
			
 
				+		const char *name;
			
 
				+
			
 
				+		switch (level) {
			
 
				+		case TDB_DEBUG_FATAL:
			
 
				+			fprintf(stderr, "FATAL: ");
			
 
				+			break;
			
 
				+		case TDB_DEBUG_ERROR:
			
 
				+			fprintf(stderr, "ERROR: ");
			
 
				+			break;
			
 
				+		case TDB_DEBUG_WARNING:
			
 
				+			fprintf(stderr, "WARNING: ");
			
 
				+			break;
			
 
				+		case TDB_DEBUG_TRACE:
			
 
				+			/* Don't print out tracing. */
			
 
				+			return;
			
 
				+		}
			
 
				+
			
 
				+		name = tdb_name(tdb);
			
 
				+		if (!name) {
			
 
				+			name = "unnamed";
			
 
				+		}
			
 
				+
			
 
				+		fprintf(stderr, "tdb(%s):", name);
			
 
				+
			
 
				+		va_start(ap, fmt);
			
 
				+		vfprintf(stderr, fmt, ap);
			
 
				+		va_end(ap);
			
 
				+	}
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		struct tdb_logging_context lctx;
			
 
				+
			
 
				+		lctx.log_fn = tdb_log;
			
 
				+		return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
			
 
				+		                   O_CREAT|O_RDWR, 0600, &lctx, NULL);
			
 
				+	}
			
 
				+
			
 
				+	static void ntdb_log(struct ntdb_context *ntdb,
			
 
				+			     enum ntdb_log_level level,
			
 
				+			     enum NTDB_ERROR ecode,
			
 
				+			     const char *message,
			
 
				+			     void *data)
			
 
				+	{
			
 
				+		switch (level) {
			
 
				+		case NTDB_LOG_ERROR:
			
 
				+			fprintf(stderr, "ERROR: ");
			
 
				+			break;
			
 
				+		case NTDB_LOG_USE_ERROR:
			
 
				+			/* We made a mistake, so abort. */
			
 
				+			abort();
			
 
				+			break;
			
 
				+		case NTDB_LOG_WARNING:
			
 
				+			fprintf(stderr, "WARNING: ");
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		fprintf(stderr, "ntdb(%s):%s:%s\n",
			
 
				+			ntdb_name(ntdb), ntdb_errorstr(ecode), message);
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		union ntdb_attribute log;
			
 
				+
			
 
				+		log.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+		log.base.next = NULL;
			
 
				+		log.log.fn = ntdb_log;
			
 
				+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
			
 
				+		                 O_CREAT|O_RDWR, 0600, &log);
			
 
				+	}
			
 
				+
			
 
				+- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
			
 
				+  creating an NTDB_DATA.
			
 
				+
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb)
			
 
				+	{
			
 
				+		TDB_DATA data, key;
			
 
				+
			
 
				+		key.dsize = strlen("hello");
			
 
				+		key.dptr = "hello";
			
 
				+		data = tdb_fetch(tdb, key);
			
 
				+		if (data.dsize == key.dsize
			
 
				+		    && !memcmp(data.dptr, key.dptr, key.dsize)) {
			
 
				+			printf("key is same as data\n");
			
 
				+		}
			
 
				+		free(data.dptr);
			
 
				+	}
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb)
			
 
				+	{
			
 
				+		NTDB_DATA data, key;
			
 
				+
			
 
				+		key = ntdb_mkdata("hello", strlen("hello"));
			
 
				+		if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
			
 
				+			if (ntdb_deq(key, data)) {
			
 
				+				printf("key is same as data\n");
			
 
				+			}
			
 
				+			free(data.dptr);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+- ntdb's ntdb_parse_record() takes a type-checked callback data
			
 
				+  pointer, not a void * (though a void * pointer still works).  The
			
 
				+  callback function is allowed to do read operations on the database,
			
 
				+  or write operations if you first call ntdb_lockall().  TDB's
			
 
				+  tdb_parse_record() did not allow any database access within the
			
 
				+  callback, and could crash if you tried.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	static int tdb_parser(TDB_DATA key, TDB_DATA data, void *private_data)
			
 
				+	{
			
 
				+		TDB_DATA *expect = private_data;
			
 
				+
			
 
				+		return data.dsize == expect->dsize
			
 
				+			&& !memcmp(data.dptr, expect->dptr, data.dsize);
			
 
				+	}
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
			
 
				+	{
			
 
				+		switch (tdb_parse_record(tdb, key, tdb_parser, &d)) {
			
 
				+		case -1:
			
 
				+			printf("parse failed: %s\n", tdb_errorstr(tdb));
			
 
				+			break;
			
 
				+		case 0:
			
 
				+			printf("data was different!\n");
			
 
				+			break;
			
 
				+		case 1:
			
 
				+			printf("data was same!\n");
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	static int ntdb_parser(NTDB_DATA key, NTDB_DATA data, NTDB_DATA *expect)
			
 
				+	{
			
 
				+		return ntdb_deq(data, *expect);
			
 
				+	}
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
			
 
				+	{
			
 
				+		enum NTDB_ERROR e;
			
 
				+
			
 
				+		e = ntdb_parse_record(ntdb, key, ntdb_parser, &d);
			
 
				+		switch (e) {
			
 
				+		case 0:
			
 
				+			printf("data was different!\n");
			
 
				+			break;
			
 
				+		case 1:
			
 
				+			printf("data was same!\n");
			
 
				+			break;
			
 
				+		default:
			
 
				+			printf("parse failed: %s\n", ntdb_errorstr(e));
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
			
 
				+  tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
			
 
				+
			
 
				+  Example:
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
			
 
				+	}
			
 
				+
			
 
				+- Failure inside a transaction (such as a lock function failing) does
			
 
				+  not implicitly cancel the transaction; you still need to call
			
 
				+  ntdb_transaction_cancel().
			
 
				+
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
			
 
				+	{
			
 
				+		if (tdb_transaction_start(tdb) == -1) {
			
 
				+			printf("transaction failed: %s\n", tdb_errorstr(tdb));
			
 
				+			return;
			
 
				+		}
			
 
				+
			
 
				+		if (tdb_store(tdb, key, d) == -1) {
			
 
				+			printf("store failed: %s\n", tdb_errorstr(tdb));
			
 
				+			return;
			
 
				+		}
			
 
				+		if (tdb_transaction_commit(tdb) == -1) {
			
 
				+			printf("commit failed: %s\n", tdb_errorstr(tdb));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
			
 
				+	{
			
 
				+		enum NTDB_ERROR e;
			
 
				+
			
 
				+		e = ntdb_transaction_start(ntdb);
			
 
				+		if (e) {
			
 
				+			printf("transaction failed: %s\n", ntdb_errorstr(e));
			
 
				+			return;
			
 
				+		}
			
 
				+
			
 
				+		e = ntdb_store(ntdb, key, d);
			
 
				+		if (e) {
			
 
				+			printf("store failed: %s\n", ntdb_errorstr(e));
			
 
				+			ntdb_transaction_cancel(ntdb);
			
 
				+			return;
			
 
				+		}
			
 
				+
			
 
				+		e = ntdb_transaction_commit(ntdb);
			
 
				+		if (e) {
			
 
				+			printf("commit failed: %s\n", ntdb_errorstr(e));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
			
 
				+  API problems.  If necessary, you can emulate this by using the open
			
 
				+  hook and placing a 1-byte lock at offset 4.  If your program forks
			
 
				+  and exits, you will need to place this lock again in the child before
			
 
				+  the parent exits.
			
 
				+
			
 
				+  Example:
			
 
				+
			
 
				+	#include <tdb.h>
			
 
				+	#include <ntdb.h>
			
 
				+
			
 
				+	struct tdb_context *tdb_example(void)
			
 
				+	{
			
 
				+		return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
			
 
				+		                   O_CREAT|O_RDWR, 0600);
			
 
				+	}
			
 
				+
			
 
				+	static enum NTDB_ERROR clear_if_first(int fd, void *unused)
			
 
				+	{
			
 
				+		/* We hold a lock offset 4 always, so we can tell if
			
 
				+		 * anyone else is. */
			
 
				+		struct flock fl;
			
 
				+
			
 
				+		fl.l_type = F_WRLCK;
			
 
				+		fl.l_whence = SEEK_SET;
			
 
				+		fl.l_start = 4; /* ACTIVE_LOCK */
			
 
				+		fl.l_len = 1;
			
 
				+
			
 
				+		if (fcntl(fd, F_SETLK, &fl) == 0) {
			
 
				+			/* We must be first ones to open it!  Clear it. */
			
 
				+			if (ftruncate(fd, 0) != 0) {
			
 
				+				return NTDB_ERR_IO;
			
 
				+			}
			
 
				+		}
			
 
				+		fl.l_type = F_RDLCK;
			
 
				+		if (fcntl(fd, F_SETLKW, &fl) != 0) {
			
 
				+			return NTDB_ERR_IO;
			
 
				+		}
			
 
				+		return NTDB_SUCCESS;
			
 
				+	}
			
 
				+
			
 
				+	struct ntdb_context *ntdb_example(void)
			
 
				+	{
			
 
				+		union ntdb_attribute open_attr;
			
 
				+
			
 
				+		open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
			
 
				+		open_attr.openhook.base.next = NULL;
			
 
				+		open_attr.openhook.fn = clear_if_first;
			
 
				+
			
 
				+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
			
 
				+		                 O_CREAT|O_RDWR, 0600, &open_attr);
			
 
				+	}
			
 
				+
			
 
				+- ntdb traversals are not reliable if the database is changed during
			
 
				+  the traversal, ie your traversal may not cover all elements, or may
			
 
				+  cover elements multiple times.  As a special exception, deleting the
			
 
				+  current record within ntdb_traverse() is reliable.
			
 
				+
			
 
				+- There is no ntdb_traverse_read, since ntdb_traverse does not hold
			
 
				+  a lock across the entire traversal anyway.  If you want to make sure
			
 
				+  that your traversal function does not write to the database, you can
			
 
				+  set and clear the NTDB_RDONLY flag around the traversal.
			
 
				+
			
 
				+- ntdb does not need tdb_reopen() or tdb_reopen_all().  If you call
			
 
				+  fork() during certain operations, the child should close the
			
 
				+  ntdb, or complete the operations before continuing to use the ntdb:
			
 
				+
			
 
				+	ntdb_transaction_start(): child must ntdb_transaction_cancel()
			
 
				+	ntdb_lockall(): child must call ntdb_unlockall()
			
 
				+	ntdb_lockall_read(): child must call ntdb_unlockall_read()
			
 
				+	ntdb_chainlock(): child must call ntdb_chainunlock()
			
 
				+	ntdb_parse_record() callback: child must return from ntdb_parse_record()
			
 
				+
			
 
				+- ntdb will not open a non-ntdb file, even if O_CREAT is specified.  tdb
			
 
				+  will overwrite an unknown file in that case.
			
--- a/ccan/ntdb/doc/design.lyx
+++ b/ccan/ntdb/doc/design.lyx
@@ -0,0 +1,2727 @@
 
				+#LyX 2.0 created this file. For more info see http://www.lyx.org/
			
 
				+\lyxformat 413
			
 
				+\begin_document
			
 
				+\begin_header
			
 
				+\textclass article
			
 
				+\use_default_options true
			
 
				+\maintain_unincluded_children false
			
 
				+\language english
			
 
				+\language_package default
			
 
				+\inputencoding auto
			
 
				+\fontencoding global
			
 
				+\font_roman default
			
 
				+\font_sans default
			
 
				+\font_typewriter default
			
 
				+\font_default_family default
			
 
				+\use_non_tex_fonts false
			
 
				+\font_sc false
			
 
				+\font_osf false
			
 
				+\font_sf_scale 100
			
 
				+\font_tt_scale 100
			
 
				+
			
 
				+\graphics default
			
 
				+\default_output_format default
			
 
				+\output_sync 0
			
 
				+\bibtex_command default
			
 
				+\index_command default
			
 
				+\paperfontsize default
			
 
				+\use_hyperref false
			
 
				+\papersize default
			
 
				+\use_geometry false
			
 
				+\use_amsmath 1
			
 
				+\use_esint 1
			
 
				+\use_mhchem 1
			
 
				+\use_mathdots 1
			
 
				+\cite_engine basic
			
 
				+\use_bibtopic false
			
 
				+\use_indices false
			
 
				+\paperorientation portrait
			
 
				+\suppress_date false
			
 
				+\use_refstyle 0
			
 
				+\index Index
			
 
				+\shortcut idx
			
 
				+\color #008000
			
 
				+\end_index
			
 
				+\secnumdepth 3
			
 
				+\tocdepth 3
			
 
				+\paragraph_separation indent
			
 
				+\paragraph_indentation default
			
 
				+\quotes_language english
			
 
				+\papercolumns 1
			
 
				+\papersides 1
			
 
				+\paperpagestyle default
			
 
				+\tracking_changes true
			
 
				+\output_changes true
			
 
				+\html_math_output 0
			
 
				+\html_css_as_file 0
			
 
				+\html_be_strict false
			
 
				+\end_header
			
 
				+
			
 
				+\begin_body
			
 
				+
			
 
				+\begin_layout Title
			
 
				+NTDB: Redesigning The Trivial DataBase
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Author
			
 
				+Rusty Russell, IBM Corporation
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Date
			
 
				+19 June 2012
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Abstract
			
 
				+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
			
 
				+ towards the 4G limit, that must change.
			
 
				+ This required breakage provides an opportunity to revisit TDB's other design
			
 
				+ decisions and reassess them.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Section
			
 
				+Introduction
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The Trivial DataBase was originally written by Andrew Tridgell as a simple
			
 
				+ key/data pair storage system with the same API as dbm, but allowing multiple
			
 
				+ readers and writers while being small enough (< 1000 lines of C) to include
			
 
				+ in SAMBA.
			
 
				+ The simple design created in 1999 has proven surprisingly robust and performant
			
 
				+, used in Samba versions 3 and 4 as well as numerous other projects.
			
 
				+ Its useful life was greatly increased by the (backwards-compatible!) addition
			
 
				+ of transaction support in 2005.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The wider variety and greater demands of TDB-using code has led to some
			
 
				+ organic growth of the API, as well as some compromises on the implementation.
			
 
				+ None of these, by themselves, are seen as show-stoppers, but the cumulative
			
 
				+ effect is a loss of elegance over the initial, simple TDB implementation.
			
 
				+ Here is a table of the approximate number of lines of implementation code
			
 
				+ and number of API functions at the end of each year:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+\begin_inset Tabular
			
 
				+<lyxtabular version="3" rows="12" columns="3">
			
 
				+<features tabularvalignment="middle">
			
 
				+<column alignment="center" valignment="top" width="0">
			
 
				+<column alignment="center" valignment="top" width="0">
			
 
				+<column alignment="center" valignment="top" width="0">
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+Year End
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+API Functions
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+Lines of C Code Implementation
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+1999
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+13
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+1195
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2000
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+24
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+1725
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2001
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+32
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2228
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2002
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+35
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2481
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2003
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+35
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2552
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2004
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+40
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2584
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2005
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+38
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2647
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2006
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+52
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+3754
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2007
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+66
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+4398
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2008
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+71
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+4768
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+<row>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+2009
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+73
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
			
 
				+\begin_inset Text
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+5715
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+</cell>
			
 
				+</row>
			
 
				+</lyxtabular>
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This review is an attempt to catalog and address all the known issues with
			
 
				+ TDB and create solutions which address the problems without significantly
			
 
				+ increasing complexity; all involved are far too aware of the dangers of
			
 
				+ second system syndrome in rewriting a successful project like this.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Note: the final decision was to make ntdb a separate library, with a separate
			
 
				+ 'ntdb' namespace so both can potentially be linked together.
			
 
				+ This document still refers to
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+tdb
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ everywhere, for simplicity.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Section
			
 
				+API Issues
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+tdb_open_ex Is Not Expandable
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
			
 
				+ hashing function and an optional logging function argument.
			
 
				+ Additional arguments to open would require the introduction of a tdb_open_ex2
			
 
				+ call etc.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "attributes"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+tdb_open() will take a linked-list of attributes:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+enum tdb_attribute {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    TDB_ATTRIBUTE_LOG = 0,
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    TDB_ATTRIBUTE_HASH = 1
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_attribute_base {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    enum tdb_attribute attr;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    union tdb_attribute *next;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_attribute_log {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    tdb_log_func log_fn;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    void *log_private;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_attribute_hash {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    tdb_hash_func hash_fn;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    void *hash_private;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+union tdb_attribute {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    struct tdb_attribute_base base;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    struct tdb_attribute_log log;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+    struct tdb_attribute_hash hash;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This allows future attributes to be added, even if this expands the size
			
 
				+ of the union.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+tdb_traverse Makes Impossible Guarantees
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
			
 
				+ was thought that it was important to guarantee that all records which exist
			
 
				+ at the start and end of the traversal would be included, and no record
			
 
				+ would be included twice.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This adds complexity (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "Reliable-Traversal-Adds"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) and does not work anyway for records which are altered (in particular,
			
 
				+ those which are expanded may be effectively deleted and re-added behind
			
 
				+ the traversal).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "traverse-Proposed-Solution"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Abandon the guarantee.
			
 
				+ You will see every record if no changes occur during your traversal, otherwise
			
 
				+ you will see some subset.
			
 
				+ You can prevent changes by using a transaction or the locking API.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+ Delete-during-traverse will still delete every record, too (assuming no
			
 
				+ other changes).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Nesting of Transactions Is Fraught
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+TDB has alternated between allowing nested transactions and not allowing
			
 
				+ them.
			
 
				+ Various paths in the Samba codebase assume that transactions will nest,
			
 
				+ and in a sense they can: the operation is only committed to disk when the
			
 
				+ outer transaction is committed.
			
 
				+ There are two problems, however:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Canceling the inner transaction will cause the outer transaction commit
			
 
				+ to fail, and will not undo any operations since the inner transaction began.
			
 
				+ This problem is soluble with some additional internal code.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+An inner transaction commit can be cancelled by the outer transaction.
			
 
				+ This is desirable in the way which Samba's database initialization code
			
 
				+ uses transactions, but could be a surprise to any users expecting a successful
			
 
				+ transaction commit to expose changes to others.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The current solution is to specify the behavior at tdb_open(), with the
			
 
				+ default currently that nested transactions are allowed.
			
 
				+ This flag can also be changed at runtime.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Given the usage patterns, it seems that the
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+least-surprise
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ behavior of disallowing nested transactions should become the default.
			
 
				+ Additionally, it seems the outer transaction is the only code which knows
			
 
				+ whether inner transactions should be allowed, so a flag to indicate this
			
 
				+ could be added to tdb_transaction_start.
			
 
				+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
			
 
				+() and tdb_remove_flags(), so the API should not be expanded for this relatively
			
 
				+-obscure case.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete; the nesting flag has been removed.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Incorrect Hash Function is Not Detected
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+tdb_open_ex() allows the calling code to specify a different hash function
			
 
				+ to use, but does not check that all other processes accessing this tdb
			
 
				+ are using the same hash function.
			
 
				+ The result is that records are missing from tdb_fetch().
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The header should contain an example hash result (eg.
			
 
				+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
			
 
				+ hash function produces the same answer, or fail the tdb_open call.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+In response to scalability issues with the free list (
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "TDB-Freelist-Is"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
			
 
				+ and the TDB_VOLATILE flag to tdb_open.
			
 
				+ The latter actually calls the former with an argument of
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+5
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This code allows deleted records to accumulate without putting them in the
			
 
				+ free list.
			
 
				+ On delete we iterate through each chain and free them in a batch if there
			
 
				+ are more than max_dead entries.
			
 
				+ These are never otherwise recycled except as a side-effect of a tdb_repack.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+With the scalability problems of the freelist solved, this API can be removed.
			
 
				+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
			
 
				+ of records will be at least as common as fetch in order to allow some internal
			
 
				+ tuning, but initially will become a no-op.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+ Unknown flags cause tdb_open() to fail as well, so they can be detected
			
 
				+ at runtime.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "TDB-Files-Cannot"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+TDB Files Cannot Be Opened Multiple Times In The Same Process
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+No process can open the same TDB twice; we check and disallow it.
			
 
				+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
			
 
				+ rather than per-file-descriptor basis, and do not nest.
			
 
				+ Thus, closing any file descriptor on a file clears all the locks obtained
			
 
				+ by this process, even if they were placed using a different file descriptor!
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Note that even if this were solved, deadlock could occur if operations were
			
 
				+ nested: this is a more manageable programming error in most cases.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
			
 
				+ to violate them so that the most common implementation does not have this
			
 
				+ restriction.
			
 
				+ This would be a generally good idea for other fcntl lock users.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Samba uses a wrapper which hands out the same tdb_context to multiple callers
			
 
				+ if this happens, and does simple reference counting.
			
 
				+ We should do this inside the tdb library, which already emulates lock nesting
			
 
				+ internally; it would need to recognize when deadlock occurs within a single
			
 
				+ process.
			
 
				+ This would create a new failure mode for tdb operations (while we currently
			
 
				+ handle locking failures, they are impossible in normal use and a process
			
 
				+ encountering them can do little but give up).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+I do not see benefit in an additional tdb_open flag to indicate whether
			
 
				+ re-opening is allowed, though there may be some benefit to adding a
			
 
				+ call to detect when a tdb_context is shared, to allow others to create such
			
 
				+ an API.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+TDB API Is Not POSIX Thread-safe
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The TDB API uses an error code which can be queried after an operation to
			
 
				+ determine what went wrong.
			
 
				+ This programming model does not work with threads, unless specific additional
			
 
				+ guarantees are given by the implementation.
			
 
				+ In addition, even otherwise-independent threads cannot open the same TDB
			
 
				+ (as in
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "TDB-Files-Cannot"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Rearchitecting the API to include a tdb_errcode pointer would be a great
			
 
				+ deal of churn, but fortunately most functions return 0 on success and -1
			
 
				+ on error: we can change these to return 0 on success and a negative error
			
 
				+ code on error, and the API remains similar to previous.
			
 
				+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
			
 
				+ pointer and return an error code.
			
 
				+ It is also simpler to have tdb_nextkey replace its key argument in place,
			
 
				+ freeing up any old .dptr.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Internal locking is required to make sure that fcntl locks do not overlap
			
 
				+ between threads, and also that the global list of tdbs is maintained.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
			
 
				+ version of the library, and otherwise no overhead will exist.
			
 
				+ Alternatively, a hooking mechanism similar to that proposed for
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "Proposed-Solution-locking-hook"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ could be used to enable pthread locking at runtime.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Incomplete; API has been changed but thread safety has not been implemented.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+*_nonblock Functions And *_mark Functions Expose Implementation
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+CTDB
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+Clustered TDB, see http://ctdb.samba.org
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ wishes to operate on TDB in a non-blocking manner.
			
 
				+ This is currently done as follows:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Call the _nonblock variant of an API function (eg.
			
 
				+ tdb_lockall_nonblock).
			
 
				+ If this fails:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Fork a child process, and wait for it to call the normal variant (eg.
			
 
				+ tdb_lockall).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If the child succeeds, call the _mark variant to indicate we already have
			
 
				+ the locks (eg.
			
 
				+ tdb_lockall_mark).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Upon completion, tell the child to release the locks (eg.
			
 
				+ tdb_unlockall).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Indicate to tdb that it should consider the locks removed (eg.
			
 
				+ tdb_unlockall_mark).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+There are several issues with this approach.
			
 
				+ Firstly, adding two new variants of each function clutters the API for
			
 
				+ an obscure use, and so not all functions have three variants.
			
 
				+ Secondly, it assumes that all paths of the functions ask for the same locks,
			
 
				+ otherwise the parent process will have to get a lock which the child doesn't
			
 
				+ have under some circumstances.
			
 
				+ I don't believe this is currently the case, but it constrains the implementatio
			
 
				+n.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "Proposed-Solution-locking-hook"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Implement a hook for locking methods, so that the caller can control the
			
 
				+ calls to create and remove fcntl locks.
			
 
				+ In this scenario, ctdbd would operate as follows:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Call the normal API function, eg tdb_lockall().
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+When the lock callback comes in, check if the child has the lock.
			
 
				+ Initially, this is always false.
			
 
				+ If the child has the lock, return 0.
			
 
				+ Otherwise, try to obtain it in non-blocking mode.
			
 
				+ If that fails, return EWOULDBLOCK.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Release locks in the unlock callback as normal.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
			
 
				+ child to repeat the operation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+The child records what locks it obtains, and returns that information to
			
 
				+ the parent.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+When the child has succeeded, goto 1.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This is flexible enough to handle any potential locking scenario, even when
			
 
				+ lock requirements change.
			
 
				+ It can be optimized so that the parent does not release locks, just tells
			
 
				+ the child which locks it doesn't need to obtain.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+It also keeps the complexity out of the API, and in ctdbd where it is needed.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+tdb_chainlock Functions Expose Implementation
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+tdb_chainlock locks some number of records, including the record indicated
			
 
				+ by the given key.
			
 
				+ This gave atomicity guarantees; no-one can start a transaction, alter,
			
 
				+ read or delete that key while the lock is held.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+It also makes the same guarantee for any other key in the chain, which is
			
 
				+ an internal implementation detail and potentially a cause for deadlock.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None.
			
 
				+ It would be nice to have an explicit single entry lock which affected no
			
 
				+ other keys.
			
 
				+ Unfortunately, this won't work for an entry which doesn't exist.
			
 
				+ Thus while chainlock may be implemented more efficiently for the existing
			
 
				+ case, it will still have overlap issues with the non-existing case.
			
 
				+ So it is best to keep the current (lack of) guarantee about which records
			
 
				+ will be affected to avoid constraining our implementation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Signal Handling is Not Race-Free
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
			
 
				+ that the tdb locking code should return with a failure, rather than trying
			
 
				+ again when a signal is received (and errno == EAGAIN).
			
 
				+ This is usually used to implement timeouts.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Unfortunately, this does not work in the case where the signal is received
			
 
				+ before the tdb code enters the fcntl() call to place the lock: the code
			
 
				+ will sleep within the fcntl() code, unaware that the signal wants it to
			
 
				+ exit.
			
 
				+ In the case of long timeouts, this does not happen in practice.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The locking hooks proposed in
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "Proposed-Solution-locking-hook"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ would allow the user to decide on whether to fail the lock acquisition
			
 
				+ on a signal.
			
 
				+ This allows the caller to choose their own compromise: they could narrow
			
 
				+ the race by checking immediately before the fcntl call.
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+It may be possible to make this race-free in some implementations by having
			
 
				+ the signal handler alter the struct flock to make it invalid.
			
 
				+ This will cause the fcntl() lock call to fail with EINVAL if the signal
			
 
				+ occurs before the kernel is entered, otherwise EAGAIN.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+The API Uses Gratuitous Typedefs, Capitals
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+typedefs are useful for providing source compatibility when types can differ
			
 
				+ across implementations, or arguably in the case of function pointer definitions
			
 
				+ which are hard for humans to parse.
			
 
				+ Otherwise it is simply obfuscation and pollutes the namespace.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Capitalization is usually reserved for compile-time constants and macros.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Description
			
 
				+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
			
 
				+ definition isn't visible to the API user anyway.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Description
			
 
				+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
			
 
				+ needs to be understood by the API user.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Description
			
 
				+struct
			
 
				+\begin_inset space ~
			
 
				+\end_inset
			
 
				+
			
 
				+TDB_DATA This would normally be called 'struct tdb_data'.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Description
			
 
				+enum
			
 
				+\begin_inset space ~
			
 
				+\end_inset
			
 
				+
			
 
				+TDB_ERROR Similarly, this would normally be enum tdb_error.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None.
			
 
				+ Introducing lower case variants would please pedants like myself, but if
			
 
				+ it were done the existing ones should be kept.
			
 
				+ There is little point forcing a purely cosmetic change upon tdb users.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "tdb_log_func-Doesnt-Take"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+tdb_log_func Doesn't Take The Private Pointer
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+For API compatibility reasons, the logging function needs to call tdb_get_loggin
			
 
				+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+It should simply take an extra argument, since we are prepared to break
			
 
				+ the API/ABI.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Various Callback Functions Are Not Typesafe
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The callback functions in tdb_set_logging_function (after
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "tdb_log_func-Doesnt-Take"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
			
 
				+ all take void * and must internally convert it to the argument type they
			
 
				+ were expecting.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+If this type changes, the compiler will not produce warnings on the callers,
			
 
				+ since it only sees void *.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+With careful use of macros, we can create callback functions which give
			
 
				+ a warning when used on gcc and the types of the callback and its private
			
 
				+ argument differ.
			
 
				+ Unsupported compilers will not give a warning, which is no worse than now.
			
 
				+ In addition, the callbacks become clearer, as they need not use void *
			
 
				+ for their parameter.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
			
 
				+ be cleared if the caller discovers it is the only process with the TDB
			
 
				+ open.
			
 
				+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
			
 
				+ be detected, so will have the TDB erased underneath them (usually resulting
			
 
				+ in a crash).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+There is a similar issue on fork(); if the parent exits (or otherwise closes
			
 
				+ the tdb) before the child calls tdb_reopen_all() to establish the lock
			
 
				+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
			
 
				+ at that moment will believe it alone has opened the TDB and will erase
			
 
				+ it.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Remove TDB_CLEAR_IF_FIRST.
			
 
				+ Other workarounds are possible, but see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+ An open hook is provided to replicate this functionality if required.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Extending The Header Is Difficult
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We have reserved (zeroed) words in the TDB header, which can be used for
			
 
				+ future features.
			
 
				+ If the future features are compulsory, the version number must be updated
			
 
				+ to prevent old code from accessing the database.
			
 
				+ But if the future feature is optional, we have no way of telling if older
			
 
				+ code is accessing the database or not.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The header should contain a
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+format variant
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ value (64-bit).
			
 
				+ This is divided into two 32-bit parts:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+The lower part reflects the format variant understood by code accessing
			
 
				+ the database.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+The upper part reflects the format variant you must understand to write
			
 
				+ to the database (otherwise you can only open for reading).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The latter field can only be written at creation time, the former should
			
 
				+ be written under the OPEN_LOCK when opening the database for writing, if
			
 
				+ the variant of the code is lower than the current lowest variant.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This should allow backwards-compatible features to be added, and detection
			
 
				+ if older code (which doesn't understand the feature) writes to the database.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Record Headers Are Not Expandable
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+If we later want to add (say) checksums on keys and data, it would require
			
 
				+ another format change, which we'd like to avoid.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We often have extra padding at the tail of a record.
			
 
				+ If we ensure that the first byte (if any) of this padding is zero, we will
			
 
				+ have a way for future changes to detect code which doesn't understand a
			
 
				+ new format: the new code would write (say) a 1 at the tail, and thus if
			
 
				+ there is no tail or the first byte is 0, we would know the extension is
			
 
				+ not present on that record.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+TDB Does Not Use Talloc
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Many users of TDB (particularly Samba) use the talloc allocator, and thus
			
 
				+ have to wrap TDB in a talloc context to use it conveniently.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The allocation within TDB is not complicated enough to justify the use of
			
 
				+ talloc, and I am reluctant to force another (excellent) library on TDB
			
 
				+ users.
			
 
				+ Nonetheless a compromise is possible.
			
 
				+ An attribute (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "attributes"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) can be added later to tdb_open() to provide an alternate allocation mechanism,
			
 
				+ specifically for talloc but usable by any other allocator (which would
			
 
				+ ignore the
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+context
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ argument).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This would form a talloc hierarchy as expected, but the caller would still
			
 
				+ have to attach a destructor to the tdb context returned from tdb_open to
			
 
				+ close it.
			
 
				+ All TDB_DATA fields would be children of the tdb_context, and the caller
			
 
				+ would still have to manage them (using talloc_free() or talloc_steal()).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Section
			
 
				+Performance And Scalability Issues
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
			
 
				+ 4 (aka.
			
 
				+ the ACTIVE_LOCK).
			
 
				+ While these locks never conflict in normal tdb usage, they do add substantial
			
 
				+ overhead for most fcntl lock implementations when the kernel scans to detect
			
 
				+ if a lock conflict exists.
			
 
				+ This is often a single linked list, making the time to acquire and release
			
 
				+ a fcntl lock O(N) where N is the number of processes with the TDB open,
			
 
				+ not the number actually doing work.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+In a Samba server it is common to have huge numbers of clients sitting idle,
			
 
				+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+There is a flag to tdb_reopen_all() which is used for this optimization:
			
 
				+ if the parent process will outlive the child, the child does not need the
			
 
				+ ACTIVE_LOCK.
			
 
				+ This is a workaround for this very performance issue.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Remove the flag.
			
 
				+ It was a neat idea, but even trivial servers tend to know when they are
			
 
				+ initializing for the first time and can simply unlink the old tdb at that
			
 
				+ point.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+TDB Files Have a 4G Limit
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This seems to be becoming an issue (so much for
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+trivial
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+!), particularly for ldb.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+A new, incompatible TDB format which uses 64 bit offsets internally rather
			
 
				+ than 32 bit as now.
			
 
				+ For simplicity of endian conversion (which TDB does on the fly if required),
			
 
				+ all values will be 64 bit on disk.
			
 
				+ In practice, some upper bits may be used for other purposes, but at least
			
 
				+ 56 bits will be available for file offsets.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+tdb_open() will automatically detect old-version files, and even create them
			
 
				+ if TDB_VERSION6 is specified to tdb_open.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+32 bit processes will still be able to access TDBs larger than 4G (assuming
			
 
				+ that their off_t allows them to seek to 64 bits), they will gracefully
			
 
				+ fall back as they fail to mmap.
			
 
				+ This can happen already with large TDBs.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Old versions of tdb will fail to open the new TDB files (since 28 August
			
 
				+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
			
 
				+ be erased and initialized as a fresh tdb!)
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+TDB Records Have a 4G Limit
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This has not been a reported problem, and the API uses size_t which can
			
 
				+ be 64 bit on 64 bit platforms.
			
 
				+ However, other limits may have made such an issue moot.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Record sizes will be 64 bit, with an error returned on 32 bit platforms
			
 
				+ which try to access such records (the current implementation would return
			
 
				+ TDB_ERR_OOM in a similar case).
			
 
				+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
			
 
				+ may not support this (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:Records-Incur-A"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Hash Size Is Determined At TDB Creation Time
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+TDB contains a number of hash chains in the header; the number is specified
			
 
				+ at creation time, and defaults to 131.
			
 
				+ This is such a bottleneck on large databases (as each hash chain gets quite
			
 
				+ long), that LDB uses 10,000 for this hash.
			
 
				+ In general it is impossible to know what the 'right' answer is at database
			
 
				+ creation time.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "sub:Hash-Size-Solution"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+After comprehensive performance testing on various scalable hash variants
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
			
 
				+ because I was previously convinced that an expanding tree of hashes would
			
 
				+ be very close to optimal.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+, it became clear that it is hard to beat a straight linear hash table which
			
 
				+ doubles in size when it reaches saturation.
			
 
				+ Unfortunately, altering the hash table introduces serious locking complications
			
 
				+: the entire hash table needs to be locked to enlarge the hash table, and
			
 
				+ others might be holding locks.
			
 
				+ Particularly insidious are insertions done under tdb_chainlock.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Thus an expanding layered hash will be used: an array of hash groups, with
			
 
				+ each hash group exploding into pointers to lower hash groups once it fills,
			
 
				+ turning into a hash tree.
			
 
				+ This has implications for locking: we must lock the entire group in case
			
 
				+ we need to expand it, yet we don't know how deep the tree is at that point.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Note that bits from the hash table entries should be stolen to hold more
			
 
				+ hash bits to reduce the penalty of collisions.
			
 
				+ We can use the otherwise-unused lower 3 bits.
			
 
				+ If we limit the size of the database to 64 exabytes, we can use the top
			
 
				+ 8 bits of the hash entry as well.
			
 
				+ These 11 bits would reduce false positives down to 1 in 2000 which is more
			
 
				+ than we need: we can use one of the bits to indicate that the extra hash
			
 
				+ bits are valid.
			
 
				+ This means we can choose not to re-hash all entries when we expand a hash
			
 
				+ group; simply use the next bits we need and mark them invalid.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Ignore.
			
 
				+ Scaling the hash automatically proved inefficient at small hash sizes;
			
 
				+ we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
			
 
				+ and when buckets clash we expand to an array of hash entries.
			
 
				+ This scales slightly better than the tdb chain (due to the 8 top bits containin
			
 
				+g extra hash).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "TDB-Freelist-Is"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+TDB Freelist Is Highly Contended
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+TDB uses a single linked list for the free list.
			
 
				+ Allocation occurs as follows, using heuristics which have evolved over
			
 
				+ time:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Get the free list lock for this whole operation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Multiply length by 1.25, so we always over-allocate by 25%.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Set the slack multiplier to 1.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Examine the current freelist entry: if it is > length but < the current
			
 
				+ best case, remember it as the best case.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Multiply the slack multiplier by 1.05.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If our best fit so far is less than length * slack multiplier, return it.
			
 
				+ The slack will be turned into a new free record if it's large enough.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Otherwise, go onto the next freelist entry.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Deleting a record occurs as follows:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Lock the hash chain for this whole operation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Walk the chain to find the record, keeping the prev pointer offset.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If max_dead is non-zero:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_deeper
			
 
				+\begin_layout Enumerate
			
 
				+Walk the hash chain again and count the dead records.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If it's more than max_dead, bulk free all the dead ones (similar to steps
			
 
				+ 4 and below, but the lock is only obtained once).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Simply mark this record as dead and return.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_deeper
			
 
				+\begin_layout Enumerate
			
 
				+Get the free list lock for the remainder of this operation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "right-merging"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Examine the following block to see if it is free; if so, enlarge the current
			
 
				+ block and remove that block from the free list.
			
 
				+ This was disabled, as removal from the free list was O(entries-in-free-list).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Examine the preceding block to see if it is free: for this reason, each
			
 
				+ block has a 32-bit tailer which indicates its length.
			
 
				+ If it is free, expand it to cover our new block and return.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Otherwise, prepend ourselves to the free list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Disabling right-merging (step
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "right-merging"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) causes fragmentation; the other heuristics proved insufficient to address
			
 
				+ this, so the final answer to this was that when we expand the TDB file
			
 
				+ inside a transaction commit, we repack the entire tdb.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The single list lock limits our allocation rate; due to the other issues
			
 
				+ this is not currently seen as a bottleneck.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The first step is to remove all the current heuristics, as they obviously
			
 
				+ interact, then examine them once the lock contention is addressed.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The free list must be split to reduce contention.
			
 
				+ Assuming perfect free merging, we can at most have 1 free list entry for
			
 
				+ each entry.
			
 
				+ This implies that the number of free lists is related to the size of the
			
 
				+ hash table, but as it is rare to walk a large number of free list entries
			
 
				+ we can use far fewer, say 1/32 of the number of hash buckets.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+It seems tempting to try to reuse the hash implementation which we use for
			
 
				+ records here, but we have two ways of searching for free entries: for allocatio
			
 
				+n we search by size (and possibly zone) which produces too many clashes
			
 
				+ for our hash table to handle well, and for coalescing we search by address.
			
 
				+ Thus an array of doubly-linked free lists seems preferable.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+There are various benefits in using per-size free lists (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:TDB-Becomes-Fragmented"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) but it's not clear this would reduce contention in the common case where
			
 
				+ all processes are allocating/freeing the same size.
			
 
				+ Thus we almost certainly need to divide in other ways: the most obvious
			
 
				+ is to divide the file into zones, and use a free list (or table of free
			
 
				+ lists) for each.
			
 
				+ This approximates address ordering.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Unfortunately it is difficult to know what heuristics should be used to
			
 
				+ determine zone sizes, and our transaction code relies on being able to
			
 
				+ create a
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+recovery area
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ by simply appending to the file (difficult if it would need to create a
			
 
				+ new zone header).
			
 
				+ Thus we use a linked-list of free tables; currently we only ever create
			
 
				+ one, but if there is more than one we choose one at random to use.
			
 
				+ In future we may use heuristics to add new free tables on contention.
			
 
				+ We only expand the file when all free tables are exhausted.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The basic algorithm is as follows.
			
 
				+ Freeing is simple:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Identify the correct free list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Lock the corresponding list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Re-check the list (we didn't have a lock, sizes could have changed): relock
			
 
				+ if necessary.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Place the freed entry in the list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Allocation is a little more complicated, as we perform delayed coalescing
			
 
				+ at this point:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Pick a free table; usually the previous one.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Lock the corresponding list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If the top entry is large enough, remove it from the list and return it.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Otherwise, coalesce entries in the list. If there was no entry large enough,
			
 
				+ unlock the list and try the next largest list.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If no list has an entry which meets our needs, try the next free table.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+If no free table satisfies, expand the file.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This optimizes rapid insert/delete of free list entries by not coalescing
			
 
				+ them all the time.
			
 
				+ First-fit address ordering seems to be fairly good for keeping
			
 
				+ fragmentation low (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:TDB-Becomes-Fragmented"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+ Note that address ordering does not need a tailer to coalesce, though if
			
 
				+ we needed one we could have one cheaply: see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:Records-Incur-A"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Each free entry has the free table number in the header: less than 255.
			
 
				+ It also contains a doubly-linked list for easy deletion.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "sub:TDB-Becomes-Fragmented"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+TDB Becomes Fragmented
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Much of this is a result of allocation strategy
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
			
 
				+xas.edu/pub/garbage/malloc/ismm98.ps
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
			
 
				+on) is deliberately set at 25%, and external fragmentation is only cured
			
 
				+ by the decision to repack the entire db when a transaction commit needs
			
 
				+ to enlarge the file.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The 25% overhead on allocation works in practice for ldb because indexes
			
 
				+ tend to expand by one record at a time.
			
 
				+ This internal fragmentation can be resolved by having an
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+expanded
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ bit in the header to note entries that have previously expanded, and allocating
			
 
				+ more space for them.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+There is a spectrum of possible solutions for external fragmentation:
			
 
				+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
			
 
				+ address-order allocator.
			
 
				+ The other end of the spectrum would be to use a bump allocator (very fast
			
 
				+ and simple) and simply repack the file when we reach the end.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+There are three problems with efficient fragmentation-avoiding allocators:
			
 
				+ they are non-trivial, they tend to use a single free list for each size,
			
 
				+ and there's no evidence that tdb allocation patterns will match those recorded
			
 
				+ for general allocators (though it seems likely).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Thus we don't spend too much effort on external fragmentation; we will be
			
 
				+ no worse than the current code if we need to repack on occasion.
			
 
				+ More effort is spent on reducing freelist contention, and reducing overhead.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "sub:Records-Incur-A"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Records Incur A 28-Byte Overhead
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Each TDB record has a header as follows:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_record {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        tdb_off_t next; /* offset of the next record in the list */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        tdb_len_t rec_len; /* total byte length of record */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        tdb_len_t key_len; /* byte length of key */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        tdb_len_t data_len; /* byte length of data */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint32_t full_hash; /* the full 32 bit hash of the key */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint32_t magic;   /* try to catch errors */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        /* the following union is implied:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                union {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                        char record[rec_len];
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                        struct {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                                char key[key_len];
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                                char data[data_len];
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                        }
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                        uint32_t totalsize; (tailer)
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                }
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        */
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We can use various techniques to reduce this for an allocated block:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+The 'next' pointer is not required, as we are using a flat hash table.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+'rec_len' can instead be expressed as an addition to key_len and data_len
			
 
				+ (it accounts for wasted or overallocated length in the record).
			
 
				+ Since the record length is always a multiple of 8, we can conveniently
			
 
				+ fit it in 32 bits (representing up to 35 bits).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+'key_len' and 'data_len' can be reduced.
			
 
				+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
			
 
				+ the two into one 64-bit field and use a 5 bit value which indicates at
			
 
				+ what bit to divide the two.
			
 
				+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
			
 
				+ size of 32 bits.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+'full_hash' is used to avoid a memcmp on the
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+miss
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ case, but this is diminishing returns after a handful of bits (at 10 bits,
			
 
				+ it reduces 99.9% of false memcmp).
			
 
				+ As an aside, as the lower bits are already incorporated in the hash table
			
 
				+ resolution, the upper bits should be used here.
			
 
				+ Note that it's not clear that these bits will be a win, given the extra
			
 
				+ bits in the hash table itself (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:Hash-Size-Solution"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+'magic' does not need to be enlarged: it currently reflects one of 5 values
			
 
				+ (used, free, dead, recovery, and unused_recovery).
			
 
				+ It is useful for quick sanity checking however, and should not be eliminated.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+'tailer' is only used to coalesce free blocks (so a block to the right can
			
 
				+ find the header to check if this block is free).
			
 
				+ This can be replaced by a single 'free' bit in the header of the following
			
 
				+ block (and the tailer only exists in free blocks).
			
 
				+\begin_inset Foot
			
 
				+status collapsed
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+This technique from Thomas Standish.
			
 
				+ Data Structure Techniques.
			
 
				+ Addison-Wesley, Reading, Massachusetts, 1980.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ The current proposed coalescing algorithm doesn't need this, however.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This produces a 16 byte used header like this:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_used_record {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint32_t used_magic : 16,
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                 key_data_divide: 5,
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                 top_hash: 11;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint32_t extra_octets;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint64_t key_and_data_len;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+And a free record like this:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+struct tdb_free_record {
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint64_t free_magic: 8,
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                   prev : 56;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint64_t free_table: 8,
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+                 total_length : 56;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+        uint64_t next;
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout LyX-Code
			
 
				+};
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Note that by limiting valid offsets to 56 bits, we can pack everything we
			
 
				+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Transaction Commit Requires 4 fdatasync
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The current transaction algorithm is:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+write_recovery_data();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+sync();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+write_recovery_header();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+sync();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+overwrite_with_new_data();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+sync();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+remove_recovery_header();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+sync();
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+On current ext3, each sync flushes all data to disk, so the next 3 syncs
			
 
				+ are relatively inexpensive.
			
 
				+ But this could become a performance bottleneck on other filesystems such
			
 
				+ as ext4.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Neil Brown points out that this is overzealous, and only one sync is needed:
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Bundle the recovery data, a transaction counter and a strong checksum of
			
 
				+ the new data.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Strong checksum that whole bundle.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Store the bundle in the database.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Overwrite the oldest of the two recovery pointers in the header (identified
			
 
				+ using the transaction counter) with the offset of this bundle.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+sync.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Enumerate
			
 
				+Write the new data to the file.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Checking for recovery means identifying the latest bundle with a valid checksum
			
 
				+ and using the new data checksum to ensure that it has been applied.
			
 
				+ This is more expensive than the current check, but need only be done at
			
 
				+ open.
			
 
				+ For running databases, a separate header field can be used to indicate
			
 
				+ a transaction in progress; we need only check for recovery if this is set.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Deferred.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "sub:TDB-Does-Not"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+TDB Does Not Have Snapshot Support
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None.
			
 
				+ At some point you say
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+use a real database
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ (but see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "replay-attribute"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+But as a thought experiment, if we implemented transactions to only overwrite
			
 
				+ free entries (this is tricky: there must not be a header in each entry
			
 
				+ which indicates whether it is free, but use of presence in metadata elsewhere),
			
 
				+ and a pointer to the hash table, we could create an entirely new commit
			
 
				+ without destroying existing data.
			
 
				+ Then it would be easy to implement snapshots in a similar way.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This would not allow arbitrary changes to the database, such as tdb_repack
			
 
				+ does, and would require more space (since we have to preserve the current
			
 
				+ and future entries at once).
			
 
				+ If we used hash trees rather than one big hash table, we might only have
			
 
				+ to rewrite some sections of the hash, too.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We could then implement snapshots using a similar method, using multiple
			
 
				+ different hash tables/free tables.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Deferred.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Transactions Cannot Operate in Parallel
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This would be useless for ldb, as it hits the index records with just about
			
 
				+ every update.
			
 
				+ It would add significant complexity in resolving clashes, and cause
			
 
				+ all transaction callers to write their code to loop in the case where the
			
 
				+ transactions spuriously failed.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None (but see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "replay-attribute"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+).
			
 
				+ We could solve a small part of the problem by providing read-only transactions.
			
 
				+ These would allow one write transaction to begin, but it could not commit
			
 
				+ until all r/o transactions are done.
			
 
				+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
			
 
				+ commit.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Deferred.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Default Hash Function Is Suboptimal
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
			
 
				+ if we expand it to 64 bits), and works best when the hash bucket size is
			
 
				+ a prime number (which also means a slow modulus).
			
 
				+ In addition, it is highly predictable which could potentially lead to a
			
 
				+ Denial of Service attack in some TDB uses.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The Jenkins lookup3 hash
			
 
				+\begin_inset Foot
			
 
				+status open
			
 
				+
			
 
				+\begin_layout Plain Layout
			
 
				+http://burtleburtle.net/bob/c/lookup3.c
			
 
				+\end_layout
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+ is a fast and superbly-mixing hash.
			
 
				+ It's used by the Linux kernel and almost everything else.
			
 
				+ This has the particular properties that it takes an initial seed, and produces
			
 
				+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The seed should be created at tdb-creation time from some random source,
			
 
				+ and placed in the header.
			
 
				+ This is far from foolproof, but adds a little bit of protection against
			
 
				+ hash bombing.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "Reliable-Traversal-Adds"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+Reliable Traversal Adds Complexity
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We lock a record during traversal iteration, and try to grab that lock in
			
 
				+ the delete code.
			
 
				+ If that grab on delete fails, we simply mark it deleted and continue onwards;
			
 
				+ traversal checks for this condition and does the delete when it moves off
			
 
				+ the record.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+If traversal terminates, the dead record may be left indefinitely.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Remove reliability guarantees; see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "traverse-Proposed-Solution"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Complete.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Fcntl Locking Adds Overhead
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Placing a fcntl lock means a system call, as does removing one.
			
 
				+ This is actually one reason why transactions can be faster (everything
			
 
				+ is locked once at transaction start).
			
 
				+ In the uncontended case, this overhead can theoretically be eliminated.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We tried this before with spinlock support, in the early days of TDB, and
			
 
				+ it didn't make much difference except in manufactured benchmarks.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+We could use spinlocks (with futex kernel support under Linux), but it means
			
 
				+ that we lose automatic cleanup when a process dies with a lock.
			
 
				+ There is a method of auto-cleanup under Linux, but it's not supported by
			
 
				+ other operating systems.
			
 
				+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
			
 
				+ on open, but that wouldn't help the normal case of one concurrent opener
			
 
				+ dying.
			
 
				+ Increasingly elaborate repair schemes could be considered, but they require
			
 
				+ an ABI change (everyone must use them) anyway, so there's no need to do
			
 
				+ this at the same time as everything else.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Some Transactions Don't Require Durability
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
			
 
				+ usage, and occasionally empties the results into a transactional TDB.
			
 
				+ This kind of usage prioritizes performance over durability: as long as
			
 
				+ we are consistent, data can be lost.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+This would be more neatly implemented inside tdb: a
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+soft
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ transaction commit (i.e.
			
 
				+ syncless) which meant that data may be reverted on a crash.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+None.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Unfortunately any transaction scheme which overwrites old data requires
			
 
				+ a sync before that overwrite to avoid the possibility of corruption.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+It seems possible to use a scheme similar to that described in
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "sub:TDB-Does-Not"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+, where transactions are committed without overwriting existing data, and
			
 
				+ an array of top-level pointers were available in the header.
			
 
				+ If the transaction is
			
 
				+\begin_inset Quotes eld
			
 
				+\end_inset
			
 
				+
			
 
				+soft
			
 
				+\begin_inset Quotes erd
			
 
				+\end_inset
			
 
				+
			
 
				+ then we would not need a sync at all: existing processes would pick up
			
 
				+ the new hash table and free list and work with that.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+At some later point, a sync would allow recovery of the old data into the
			
 
				+ free lists (perhaps when the array of top-level pointers filled).
			
 
				+ On crash, tdb_open() would examine the array of top levels, and apply the
			
 
				+ transactions until it encountered an invalid checksum.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsection
			
 
				+Tracing Is Fragile, Replay Is External
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+The current TDB has compile-time-enabled tracing code, but it often breaks
			
 
				+ as it is not enabled by default.
			
 
				+ In a similar way, the ctdb code has an external wrapper which does replay
			
 
				+ tracing so it can coordinate cluster-wide transactions.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Proposed Solution
			
 
				+\begin_inset CommandInset label
			
 
				+LatexCommand label
			
 
				+name "replay-attribute"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Tridge points out that an attribute can be later added to tdb_open (see
			
 
				+\begin_inset CommandInset ref
			
 
				+LatexCommand ref
			
 
				+reference "attributes"
			
 
				+
			
 
				+\end_inset
			
 
				+
			
 
				+) to provide replay/trace hooks, which could become the basis for this and
			
 
				+ future parallel transactions and snapshot support.
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Subsubsection
			
 
				+Status
			
 
				+\end_layout
			
 
				+
			
 
				+\begin_layout Standard
			
 
				+Deferred.
			
 
				+\end_layout
			
 
				+
			
 
				+\end_body
			
 
				+\end_document
			
--- a/ccan/ntdb/doc/design.pdf
+++ b/ccan/ntdb/doc/design.pdf
--- a/ccan/ntdb/doc/design.txt
+++ b/ccan/ntdb/doc/design.txt
@@ -0,0 +1,1270 @@
 
				+NTDB: Redesigning The Trivial DataBase
			
 
				+
			
 
				+Rusty Russell, IBM Corporation
			
 
				+
			
 
				+19 June 2012
			
 
				+
			
 
				+Abstract
			
 
				+
			
 
				+The Trivial DataBase on-disk format is 32 bits; with usage cases
			
 
				+heading towards the 4G limit, that must change. This required
			
 
				+breakage provides an opportunity to revisit TDB's other design
			
 
				+decisions and reassess them.
			
 
				+
			
 
				+1 Introduction
			
 
				+
			
 
				+The Trivial DataBase was originally written by Andrew Tridgell as
			
 
				+a simple key/data pair storage system with the same API as dbm,
			
 
				+but allowing multiple readers and writers while being small
			
 
				+enough (< 1000 lines of C) to include in SAMBA. The simple design
			
 
				+created in 1999 has proven surprisingly robust and performant,
			
 
				+used in Samba versions 3 and 4 as well as numerous other
			
 
				+projects. Its useful life was greatly increased by the
			
 
				+(backwards-compatible!) addition of transaction support in 2005.
			
 
				+
			
 
				+The wider variety and greater demands of TDB-using code has led
			
 
				+to some organic growth of the API, as well as some compromises on
			
 
				+the implementation. None of these, by themselves, are seen as
			
 
				+show-stoppers, but the cumulative effect is a loss of elegance
			
 
				+over the initial, simple TDB implementation. Here is a table of
			
 
				+the approximate number of lines of implementation code and number
			
 
				+of API functions at the end of each year:
			
 
				+
			
 
				+
			
 
				++-----------+----------------+--------------------------------+
			
 
				+| Year End  | API Functions  | Lines of C Code Implementation |
			
 
				++-----------+----------------+--------------------------------+
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   1999    |      13        |              1195              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2000    |      24        |              1725              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2001    |      32        |              2228              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2002    |      35        |              2481              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2003    |      35        |              2552              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2004    |      40        |              2584              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2005    |      38        |              2647              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2006    |      52        |              3754              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2007    |      66        |              4398              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2008    |      71        |              4768              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+|   2009    |      73        |              5715              |
			
 
				++-----------+----------------+--------------------------------+
			
 
				+
			
 
				+
			
 
				+This review is an attempt to catalog and address all the known
			
 
				+issues with TDB and create solutions which address the problems
			
 
				+without significantly increasing complexity; all involved are far
			
 
				+too aware of the dangers of second system syndrome in rewriting a
			
 
				+successful project like this.
			
 
				+
			
 
				+Note: the final decision was to make ntdb a separate library,
			
 
				+with a separate 'ntdb' namespace so both can potentially be
			
 
				+linked together. This document still refers to “tdb” everywhere,
			
 
				+for simplicity.
			
 
				+
			
 
				+2 API Issues
			
 
				+
			
 
				+2.1 tdb_open_ex Is Not Expandable
			
 
				+
			
 
				+The tdb_open() call was expanded to tdb_open_ex(), which added an
			
 
				+optional hashing function and an optional logging function
			
 
				+argument. Additional arguments to open would require the
			
 
				+introduction of a tdb_open_ex2 call etc.
			
 
				+
			
 
				+2.1.1 Proposed Solution<attributes>
			
 
				+
			
 
				+tdb_open() will take a linked-list of attributes:
			
 
				+
			
 
				+enum tdb_attribute {
			
 
				+
			
 
				+    TDB_ATTRIBUTE_LOG = 0,
			
 
				+
			
 
				+    TDB_ATTRIBUTE_HASH = 1
			
 
				+
			
 
				+};
			
 
				+
			
 
				+struct tdb_attribute_base {
			
 
				+
			
 
				+    enum tdb_attribute attr;
			
 
				+
			
 
				+    union tdb_attribute *next;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+struct tdb_attribute_log {
			
 
				+
			
 
				+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
			
 
				+*/
			
 
				+
			
 
				+    tdb_log_func log_fn;
			
 
				+
			
 
				+    void *log_private;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+struct tdb_attribute_hash {
			
 
				+
			
 
				+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
			
 
				+*/
			
 
				+
			
 
				+    tdb_hash_func hash_fn;
			
 
				+
			
 
				+    void *hash_private;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+union tdb_attribute {
			
 
				+
			
 
				+    struct tdb_attribute_base base;
			
 
				+
			
 
				+    struct tdb_attribute_log log;
			
 
				+
			
 
				+    struct tdb_attribute_hash hash;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+This allows future attributes to be added, even if this expands
			
 
				+the size of the union.
			
 
				+
			
 
				+2.1.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.2 tdb_traverse Makes Impossible Guarantees
			
 
				+
			
 
				+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
			
 
				+and it was thought that it was important to guarantee that all
			
 
				+records which exist at the start and end of the traversal would
			
 
				+be included, and no record would be included twice.
			
 
				+
			
 
				+This adds complexity (see[Reliable-Traversal-Adds]) and does not
			
 
				+work anyway for records which are altered (in particular, those
			
 
				+which are expanded may be effectively deleted and re-added behind
			
 
				+the traversal).
			
 
				+
			
 
				+2.2.1 <traverse-Proposed-Solution>Proposed Solution
			
 
				+
			
 
				+Abandon the guarantee. You will see every record if no changes
			
 
				+occur during your traversal, otherwise you will see some subset.
			
 
				+You can prevent changes by using a transaction or the locking
			
 
				+API.
			
 
				+
			
 
				+2.2.2 Status
			
 
				+
			
 
				+Complete. Delete-during-traverse will still delete every record,
			
 
				+too (assuming no other changes).
			
 
				+
			
 
				+2.3 Nesting of Transactions Is Fraught
			
 
				+
			
 
				+TDB has alternated between allowing nested transactions and not
			
 
				+allowing them. Various paths in the Samba codebase assume that
			
 
				+transactions will nest, and in a sense they can: the operation is
			
 
				+only committed to disk when the outer transaction is committed.
			
 
				+There are two problems, however:
			
 
				+
			
 
				+1. Canceling the inner transaction will cause the outer
			
 
				+  transaction commit to fail, and will not undo any operations
			
 
				+  since the inner transaction began. This problem is soluble with
			
 
				+  some additional internal code.
			
 
				+
			
 
				+2. An inner transaction commit can be cancelled by the outer
			
 
				+  transaction. This is desirable in the way which Samba's
			
 
				+  database initialization code uses transactions, but could be a
			
 
				+  surprise to any users expecting a successful transaction commit
			
 
				+  to expose changes to others.
			
 
				+
			
 
				+The current solution is to specify the behavior at tdb_open(),
			
 
				+with the default currently that nested transactions are allowed.
			
 
				+This flag can also be changed at runtime.
			
 
				+
			
 
				+2.3.1 Proposed Solution
			
 
				+
			
 
				+Given the usage patterns, it seems that the “least-surprise”
			
 
				+behavior of disallowing nested transactions should become the
			
 
				+default. Additionally, it seems the outer transaction is the only
			
 
				+code which knows whether inner transactions should be allowed, so
			
 
				+a flag to indicate this could be added to tdb_transaction_start.
			
 
				+However, this behavior can be simulated with a wrapper which uses
			
 
				+tdb_add_flags() and tdb_remove_flags(), so the API should not be
			
 
				+expanded for this relatively-obscure case.
			
 
				+
			
 
				+2.3.2 Status
			
 
				+
			
 
				+Complete; the nesting flag has been removed.
			
 
				+
			
 
				+2.4 Incorrect Hash Function is Not Detected
			
 
				+
			
 
				+tdb_open_ex() allows the calling code to specify a different hash
			
 
				+function to use, but does not check that all other processes
			
 
				+accessing this tdb are using the same hash function. The result
			
 
				+is that records are missing from tdb_fetch().
			
 
				+
			
 
				+2.4.1 Proposed Solution
			
 
				+
			
 
				+The header should contain an example hash result (eg. the hash of
			
 
				+0xdeadbeef), and tdb_open_ex() should check that the given hash
			
 
				+function produces the same answer, or fail the tdb_open call.
			
 
				+
			
 
				+2.4.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
			
 
				+
			
 
				+In response to scalability issues with the free list ([TDB-Freelist-Is]
			
 
				+) two API workarounds have been incorporated in TDB:
			
 
				+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
			
 
				+latter actually calls the former with an argument of “5”.
			
 
				+
			
 
				+This code allows deleted records to accumulate without putting
			
 
				+them in the free list. On delete we iterate through each chain
			
 
				+and free them in a batch if there are more than max_dead entries.
			
 
				+These are never otherwise recycled except as a side-effect of a
			
 
				+tdb_repack.
			
 
				+
			
 
				+2.5.1 Proposed Solution
			
 
				+
			
 
				+With the scalability problems of the freelist solved, this API
			
 
				+can be removed. The TDB_VOLATILE flag may still be useful as a
			
 
				+hint that store and delete of records will be at least as common
			
 
				+as fetch in order to allow some internal tuning, but initially
			
 
				+will become a no-op.
			
 
				+
			
 
				+2.5.2 Status
			
 
				+
			
 
				+Complete. Unknown flags cause tdb_open() to fail as well, so they
			
 
				+can be detected at runtime.
			
 
				+
			
 
				+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
			
 
				+  In The Same Process
			
 
				+
			
 
				+No process can open the same TDB twice; we check and disallow it.
			
 
				+This is an unfortunate side-effect of fcntl locks, which operate
			
 
				+on a per-file rather than per-file-descriptor basis, and do not
			
 
				+nest. Thus, closing any file descriptor on a file clears all the
			
 
				+locks obtained by this process, even if they were placed using a
			
 
				+different file descriptor!
			
 
				+
			
 
				+Note that even if this were solved, deadlock could occur if
			
 
				+operations were nested: this is a more manageable programming
			
 
				+error in most cases.
			
 
				+
			
 
				+2.6.1 Proposed Solution
			
 
				+
			
 
				+We could lobby POSIX to fix the perverse rules, or at least lobby
			
 
				+Linux to violate them so that the most common implementation does
			
 
				+not have this restriction. This would be a generally good idea
			
 
				+for other fcntl lock users.
			
 
				+
			
 
				+Samba uses a wrapper which hands out the same tdb_context to
			
 
				+multiple callers if this happens, and does simple reference
			
 
				+counting. We should do this inside the tdb library, which already
			
 
				+emulates lock nesting internally; it would need to recognize when
			
 
				+deadlock occurs within a single process. This would create a new
			
 
				+failure mode for tdb operations (while we currently handle
			
 
				+locking failures, they are impossible in normal use and a process
			
 
				+encountering them can do little but give up).
			
 
				+
			
 
				+I do not see benefit in an additional tdb_open flag to indicate
			
 
				+whether re-opening is allowed, although there may be some
			
 
				+benefit to adding a call to detect when a tdb_context is shared,
			
 
				+to allow others to create such an API.
			
 
				+
			
 
				+2.6.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.7 TDB API Is Not POSIX Thread-safe
			
 
				+
			
 
				+The TDB API uses an error code which can be queried after an
			
 
				+operation to determine what went wrong. This programming model
			
 
				+does not work with threads, unless specific additional guarantees
			
 
				+are given by the implementation. In addition, even
			
 
				+otherwise-independent threads cannot open the same TDB (as in[TDB-Files-Cannot]
			
 
				+).
			
 
				+
			
 
				+2.7.1 Proposed Solution
			
 
				+
			
 
				+Rearchitecting the API to include a tdb_errcode pointer would be a
			
 
				+great deal of churn, but fortunately most functions return 0 on
			
 
				+success and -1 on error: we can change these to return 0 on
			
 
				+success and a negative error code on error, and the API remains
			
 
				+similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
			
 
				+functions need to take a TDB_DATA pointer and return an error
			
 
				+code. It is also simpler to have tdb_nextkey replace its key
			
 
				+argument in place, freeing up any old .dptr.
			
 
				+
			
 
				+Internal locking is required to make sure that fcntl locks do not
			
 
				+overlap between threads, and also that the global list of tdbs is
			
 
				+maintained.
			
 
				+
			
 
				+The aim is that building tdb with -DTDB_PTHREAD will result in a
			
 
				+pthread-safe version of the library, and otherwise no overhead
			
 
				+will exist. Alternatively, a hooking mechanism similar to that
			
 
				+proposed for[Proposed-Solution-locking-hook] could be used to
			
 
				+enable pthread locking at runtime.
			
 
				+
			
 
				+2.7.2 Status
			
 
				+
			
 
				+Incomplete; API has been changed but thread safety has not been
			
 
				+implemented.
			
 
				+
			
 
				+2.8 *_nonblock Functions And *_mark Functions Expose
			
 
				+  Implementation
			
 
				+
			
 
				+CTDB[footnote:
			
 
				+Clustered TDB, see http://ctdb.samba.org
			
 
				+] wishes to operate on TDB in a non-blocking manner. This is
			
 
				+currently done as follows:
			
 
				+
			
 
				+1. Call the _nonblock variant of an API function (eg.
			
 
				+  tdb_lockall_nonblock). If this fails:
			
 
				+
			
 
				+2. Fork a child process, and wait for it to call the normal
			
 
				+  variant (eg. tdb_lockall).
			
 
				+
			
 
				+3. If the child succeeds, call the _mark variant to indicate we
			
 
				+  already have the locks (eg. tdb_lockall_mark).
			
 
				+
			
 
				+4. Upon completion, tell the child to release the locks (eg.
			
 
				+  tdb_unlockall).
			
 
				+
			
 
				+5. Indicate to tdb that it should consider the locks removed (eg.
			
 
				+  tdb_unlockall_mark).
			
 
				+
			
 
				+There are several issues with this approach. Firstly, adding two
			
 
				+new variants of each function clutters the API for an obscure
			
 
				+use, and so not all functions have three variants. Secondly, it
			
 
				+assumes that all paths of the functions ask for the same locks,
			
 
				+otherwise the parent process will have to get a lock which the
			
 
				+child doesn't have under some circumstances. I don't believe this
			
 
				+is currently the case, but it constrains the implementation.
			
 
				+
			
 
				+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
			
 
				+
			
 
				+Implement a hook for locking methods, so that the caller can
			
 
				+control the calls to create and remove fcntl locks. In this
			
 
				+scenario, ctdbd would operate as follows:
			
 
				+
			
 
				+1. Call the normal API function, eg tdb_lockall().
			
 
				+
			
 
				+2. When the lock callback comes in, check if the child has the
			
 
				+  lock. Initially, this is always false. If so, return 0.
			
 
				+  Otherwise, try to obtain it in non-blocking mode. If that
			
 
				+  fails, return EWOULDBLOCK.
			
 
				+
			
 
				+3. Release locks in the unlock callback as normal.
			
 
				+
			
 
				+4. If tdb_lockall() fails, see if we recorded a lock failure; if
			
 
				+  so, call the child to repeat the operation.
			
 
				+
			
 
				+5. The child records what locks it obtains, and returns that
			
 
				+  information to the parent.
			
 
				+
			
 
				+6. When the child has succeeded, goto 1.
			
 
				+
			
 
				+This is flexible enough to handle any potential locking scenario,
			
 
				+even when lock requirements change. It can be optimized so that
			
 
				+the parent does not release locks, just tells the child which
			
 
				+locks it doesn't need to obtain.
			
 
				+
			
 
				+It also keeps the complexity out of the API, and in ctdbd where
			
 
				+it is needed.
			
 
				+
			
 
				+2.8.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.9 tdb_chainlock Functions Expose Implementation
			
 
				+
			
 
				+tdb_chainlock locks some number of records, including the record
			
 
				+indicated by the given key. This gave atomicity guarantees;
			
 
				+no-one can start a transaction, alter, read or delete that key
			
 
				+while the lock is held.
			
 
				+
			
 
				+It also makes the same guarantee for any other key in the chain,
			
 
				+which is an internal implementation detail and potentially a
			
 
				+cause for deadlock.
			
 
				+
			
 
				+2.9.1 Proposed Solution
			
 
				+
			
 
				+None. It would be nice to have an explicit single entry lock
			
 
				+which affected no other keys. Unfortunately, this won't work for
			
 
				+an entry which doesn't exist. Thus while chainlock may be
			
 
				+implemented more efficiently for the existing case, it will still
			
 
				+have overlap issues with the non-existing case. So it is best to
			
 
				+keep the current (lack of) guarantee about which records will be
			
 
				+affected to avoid constraining our implementation.
			
 
				+
			
 
				+2.10 Signal Handling is Not Race-Free
			
 
				+
			
 
				+The tdb_setalarm_sigptr() call allows the caller's signal handler
			
 
				+to indicate that the tdb locking code should return with a
			
 
				+failure, rather than trying again when a signal is received (and
			
 
				+errno == EAGAIN). This is usually used to implement timeouts.
			
 
				+
			
 
				+Unfortunately, this does not work in the case where the signal is
			
 
				+received before the tdb code enters the fcntl() call to place the
			
 
				+lock: the code will sleep within the fcntl() code, unaware that
			
 
				+the signal wants it to exit. In the case of long timeouts, this
			
 
				+does not happen in practice.
			
 
				+
			
 
				+2.10.1 Proposed Solution
			
 
				+
			
 
				+The locking hooks proposed in[Proposed-Solution-locking-hook]
			
 
				+would allow the user to decide on whether to fail the lock
			
 
				+acquisition on a signal. This allows the caller to choose their
			
 
				+own compromise: they could narrow the race by checking
			
 
				+immediately before the fcntl call.[footnote:
			
 
				+It may be possible to make this race-free in some implementations
			
 
				+by having the signal handler alter the struct flock to make it
			
 
				+invalid. This will cause the fcntl() lock call to fail with
			
 
				+EINVAL if the signal occurs before the kernel is entered,
			
 
				+otherwise EAGAIN.
			
 
				+]
			
 
				+
			
 
				+2.10.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.11 The API Uses Gratuitous Typedefs, Capitals
			
 
				+
			
 
				+typedefs are useful for providing source compatibility when types
			
 
				+can differ across implementations, or arguably in the case of
			
 
				+function pointer definitions which are hard for humans to parse.
			
 
				+Otherwise it is simply obfuscation and pollutes the namespace.
			
 
				+
			
 
				+Capitalization is usually reserved for compile-time constants and
			
 
				+macros.
			
 
				+
			
 
				+  TDB_CONTEXT There is no reason to use this over 'struct
			
 
				+  tdb_context'; the definition isn't visible to the API user
			
 
				+  anyway.
			
 
				+
			
 
				+  TDB_DATA There is no reason to use this over struct TDB_DATA;
			
 
				+  the struct needs to be understood by the API user.
			
 
				+
			
 
				+  struct TDB_DATA This would normally be called 'struct
			
 
				+  tdb_data'.
			
 
				+
			
 
				+  enum TDB_ERROR Similarly, this would normally be enum
			
 
				+  tdb_error.
			
 
				+
			
 
				+2.11.1 Proposed Solution
			
 
				+
			
 
				+None. Introducing lower case variants would please pedants like
			
 
				+myself, but if it were done the existing ones should be kept.
			
 
				+There is little point forcing a purely cosmetic change upon tdb
			
 
				+users.
			
 
				+
			
 
				+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
			
 
				+  Private Pointer
			
 
				+
			
 
				+For API compatibility reasons, the logging function needs to call
			
 
				+tdb_get_logging_private() to retrieve the pointer registered by
			
 
				+the tdb_open_ex for logging.
			
 
				+
			
 
				+2.12.1 Proposed Solution
			
 
				+
			
 
				+It should simply take an extra argument, since we are prepared to
			
 
				+break the API/ABI.
			
 
				+
			
 
				+2.12.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.13 Various Callback Functions Are Not Typesafe
			
 
				+
			
 
				+The callback functions in tdb_set_logging_function (after[tdb_log_func-Doesnt-Take]
			
 
				+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
			
 
				+and tdb_check all take void * and must internally convert it to
			
 
				+the argument type they were expecting.
			
 
				+
			
 
				+If this type changes, the compiler will not produce warnings on
			
 
				+the callers, since it only sees void *.
			
 
				+
			
 
				+2.13.1 Proposed Solution
			
 
				+
			
 
				+With careful use of macros, we can create callback functions
			
 
				+which give a warning when used on gcc and the types of the
			
 
				+callback and its private argument differ. Unsupported compilers
			
 
				+will not give a warning, which is no worse than now. In addition,
			
 
				+the callbacks become clearer, as they need not use void * for
			
 
				+their parameter.
			
 
				+
			
 
				+See CCAN's typesafe_cb module at
			
 
				+http://ccan.ozlabs.org/info/typesafe_cb.html
			
 
				+
			
 
				+2.13.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
			
 
				+  tdb_reopen_all Problematic
			
 
				+
			
 
				+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
			
 
				+file should be cleared if the caller discovers it is the only
			
 
				+process with the TDB open. However, if any caller does not
			
 
				+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
			
 
				+the TDB erased underneath them (usually resulting in a crash).
			
 
				+
			
 
				+There is a similar issue on fork(); if the parent exits (or
			
 
				+otherwise closes the tdb) before the child calls tdb_reopen_all()
			
 
				+to establish the lock used to indicate the TDB is opened by
			
 
				+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
			
 
				+it alone has opened the TDB and will erase it.
			
 
				+
			
 
				+2.14.1 Proposed Solution
			
 
				+
			
 
				+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
			
 
				+see[TDB_CLEAR_IF_FIRST-Imposes-Performance].
			
 
				+
			
 
				+2.14.2 Status
			
 
				+
			
 
				+Complete. An open hook is provided to replicate this
			
 
				+functionality if required.
			
 
				+
			
 
				+2.15 Extending The Header Is Difficult
			
 
				+
			
 
				+We have reserved (zeroed) words in the TDB header, which can be
			
 
				+used for future features. If the future features are compulsory,
			
 
				+the version number must be updated to prevent old code from
			
 
				+accessing the database. But if the future feature is optional, we
			
 
				+have no way of telling if older code is accessing the database or
			
 
				+not.
			
 
				+
			
 
				+2.15.1 Proposed Solution
			
 
				+
			
 
				+The header should contain a “format variant” value (64-bit). This
			
 
				+is divided into two 32-bit parts:
			
 
				+
			
 
				+1. The lower part reflects the format variant understood by code
			
 
				+  accessing the database.
			
 
				+
			
 
				+2. The upper part reflects the format variant you must understand
			
 
				+  to write to the database (otherwise you can only open for
			
 
				+  reading).
			
 
				+
			
 
				+The latter field can only be written at creation time, the former
			
 
				+should be written under the OPEN_LOCK when opening the database
			
 
				+for writing, if the variant of the code is lower than the current
			
 
				+lowest variant.
			
 
				+
			
 
				+This should allow backwards-compatible features to be added, and
			
 
				+detection if older code (which doesn't understand the feature)
			
 
				+writes to the database.
			
 
				+
			
 
				+2.15.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.16 Record Headers Are Not Expandable
			
 
				+
			
 
				+If we later want to add (say) checksums on keys and data, it
			
 
				+would require another format change, which we'd like to avoid.
			
 
				+
			
 
				+2.16.1 Proposed Solution
			
 
				+
			
 
				+We often have extra padding at the tail of a record. If we ensure
			
 
				+that the first byte (if any) of this padding is zero, we will
			
 
				+have a way for future changes to detect code which doesn't
			
 
				+understand a new format: the new code would write (say) a 1 at
			
 
				+the tail, and thus if there is no tail or the first byte is 0, we
			
 
				+would know the extension is not present on that record.
			
 
				+
			
 
				+2.16.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+2.17 TDB Does Not Use Talloc
			
 
				+
			
 
				+Many users of TDB (particularly Samba) use the talloc allocator,
			
 
				+and thus have to wrap TDB in a talloc context to use it
			
 
				+conveniently.
			
 
				+
			
 
				+2.17.1 Proposed Solution
			
 
				+
			
 
				+The allocation within TDB is not complicated enough to justify
			
 
				+the use of talloc, and I am reluctant to force another
			
 
				+(excellent) library on TDB users. Nonetheless a compromise is
			
 
				+possible. An attribute (see[attributes]) can be added later to
			
 
				+tdb_open() to provide an alternate allocation mechanism,
			
 
				+specifically for talloc but usable by any other allocator (which
			
 
				+would ignore the “context” argument).
			
 
				+
			
 
				+This would form a talloc hierarchy as expected, but the caller
			
 
				+would still have to attach a destructor to the tdb context
			
 
				+returned from tdb_open to close it. All TDB_DATA fields would be
			
 
				+children of the tdb_context, and the caller would still have to
			
 
				+manage them (using talloc_free() or talloc_steal()).
			
 
				+
			
 
				+2.17.2 Status
			
 
				+
			
 
				+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
			
 
				+
			
 
				+3 Performance And Scalability Issues
			
 
				+
			
 
				+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
			
 
				+  Imposes Performance Penalty
			
 
				+
			
 
				+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
			
 
				+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
			
 
				+never conflict in normal tdb usage, they do add substantial
			
 
				+overhead for most fcntl lock implementations when the kernel
			
 
				+scans to detect if a lock conflict exists. This is often a single
			
 
				+linked list, making the time to acquire and release a fcntl lock
			
 
				+O(N) where N is the number of processes with the TDB open, not
			
 
				+the number actually doing work.
			
 
				+
			
 
				+In a Samba server it is common to have huge numbers of clients
			
 
				+sitting idle, and thus they have weaned themselves off the
			
 
				+TDB_CLEAR_IF_FIRST flag.[footnote:
			
 
				+There is a flag to tdb_reopen_all() which is used for this
			
 
				+optimization: if the parent process will outlive the child, the
			
 
				+child does not need the ACTIVE_LOCK. This is a workaround for
			
 
				+this very performance issue.
			
 
				+]
			
 
				+
			
 
				+3.1.1 Proposed Solution
			
 
				+
			
 
				+Remove the flag. It was a neat idea, but even trivial servers
			
 
				+tend to know when they are initializing for the first time and
			
 
				+can simply unlink the old tdb at that point.
			
 
				+
			
 
				+3.1.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.2 TDB Files Have a 4G Limit
			
 
				+
			
 
				+This seems to be becoming an issue (so much for “trivial”!),
			
 
				+particularly for ldb.
			
 
				+
			
 
				+3.2.1 Proposed Solution
			
 
				+
			
 
				+A new, incompatible TDB format which uses 64 bit offsets
			
 
				+internally rather than 32 bit as now. For simplicity of endian
			
 
				+conversion (which TDB does on the fly if required), all values
			
 
				+will be 64 bit on disk. In practice, some upper bits may be used
			
 
				+for other purposes, but at least 56 bits will be available for
			
 
				+file offsets.
			
 
				+
			
 
				+tdb_open() will automatically detect the old version, and even
			
 
				+create them if TDB_VERSION6 is specified to tdb_open.
			
 
				+
			
 
				+32 bit processes will still be able to access TDBs larger than 4G
			
 
				+(assuming that their off_t allows them to seek to 64 bits), they
			
 
				+will gracefully fall back as they fail to mmap. This can happen
			
 
				+already with large TDBs.
			
 
				+
			
 
				+Old versions of tdb will fail to open the new TDB files (since 28
			
 
				+August 2009, commit 398d0c29290: prior to that any unrecognized
			
 
				+file format would be erased and initialized as a fresh tdb!)
			
 
				+
			
 
				+3.2.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.3 TDB Records Have a 4G Limit
			
 
				+
			
 
				+This has not been a reported problem, and the API uses size_t
			
 
				+which can be 64 bit on 64 bit platforms. However, other limits
			
 
				+may have made such an issue moot.
			
 
				+
			
 
				+3.3.1 Proposed Solution
			
 
				+
			
 
				+Record sizes will be 64 bit, with an error returned on 32 bit
			
 
				+platforms which try to access such records (the current
			
 
				+implementation would return TDB_ERR_OOM in a similar case). It
			
 
				+seems unlikely that 32 bit keys will be a limitation, so the
			
 
				+implementation may not support this (see[sub:Records-Incur-A]).
			
 
				+
			
 
				+3.3.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.4 Hash Size Is Determined At TDB Creation Time
			
 
				+
			
 
				+TDB contains a number of hash chains in the header; the number is
			
 
				+specified at creation time, and defaults to 131. This is such a
			
 
				+bottleneck on large databases (as each hash chain gets quite
			
 
				+long), that LDB uses 10,000 for this hash. In general it is
			
 
				+impossible to know what the 'right' answer is at database
			
 
				+creation time.
			
 
				+
			
 
				+3.4.1 <sub:Hash-Size-Solution>Proposed Solution
			
 
				+
			
 
				+After comprehensive performance testing on various scalable hash
			
 
				+variants[footnote:
			
 
				+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
			
 
				+This was annoying because I was previously convinced that an
			
 
				+expanding tree of hashes would be very close to optimal.
			
 
				+], it became clear that it is hard to beat a straight linear hash
			
 
				+table which doubles in size when it reaches saturation.
			
 
				+Unfortunately, altering the hash table introduces serious locking
			
 
				+complications: the entire hash table needs to be locked to
			
 
				+enlarge the hash table, and others might be holding locks.
			
 
				+Particularly insidious are insertions done under tdb_chainlock.
			
 
				+
			
 
				+Thus an expanding layered hash will be used: an array of hash
			
 
				+groups, with each hash group exploding into pointers to lower
			
 
				+hash groups once it fills, turning into a hash tree. This has
			
 
				+implications for locking: we must lock the entire group in case
			
 
				+we need to expand it, yet we don't know how deep the tree is at
			
 
				+that point.
			
 
				+
			
 
				+Note that bits from the hash table entries should be stolen to
			
 
				+hold more hash bits to reduce the penalty of collisions. We can
			
 
				+use the otherwise-unused lower 3 bits. If we limit the size of
			
 
				+the database to 64 petabytes, we can use the top 8 bits of the
			
 
				+hash entry as well. These 11 bits would reduce false positives
			
 
				+down to 1 in 2000 which is more than we need: we can use one of
			
 
				+the bits to indicate that the extra hash bits are valid. This
			
 
				+means we can choose not to re-hash all entries when we expand a
			
 
				+hash group; simply use the next bits we need and mark them
			
 
				+invalid.
			
 
				+
			
 
				+3.4.2 Status
			
 
				+
			
 
				+Ignore. Scaling the hash automatically proved inefficient at
			
 
				+small hash sizes; we default to an 8192-element hash (changeable
			
 
				+via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
			
 
				+an array of hash entries. This scales slightly better than the
			
 
				+tdb chain (due to the 8 top bits containing extra hash).
			
 
				+
			
 
				+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
			
 
				+
			
 
				+TDB uses a single linked list for the free list. Allocation
			
 
				+occurs as follows, using heuristics which have evolved over time:
			
 
				+
			
 
				+1. Get the free list lock for this whole operation.
			
 
				+
			
 
				+2. Multiply length by 1.25, so we always over-allocate by 25%.
			
 
				+
			
 
				+3. Set the slack multiplier to 1.
			
 
				+
			
 
				+4. Examine the current freelist entry: if it is > length but <
			
 
				+  the current best case, remember it as the best case.
			
 
				+
			
 
				+5. Multiply the slack multiplier by 1.05.
			
 
				+
			
 
				+6. If our best fit so far is less than length * slack multiplier,
			
 
				+  return it. The slack will be turned into a new free record if
			
 
				+  it's large enough.
			
 
				+
			
 
				+7. Otherwise, go onto the next freelist entry.
			
 
				+
			
 
				+Deleting a record occurs as follows:
			
 
				+
			
 
				+1. Lock the hash chain for this whole operation.
			
 
				+
			
 
				+2. Walk the chain to find the record, keeping the prev pointer
			
 
				+  offset.
			
 
				+
			
 
				+3. If max_dead is non-zero:
			
 
				+
			
 
				+  (a) Walk the hash chain again and count the dead records.
			
 
				+
			
 
				+  (b) If it's more than max_dead, bulk free all the dead ones
			
 
				+    (similar to steps 4 and below, but the lock is only obtained
			
 
				+    once).
			
 
				+
			
 
				+  (c) Simply mark this record as dead and return.
			
 
				+
			
 
				+4. Get the free list lock for the remainder of this operation.
			
 
				+
			
 
				+5. <right-merging>Examine the following block to see if it is
			
 
				+  free; if so, enlarge the current block and remove that block
			
 
				+  from the free list. This was disabled, as removal from the free
			
 
				+  list was O(entries-in-free-list).
			
 
				+
			
 
				+6. Examine the preceding block to see if it is free: for this
			
 
				+  reason, each block has a 32-bit tailer which indicates its
			
 
				+  length. If it is free, expand it to cover our new block and
			
 
				+  return.
			
 
				+
			
 
				+7. Otherwise, prepend ourselves to the free list.
			
 
				+
			
 
				+Disabling right-merging (step 5 above) causes
			
 
				+fragmentation; the other heuristics proved insufficient to
			
 
				+address this, so the final answer to this was that when we expand
			
 
				+the TDB file inside a transaction commit, we repack the entire
			
 
				+tdb.
			
 
				+
			
 
				+The single list lock limits our allocation rate; due to the other
			
 
				+issues this is not currently seen as a bottleneck.
			
 
				+
			
 
				+3.5.1 Proposed Solution
			
 
				+
			
 
				+The first step is to remove all the current heuristics, as they
			
 
				+obviously interact, then examine them once the lock contention is
			
 
				+addressed.
			
 
				+
			
 
				+The free list must be split to reduce contention. Assuming
			
 
				+perfect free merging, we can at most have 1 free list entry for
			
 
				+each entry. This implies that the number of free lists is related
			
 
				+to the size of the hash table, but as it is rare to walk a large
			
 
				+number of free list entries we can use far fewer, say 1/32 of the
			
 
				+number of hash buckets.
			
 
				+
			
 
				+It seems tempting to try to reuse the hash implementation which
			
 
				+we use for records here, but we have two ways of searching for
			
 
				+free entries: for allocation we search by size (and possibly
			
 
				+zone) which produces too many clashes for our hash table to
			
 
				+handle well, and for coalescing we search by address. Thus an
			
 
				+array of doubly-linked free lists seems preferable.
			
 
				+
			
 
				+There are various benefits in using per-size free lists (see section 3.6)
			
 
				+but it's not clear this would reduce contention in the common
			
 
				+case where all processes are allocating/freeing the same size.
			
 
				+Thus we almost certainly need to divide in other ways: the most
			
 
				+obvious is to divide the file into zones, and using a free list
			
 
				+(or table of free lists) for each. This approximates address
			
 
				+ordering.
			
 
				+
			
 
				+Unfortunately it is difficult to know what heuristics should be
			
 
				+used to determine zone sizes, and our transaction code relies on
			
 
				+being able to create a “recovery area” by simply appending to the
			
 
				+file (difficult if it would need to create a new zone header).
			
 
				+Thus we use a linked-list of free tables; currently we only ever
			
 
				+create one, but if there is more than one we choose one at random
			
 
				+to use. In future we may use heuristics to add new free tables on
			
 
				+contention. We only expand the file when all free tables are
			
 
				+exhausted.
			
 
				+
			
 
				+The basic algorithm is as follows. Freeing is simple:
			
 
				+
			
 
				+1. Identify the correct free list.
			
 
				+
			
 
				+2. Lock the corresponding list.
			
 
				+
			
 
				+3. Re-check the list (we didn't have a lock, sizes could have
			
 
				+  changed): relock if necessary.
			
 
				+
			
 
				+4. Place the freed entry in the list.
			
 
				+
			
 
				+Allocation is a little more complicated, as we perform delayed
			
 
				+coalescing at this point:
			
 
				+
			
 
				+1. Pick a free table; usually the previous one.
			
 
				+
			
 
				+2. Lock the corresponding list.
			
 
				+
			
 
				+3. If the top entry is large enough, remove it from the list and
			
 
				+  return it.
			
 
				+
			
 
				+4. Otherwise, coalesce entries in the list. If there was no entry
			
 
				+  large enough, unlock the list and try the next largest list.
			
 
				+
			
 
				+5. If no list has an entry which meets our needs, try the next
			
 
				+  free table.
			
 
				+
			
 
				+6. If no free table satisfies, expand the file.
			
 
				+
			
 
				+This optimizes rapid insert/delete of free list entries by not
			
 
				+coalescing them all the time. First-fit address ordering
			
 
				+seems to be fairly good for keeping fragmentation low
			
 
				+(see section 3.6). Note that address ordering
			
 
				+does not need a tailer to coalesce, though if we needed one we
			
 
				+could have one cheaply: see section 3.7.
			
 
				+
			
 
				+Each free entry has the free table number in the header: less
			
 
				+than 255. It also contains a doubly-linked list for easy
			
 
				+deletion.
			
 
				+
			
 
				+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
			
 
				+
			
 
				+Much of this is a result of allocation strategy[footnote:
			
 
				+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
			
 
				+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
			
 
				+] and deliberate hobbling of coalescing; internal fragmentation
			
 
				+(aka overallocation) is deliberately set at 25%, and external
			
 
				+fragmentation is only cured by the decision to repack the entire
			
 
				+db when a transaction commit needs to enlarge the file.
			
 
				+
			
 
				+3.6.1 Proposed Solution
			
 
				+
			
 
				+The 25% overhead on allocation works in practice for ldb because
			
 
				+indexes tend to expand by one record at a time. This internal
			
 
				+fragmentation can be resolved by having an “expanded” bit in the
			
 
				+header to note entries that have previously expanded, and
			
 
				+allocating more space for them.
			
 
				+
			
 
				+There is a spectrum of possible solutions for external
			
 
				+fragmentation: one is to use a fragmentation-avoiding allocation
			
 
				+strategy such as best-fit address-order allocator. The other end
			
 
				+of the spectrum would be to use a bump allocator (very fast and
			
 
				+simple) and simply repack the file when we reach the end.
			
 
				+
			
 
				+There are three problems with efficient fragmentation-avoiding
			
 
				+allocators: they are non-trivial, they tend to use a single free
			
 
				+list for each size, and there's no evidence that tdb allocation
			
 
				+patterns will match those recorded for general allocators (though
			
 
				+it seems likely).
			
 
				+
			
 
				+Thus we don't spend too much effort on external fragmentation; we
			
 
				+will be no worse than the current code if we need to repack on
			
 
				+occasion. More effort is spent on reducing freelist contention,
			
 
				+and reducing overhead.
			
 
				+
			
 
				+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
			
 
				+
			
 
				+Each TDB record has a header as follows:
			
 
				+
			
 
				+struct tdb_record {
			
 
				+
			
 
				+        tdb_off_t next; /* offset of the next record in the list
			
 
				+*/
			
 
				+
			
 
				+        tdb_len_t rec_len; /* total byte length of record */
			
 
				+
			
 
				+        tdb_len_t key_len; /* byte length of key */
			
 
				+
			
 
				+        tdb_len_t data_len; /* byte length of data */
			
 
				+
			
 
				+        uint32_t full_hash; /* the full 32 bit hash of the key */
			
 
				+
			
 
				+        uint32_t magic;   /* try to catch errors */
			
 
				+
			
 
				+        /* the following union is implied:
			
 
				+
			
 
				+                union {
			
 
				+
			
 
				+                        char record[rec_len];
			
 
				+
			
 
				+                        struct {
			
 
				+
			
 
				+                                char key[key_len];
			
 
				+
			
 
				+                                char data[data_len];
			
 
				+
			
 
				+                        }
			
 
				+
			
 
				+                        uint32_t totalsize; (tailer)
			
 
				+
			
 
				+                }
			
 
				+
			
 
				+        */
			
 
				+
			
 
				+};
			
 
				+
			
 
				+Naively, this would double to a 56-byte overhead on a 64 bit
			
 
				+implementation.
			
 
				+
			
 
				+3.7.1 Proposed Solution
			
 
				+
			
 
				+We can use various techniques to reduce this for an allocated
			
 
				+block:
			
 
				+
			
 
				+1. The 'next' pointer is not required, as we are using a flat
			
 
				+  hash table.
			
 
				+
			
 
				+2. 'rec_len' can instead be expressed as an addition to key_len
			
 
				+  and data_len (it accounts for wasted or overallocated length in
			
 
				+  the record). Since the record length is always a multiple of 8,
			
 
				+  we can conveniently fit it in 32 bits (representing up to 35
			
 
				+  bits).
			
 
				+
			
 
				+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
			
 
				+  restrict 'data_len' to 32 bits, but instead we can combine the
			
 
				+  two into one 64-bit field and use a 5 bit value which
			
 
				+  indicates at what bit to divide the two. Keys are unlikely to
			
 
				+  scale as fast as data, so I'm assuming a maximum key size of 32
			
 
				+  bits.
			
 
				+
			
 
				+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
			
 
				+  this is diminishing returns after a handful of bits (at 10
			
 
				+  bits, it reduces 99.9% of false memcmp). As an aside, as the
			
 
				+  lower bits are already incorporated in the hash table
			
 
				+  resolution, the upper bits should be used here. Note that it's
			
 
				+  not clear that these bits will be a win, given the extra bits
			
 
				+  in the hash table itself (see section 3.4.1).
			
 
				+
			
 
				+5. 'magic' does not need to be enlarged: it currently reflects
			
 
				+  one of 5 values (used, free, dead, recovery, and
			
 
				+  unused_recovery). It is useful for quick sanity checking
			
 
				+  however, and should not be eliminated.
			
 
				+
			
 
				+6. 'tailer' is only used to coalesce free blocks (so a block to
			
 
				+  the right can find the header to check if this block is free).
			
 
				+  This can be replaced by a single 'free' bit in the header of
			
 
				+  the following block (and the tailer only exists in free
			
 
				+  blocks).[footnote:
			
 
				+This technique from Thomas Standish. Data Structure Techniques.
			
 
				+Addison-Wesley, Reading, Massachusetts, 1980.
			
 
				+] The current proposed coalescing algorithm doesn't need this,
			
 
				+  however.
			
 
				+
			
 
				+This produces a 16 byte used header like this:
			
 
				+
			
 
				+struct tdb_used_record {
			
 
				+
			
 
				+        uint32_t used_magic : 16,
			
 
				+
			
 
				+
			
 
				+
			
 
				+                 key_data_divide: 5,
			
 
				+
			
 
				+                 top_hash: 11;
			
 
				+
			
 
				+        uint32_t extra_octets;
			
 
				+
			
 
				+        uint64_t key_and_data_len;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+And a free record like this:
			
 
				+
			
 
				+struct tdb_free_record {
			
 
				+
			
 
				+        uint64_t free_magic: 8,
			
 
				+
			
 
				+                   prev : 56;
			
 
				+
			
 
				+
			
 
				+
			
 
				+        uint64_t free_table: 8,
			
 
				+
			
 
				+                 total_length : 56;
			
 
				+
			
 
				+        uint64_t next;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+Note that by limiting valid offsets to 56 bits, we can pack
			
 
				+everything we need into 3 64-bit words, meaning our minimum
			
 
				+record size is 8 bytes.
			
 
				+
			
 
				+3.7.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.8 Transaction Commit Requires 4 fdatasync
			
 
				+
			
 
				+The current transaction algorithm is:
			
 
				+
			
 
				+1. write_recovery_data();
			
 
				+
			
 
				+2. sync();
			
 
				+
			
 
				+3. write_recovery_header();
			
 
				+
			
 
				+4. sync();
			
 
				+
			
 
				+5. overwrite_with_new_data();
			
 
				+
			
 
				+6. sync();
			
 
				+
			
 
				+7. remove_recovery_header();
			
 
				+
			
 
				+8. sync();
			
 
				+
			
 
				+On current ext3, each sync flushes all data to disk, so the next
			
 
				+3 syncs are relatively inexpensive. But this could become a
			
 
				+performance bottleneck on other filesystems such as ext4.
			
 
				+
			
 
				+3.8.1 Proposed Solution
			
 
				+
			
 
				+Neil Brown points out that this is overzealous, and only one sync
			
 
				+is needed:
			
 
				+
			
 
				+1. Bundle the recovery data, a transaction counter and a strong
			
 
				+  checksum of the new data.
			
 
				+
			
 
				+2. Strong checksum that whole bundle.
			
 
				+
			
 
				+3. Store the bundle in the database.
			
 
				+
			
 
				+4. Overwrite the oldest of the two recovery pointers in the
			
 
				+  header (identified using the transaction counter) with the
			
 
				+  offset of this bundle.
			
 
				+
			
 
				+5. sync.
			
 
				+
			
 
				+6. Write the new data to the file.
			
 
				+
			
 
				+Checking for recovery means identifying the latest bundle with a
			
 
				+valid checksum and using the new data checksum to ensure that it
			
 
				+has been applied. This is more expensive than the current check,
			
 
				+but need only be done at open. For running databases, a separate
			
 
				+header field can be used to indicate a transaction in progress;
			
 
				+we need only check for recovery if this is set.
			
 
				+
			
 
				+3.8.2 Status
			
 
				+
			
 
				+Deferred.
			
 
				+
			
 
				+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
			
 
				+
			
 
				+3.9.1 Proposed Solution
			
 
				+
			
 
				+None. At some point you say “use a real database” (but see section 3.15.1).
			
 
				+
			
 
				+But as a thought experiment, if we implemented transactions to
			
 
				+only overwrite free entries (this is tricky: there must not be a
			
 
				+header in each entry which indicates whether it is free, but use
			
 
				+of presence in metadata elsewhere), and a pointer to the hash
			
 
				+table, we could create an entirely new commit without destroying
			
 
				+existing data. Then it would be easy to implement snapshots in a
			
 
				+similar way.
			
 
				+
			
 
				+This would not allow arbitrary changes to the database, such as
			
 
				+tdb_repack does, and would require more space (since we have to
			
 
				+preserve the current and future entries at once). If we used hash
			
 
				+trees rather than one big hash table, we might only have to
			
 
				+rewrite some sections of the hash, too.
			
 
				+
			
 
				+We could then implement snapshots using a similar method, using
			
 
				+multiple different hash tables/free tables.
			
 
				+
			
 
				+3.9.2 Status
			
 
				+
			
 
				+Deferred.
			
 
				+
			
 
				+3.10 Transactions Cannot Operate in Parallel
			
 
				+
			
 
				+This would be useless for ldb, as it hits the index records with
			
 
				+just about every update. It would add significant complexity in
			
 
				+resolving clashes, and cause all the transaction callers to write
			
 
				+their code to loop in the case where the transactions spuriously
			
 
				+failed.
			
 
				+
			
 
				+3.10.1 Proposed Solution
			
 
				+
			
 
				+None (but see section 3.15.1). We could solve a small part of
			
 
				+the problem by providing read-only transactions. These would
			
 
				+allow one write transaction to begin, but it could not commit
			
 
				+until all r/o transactions are done. This would require a new
			
 
				+RO_TRANSACTION_LOCK, which would be upgraded on commit.
			
 
				+
			
 
				+3.10.2 Status
			
 
				+
			
 
				+Deferred.
			
 
				+
			
 
				+3.11 Default Hash Function Is Suboptimal
			
 
				+
			
 
				+The Knuth-inspired multiplicative hash used by tdb is fairly slow
			
 
				+(especially if we expand it to 64 bits), and works best when the
			
 
				+hash bucket size is a prime number (which also means a slow
			
 
				+modulus). In addition, it is highly predictable which could
			
 
				+potentially lead to a Denial of Service attack in some TDB uses.
			
 
				+
			
 
				+3.11.1 Proposed Solution
			
 
				+
			
 
				+The Jenkins lookup3 hash[footnote:
			
 
				+http://burtleburtle.net/bob/c/lookup3.c
			
 
				+] is a fast and superbly-mixing hash. It's used by the Linux
			
 
				+kernel and almost everything else. This has the particular
			
 
				+properties that it takes an initial seed, and produces two 32 bit
			
 
				+hash numbers, which we can combine into a 64-bit hash.
			
 
				+
			
 
				+The seed should be created at tdb-creation time from some random
			
 
				+source, and placed in the header. This is far from foolproof, but
			
 
				+adds a little bit of protection against hash bombing.
			
 
				+
			
 
				+3.11.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
			
 
				+
			
 
				+We lock a record during traversal iteration, and try to grab that
			
 
				+lock in the delete code. If that grab on delete fails, we simply
			
 
				+mark it deleted and continue onwards; traversal checks for this
			
 
				+condition and does the delete when it moves off the record.
			
 
				+
			
 
				+If traversal terminates, the dead record may be left
			
 
				+indefinitely.
			
 
				+
			
 
				+3.12.1 Proposed Solution
			
 
				+
			
 
				+Remove reliability guarantees; see[traverse-Proposed-Solution].
			
 
				+
			
 
				+3.12.2 Status
			
 
				+
			
 
				+Complete.
			
 
				+
			
 
				+3.13 Fcntl Locking Adds Overhead
			
 
				+
			
 
				+Placing a fcntl lock means a system call, as does removing one.
			
 
				+This is actually one reason why transactions can be faster
			
 
				+(everything is locked once at transaction start). In the
			
 
				+uncontended case, this overhead can theoretically be eliminated.
			
 
				+
			
 
				+3.13.1 Proposed Solution
			
 
				+
			
 
				+None.
			
 
				+
			
 
				+We tried this before with spinlock support, in the early days of
			
 
				+TDB, and it didn't make much difference except in manufactured
			
 
				+benchmarks.
			
 
				+
			
 
				+We could use spinlocks (with futex kernel support under Linux),
			
 
				+but it means that we lose automatic cleanup when a process dies
			
 
				+with a lock. There is a method of auto-cleanup under Linux, but
			
 
				+it's not supported by other operating systems. We could
			
 
				+reintroduce a clear-if-first-style lock and sweep for dead
			
 
				+futexes on open, but that wouldn't help the normal case of one
			
 
				+concurrent opener dying. Increasingly elaborate repair schemes
			
 
				+could be considered, but they require an ABI change (everyone
			
 
				+must use them) anyway, so there's no need to do this at the same
			
 
				+time as everything else.
			
 
				+
			
 
				+3.14 Some Transactions Don't Require Durability
			
 
				+
			
 
				+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
			
 
				+normal (fast) usage, and occasionally empties the results into a
			
 
				+transactional TDB. This kind of usage prioritizes performance
			
 
				+over durability: as long as we are consistent, data can be lost.
			
 
				+
			
 
				+This would be more neatly implemented inside tdb: a “soft”
			
 
				+transaction commit (ie. syncless) which meant that data may be
			
 
				+reverted on a crash.
			
 
				+
			
 
				+3.14.1 Proposed Solution
			
 
				+
			
 
				+None.
			
 
				+
			
 
				+Unfortunately any transaction scheme which overwrites old data
			
 
				+requires a sync before that overwrite to avoid the possibility of
			
 
				+corruption.
			
 
				+
			
 
				+It seems possible to use a scheme similar to that described in section 3.9,
			
 
				+where transactions are committed without overwriting existing
			
 
				+data, and an array of top-level pointers were available in the
			
 
				+header. If the transaction is “soft” then we would not need a sync
			
 
				+at all: existing processes would pick up the new hash table and
			
 
				+free list and work with that.
			
 
				+
			
 
				+At some later point, a sync would allow recovery of the old data
			
 
				+into the free lists (perhaps when the array of top-level pointers
			
 
				+filled). On crash, tdb_open() would examine the array of top
			
 
				+levels, and apply the transactions until it encountered an
			
 
				+invalid checksum.
			
 
				+
			
 
				+3.15 Tracing Is Fragile, Replay Is External
			
 
				+
			
 
				+The current TDB has compile-time-enabled tracing code, but it
			
 
				+often breaks as it is not enabled by default. In a similar way,
			
 
				+the ctdb code has an external wrapper which does replay tracing
			
 
				+so it can coordinate cluster-wide transactions.
			
 
				+
			
 
				+3.15.1 Proposed Solution<replay-attribute>
			
 
				+
			
 
				+Tridge points out that an attribute can be later added to
			
 
				+tdb_open (see[attributes]) to provide replay/trace hooks, which
			
 
				+could become the basis for this and future parallel transactions
			
 
				+and snapshot support.
			
 
				+
			
 
				+3.15.2 Status
			
 
				+
			
 
				+Deferred.
			
--- a/ccan/ntdb/free.c
+++ b/ccan/ntdb/free.c
@@ -0,0 +1,972 @@
 
				+ /*
			
 
				+   Trivial Database 2: free list/block handling
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/likely/likely.h>
			
 
				+#include <ccan/ilog/ilog.h>
			
 
				+#include <time.h>
			
 
				+#include <limits.h>
			
 
				+
			
 
				+/* Thin wrapper: hands val to ilog64() (ccan/ilog) and returns the
				+ * result as an unsigned value. */
				+static unsigned fls64(uint64_t val)
				+{
				+	unsigned int top_bit = ilog64(val);
				+
				+	return top_bit;
				+}
			
 
				+
			
 
				+/* Map a record's data length (header not counted) to the index of the
				+ * free bucket that holds records of that size. */
				+unsigned int size_to_bucket(ntdb_len_t data_len)
				+{
				+	ntdb_len_t excess;
				+	unsigned int idx;
				+
				+	/* Nothing smaller than the minimum record can exist. */
				+	assert(data_len >= NTDB_MIN_DATA_LEN);
				+
				+	/* Everything below works on the size above that minimum. */
				+	excess = data_len - NTDB_MIN_DATA_LEN;
				+
				+	if (excess > 64) {
				+		/* Past the linear range, buckets go by power of 2. */
				+		idx = fls64(excess) + 2;
				+	} else {
				+		/* Linear range: 0 -> bucket 0, 8 -> bucket 1 ...
				+		 * 64 -> bucket 8. */
				+		idx = excess / 8;
				+	}
				+
				+	/* Anything beyond the table shares the last bucket. */
				+	if (unlikely(idx >= NTDB_FREE_BUCKETS))
				+		idx = NTDB_FREE_BUCKETS - 1;
				+	return idx;
				+}
			
 
				+
			
 
				+/* Read the free_table field of the database header, i.e. the offset
				+ * of the first free table.  The ntdb_read_off() result is passed back
				+ * unchanged, so callers test it with NTDB_OFF_IS_ERR(). */
				+ntdb_off_t first_ftable(struct ntdb_context *ntdb)
				+{
				+	const ntdb_off_t field = offsetof(struct ntdb_header, free_table);
				+
				+	return ntdb_read_off(ntdb, field);
				+}
			
 
				+
			
 
				+/* Read the 'next' field of the free table at ftable, i.e. the offset
				+ * of the table chained after it.  The ntdb_read_off() result is passed
				+ * back unchanged; callers in this file treat 0 as end of chain. */
				+ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
				+{
				+	const ntdb_off_t field = ftable
				+		+ offsetof(struct ntdb_freetable, next);
				+
				+	return ntdb_read_off(ntdb, field);
				+}
			
 
				+
			
 
				+/* Choose the free table this context will use, picking one of the
				+ * chained tables at random.  On success ntdb->ftable_off holds that
				+ * table's offset and ntdb->ftable its position in the chain.  Returns
				+ * the decoded error if walking the chain fails. */
				+enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
				+{
				+	unsigned int draw, best_draw = 0, idx = 0;
				+	ntdb_off_t cur;
				+
				+	cur = first_ftable(ntdb);
				+	ntdb->ftable_off = cur;
				+	ntdb->ftable = 0;
				+
				+	/* Every table gets one random draw; the table holding the highest
				+	 * draw so far is remembered (a tie goes to the later table). */
				+	for (; cur != 0; cur = next_ftable(ntdb, cur), idx++) {
				+		if (NTDB_OFF_IS_ERR(cur)) {
				+			return NTDB_OFF_TO_ERR(cur);
				+		}
				+
				+		draw = random();
				+		if (draw < best_draw)
				+			continue;
				+
				+		ntdb->ftable_off = cur;
				+		ntdb->ftable = idx;
				+		best_draw = draw;
				+	}
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/* File offset of slot 'bucket' in the buckets array of the free table
				+ * located at ftable_off. */
				+ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
				+{
				+	ntdb_off_t slot = ftable_off;
				+
				+	slot += offsetof(struct ntdb_freetable, buckets);
				+	slot += bucket * sizeof(ntdb_off_t);
				+	return slot;
				+}
			
 
				+
			
 
				+/* Ask ntdb_find_nonzero_off() for a non-zero bucket slot in the table
				+ * at ftable_off, starting at index 'bucket' and bounded by
				+ * NTDB_FREE_BUCKETS.  The result is handed back as-is: get_free()
				+ * treats a value below NTDB_FREE_BUCKETS as a bucket to try and checks
				+ * anything else with NTDB_OFF_IS_ERR(). */
				+static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
				+				ntdb_off_t ftable_off,
				+				ntdb_off_t bucket)
				+{
				+	/* The search is speculative: it runs on the base of the bucket
				+	 * array and this function takes no bucket lock. */
				+	const ntdb_off_t base = bucket_off(ftable_off, 0);
				+
				+	return ntdb_find_nonzero_off(ntdb, base, bucket, NTDB_FREE_BUCKETS);
				+}
			
 
				+
			
 
				+/* Debug-only sanity check of the free list headed at b_off; compiles
				+ * to nothing unless CCAN_NTDB_DEBUG is defined.  Aborts if a record
				+ * lacks the free magic, if a non-head record's prev is not the record
				+ * visited before it, or if the head's prev is not the last record. */
				+static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
				+{
				+#ifdef CCAN_NTDB_DEBUG
				+	struct ntdb_free_record rec;
				+	ntdb_off_t head, cur, last = 0;
				+
				+	/* Only the offset bits of the bucket word name the head. */
				+	head = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
				+
				+	for (cur = head; cur != 0; cur = rec.next) {
				+		ntdb_read_convert(ntdb, cur, &rec, sizeof(rec));
				+		if (frec_magic(&rec) != NTDB_FREE_MAGIC)
				+			abort();
				+		if (last && frec_prev(&rec) != last)
				+			abort();
				+		last = cur;
				+	}
				+
				+	/* The head's prev has to name the final record we reached. */
				+	if (head) {
				+		ntdb_read_convert(ntdb, head, &rec, sizeof(rec));
				+		if (frec_prev(&rec) != last)
				+			abort();
				+	}
				+#endif
				+}
			
 
				+
			
 
				+/* Unlink the free record at r_off (contents in *r) from the bucket
				+ * list headed at b_off.
				+ *
				+ * The list layout this relies on: the head's prev names the tail, the
				+ * tail's next is 0, and the bucket word carries extra bits outside
				+ * NTDB_OFF_MASK which must survive a head change.
				+ *
				+ * Returns NTDB_SUCCESS, the error from a failed read or write, or the
				+ * result of logging NTDB_ERR_CORRUPT when the list is inconsistent. */
				+static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
				+				       ntdb_off_t b_off, ntdb_off_t r_off,
				+				       const struct ntdb_free_record *r)
				+{
				+	ntdb_off_t link_off, before_next, bucket;
				+	enum NTDB_ERROR ecode;
				+
				+	/* A record whose prev is itself is alone: just empty the bucket. */
				+	if (frec_prev(r) == r_off)
				+		return ntdb_write_off(ntdb, b_off, 0);
				+
				+	/* link_off = &r->prev->next; fetch what is stored there now. */
				+	link_off = frec_prev(r) + offsetof(struct ntdb_free_record, next);
				+	before_next = ntdb_read_off(ntdb, link_off);
				+	if (NTDB_OFF_IS_ERR(before_next))
				+		return NTDB_OFF_TO_ERR(before_next);
				+
				+	if (before_next != 0) {
				+		/* Ordinary case: r->prev->next = r->next */
				+		ecode = ntdb_write_off(ntdb, link_off, r->next);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+	} else {
				+		/* prev->next == 0 means prev is the tail, so we were the
				+		 * head: the bucket has to point at our successor. */
				+		bucket = ntdb_read_off(ntdb, b_off);
				+		if (NTDB_OFF_IS_ERR(bucket))
				+			return NTDB_OFF_TO_ERR(bucket);
				+
				+		if ((bucket & NTDB_OFF_MASK) != r_off) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+					  "remove_from_list:"
				+					  " %llu head %llu on list %llu",
				+					  (long long)r_off,
				+					  (long long)bucket,
				+					  (long long)b_off);
				+		}
				+		/* Replace the offset bits only; the upper bits are kept. */
				+		bucket = ((bucket & ~NTDB_OFF_MASK) | r->next);
				+		ecode = ntdb_write_off(ntdb, b_off, bucket);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+	}
				+
				+	/* Now locate the prev field which currently refers to us. */
				+	if (r->next != 0) {
				+		/* link_off = &r->next->prev */
				+		link_off = r->next + offsetof(struct ntdb_free_record,
				+					      magic_and_prev);
				+	} else {
				+		/* We were the tail, so it is the (current) head's prev. */
				+		bucket = ntdb_read_off(ntdb, b_off);
				+		if (NTDB_OFF_IS_ERR(bucket))
				+			return NTDB_OFF_TO_ERR(bucket);
				+		bucket &= NTDB_OFF_MASK;
				+		link_off = bucket + offsetof(struct ntdb_free_record,
				+					     magic_and_prev);
				+	}
				+
				+#ifdef CCAN_NTDB_DEBUG
				+	/* *link_off == r */
				+	if ((ntdb_read_off(ntdb, link_off) & NTDB_OFF_MASK) != r_off) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				  "remove_from_list:"
				+				  " %llu bad prev in list %llu",
				+				  (long long)r_off, (long long)b_off);
				+	}
				+#endif
				+	/* That record inherits our prev (magic bits included). */
				+	return ntdb_write_off(ntdb, link_off, r->magic_and_prev);
				+}
			
 
				+
			
 
				+/* Push the record at 'off' (data length 'len') onto the front of the
				+ * free bucket at b_off.
				+ *
				+ * If *coalesce is true on entry, the enqueue counter kept in the bucket
				+ * word's bits outside NTDB_OFF_MASK is advanced (bit 63 is always
				+ * cleared), and *coalesce is then set to whether that counter wrapped
				+ * to zero, which is the caller's cue to coalesce.  If it is false on
				+ * entry the counter is not advanced and *coalesce is left alone.
				+ *
				+ * Returns NTDB_SUCCESS, the error of a failed read/write, or the result
				+ * of logging NTDB_ERR_CORRUPT if the current head lacks the free magic. */
				+static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
				+				      ntdb_off_t b_off,
				+				      ntdb_off_t off,
				+				      ntdb_len_t len,
				+				      bool *coalesce)
				+{
				+	struct ntdb_free_record new;
				+	enum NTDB_ERROR ecode;
				+	ntdb_off_t prev, head;
				+	uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));
				+
				+	head = ntdb_read_off(ntdb, b_off);
				+	if (NTDB_OFF_IS_ERR(head))
				+		return NTDB_OFF_TO_ERR(head);
				+
				+	/* Tag the record with our free table number and its length; the
				+	 * next and magic_and_prev fields are filled in below. */
				+	new.ftable_and_len = ((uint64_t)ntdb->ftable
				+			      << (64 - NTDB_OFF_UPPER_STEAL))
				+		| len;
				+
				+	/* new->next = head. */
				+	new.next = (head & NTDB_OFF_MASK);
				+
				+	/* First element?  Prev points to ourselves. */
				+	if (!new.next) {
				+		new.magic_and_prev = (magic | off);
				+	} else {
				+		/* new->prev = next->prev */
				+		prev = ntdb_read_off(ntdb,
				+				    new.next + offsetof(struct ntdb_free_record,
				+							magic_and_prev));
				+		new.magic_and_prev = prev;
				+		if (frec_magic(&new) != NTDB_FREE_MAGIC) {
				+			/* A failed read hands back an error code, not
				+			 * record contents: report that failure itself
				+			 * rather than claiming the list is corrupt. */
				+			if (NTDB_OFF_IS_ERR(prev))
				+				return NTDB_OFF_TO_ERR(prev);
				+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+					  "enqueue_in_free: %llu bad head"
				+					  " prev %llu",
				+					  (long long)new.next,
				+					  (long long)prev);
				+		}
				+		/* next->prev = new. */
				+		ecode = ntdb_write_off(ntdb, new.next
				+				      + offsetof(struct ntdb_free_record,
				+						 magic_and_prev),
				+				      off | magic);
				+		if (ecode != NTDB_SUCCESS) {
				+			return ecode;
				+		}
				+
				+#ifdef CCAN_NTDB_DEBUG
				+		/* The old head's prev is the tail, whose next must be 0. */
				+		prev = ntdb_read_off(ntdb, frec_prev(&new)
				+				    + offsetof(struct ntdb_free_record, next));
				+		if (prev != 0) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+					  "enqueue_in_free:"
				+					  " %llu bad tail next ptr %llu",
				+					  (long long)frec_prev(&new)
				+					  + offsetof(struct ntdb_free_record,
				+						     next),
				+					  (long long)prev);
				+		}
				+#endif
				+	}
				+
				+	/* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
				+	if (*coalesce)
				+		head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
				+	head &= ~(NTDB_OFF_MASK | (1ULL << 63));
				+	head |= off;
				+
				+	ecode = ntdb_write_off(ntdb, b_off, head);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	/* It's time to coalesce if counter wrapped. */
				+	if (*coalesce)
				+		*coalesce = ((head & ~NTDB_OFF_MASK) == 0);
				+
				+	/* Finally write the new record itself. */
				+	return ntdb_write_convert(ntdb, off, &new, sizeof(new));
				+}
			
 
				+
			
 
				+/* Offset of free table number 'ftable'.  The context's own table is
				+ * answered from the cached ntdb->ftable_off; any other index is found
				+ * by walking the chain from the first table, stopping early if an
				+ * error value turns up (which is then what gets returned). */
				+static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
				+{
				+	ntdb_off_t cur;
				+	unsigned int hops = ftable;
				+
				+	if (likely(ntdb->ftable == ftable))
				+		return ntdb->ftable_off;
				+
				+	cur = first_ftable(ntdb);
				+	while (hops > 0 && !NTDB_OFF_IS_ERR(cur)) {
				+		cur = next_ftable(ntdb, cur);
				+		hops--;
				+	}
				+	return cur;
				+}
			
 
				+
			
 
				+/* Try to merge the free record at 'off' (data length data_len, sitting
				+ * in the locked bucket b_off) with the free records that directly
				+ * follow it in the file.
				+ *
				+ * Returns 0 if nothing adjacent could be merged, the usable length of
				+ * the enlarged record on success, or an error packed with
				+ * NTDB_ERR_TO_OFF().
				+ *
				+ * Locking: on error, bucket b_off has been unlocked.  On a successful
				+ * merge it has also been unlocked whenever *protect holds an error
				+ * value on return; *protect is overwritten with such a value if the
				+ * record it named was consumed or moved, or if the lock had to be
				+ * dropped. */
				+static ntdb_len_t coalesce(struct ntdb_context *ntdb,
				+			  ntdb_off_t off, ntdb_off_t b_off,
				+			  ntdb_len_t data_len,
				+			  ntdb_off_t *protect)
				+{
				+	const ntdb_off_t orig_end
				+		= off + sizeof(struct ntdb_used_record) + data_len;
				+	ntdb_off_t end = orig_end;
				+	ntdb_len_t merged_len;
				+	struct ntdb_free_record rec;
				+	enum NTDB_ERROR ecode;
				+
				+	ntdb->stats.alloc_coalesce_tried++;
				+
				+	/* Swallow following records for as long as they are free. */
				+	while (end < ntdb->file->map_size) {
				+		const struct ntdb_free_record *peek;
				+		ntdb_off_t nbr_b_off;
				+		unsigned nbr_ftable, nbr_bucket;
				+
				+		/* Unlocked peek, just to learn which bucket to lock. */
				+		peek = ntdb_access_read(ntdb, end, sizeof(*peek), true);
				+		if (NTDB_PTR_IS_ERR(peek)) {
				+			ecode = NTDB_PTR_ERR(peek);
				+			goto err;
				+		}
				+
				+		if (frec_magic(peek) != NTDB_FREE_MAGIC
				+		    || frec_ftable(peek) == NTDB_FTABLE_NONE) {
				+			ntdb_access_release(ntdb, peek);
				+			break;
				+		}
				+
				+		nbr_ftable = frec_ftable(peek);
				+		nbr_bucket = size_to_bucket(frec_len(peek));
				+		nbr_b_off = ftable_offset(ntdb, nbr_ftable);
				+		if (NTDB_OFF_IS_ERR(nbr_b_off)) {
				+			ntdb_access_release(ntdb, peek);
				+			ecode = NTDB_OFF_TO_ERR(nbr_b_off);
				+			goto err;
				+		}
				+		nbr_b_off = bucket_off(nbr_b_off, nbr_bucket);
				+		ntdb_access_release(ntdb, peek);
				+
				+		/* This may break lock ordering, hence only a non-blocking
				+		 * attempt: give up quietly if it fails. */
				+		if (ntdb_lock_free_bucket(ntdb, nbr_b_off, NTDB_LOCK_NOWAIT)
				+		    != NTDB_SUCCESS) {
				+			ntdb->stats.alloc_coalesce_lockfail++;
				+			break;
				+		}
				+
				+		/* Holding the lock, read the record again for real. */
				+		ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
				+		if (ecode != NTDB_SUCCESS) {
				+			ntdb_unlock_free_bucket(ntdb, nbr_b_off);
				+			goto err;
				+		}
				+
				+		/* It changed between the peek and the lock: a race. */
				+		if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)
				+		    || unlikely(frec_ftable(&rec) != nbr_ftable)
				+		    || unlikely(size_to_bucket(frec_len(&rec))
				+				!= nbr_bucket)) {
				+			ntdb->stats.alloc_coalesce_race++;
				+			ntdb_unlock_free_bucket(ntdb, nbr_b_off);
				+			break;
				+		}
				+
				+		/* The caller's protected record is about to vanish. */
				+		if (end == *protect) {
				+			ntdb->stats.alloc_coalesce_iterate_clash++;
				+			*protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
				+		}
				+
				+		ecode = remove_from_list(ntdb, nbr_b_off, end, &rec);
				+		check_list(ntdb, nbr_b_off);
				+		if (ecode != NTDB_SUCCESS) {
				+			ntdb_unlock_free_bucket(ntdb, nbr_b_off);
				+			goto err;
				+		}
				+
				+		end += sizeof(struct ntdb_used_record) + frec_len(&rec);
				+		ntdb_unlock_free_bucket(ntdb, nbr_b_off);
				+		ntdb->stats.alloc_coalesce_num_merged++;
				+	}
				+
				+	/* Nothing adjacent was free: report that no merge happened. */
				+	if (end == orig_end)
				+		return 0;
				+
				+	/* The initial record is about to move lists; flag that if it is
				+	 * the one the caller wanted protected. */
				+	if (off == *protect) {
				+		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
				+		ntdb->stats.alloc_coalesce_iterate_clash++;
				+	}
				+
				+	/* Re-read the initial record and make sure it is as expected. */
				+	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
				+	if (ecode != NTDB_SUCCESS) {
				+		goto err;
				+	}
				+
				+	if (frec_len(&rec) != data_len) {
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+				   "coalesce: expected data len %zu not %zu",
				+				   (size_t)data_len, (size_t)frec_len(&rec));
				+		goto err;
				+	}
				+
				+	ecode = remove_from_list(ntdb, b_off, off, &rec);
				+	check_list(ntdb, b_off);
				+	if (ecode != NTDB_SUCCESS) {
				+		goto err;
				+	}
				+
				+	merged_len = end - off - sizeof(struct ntdb_used_record);
				+
				+	/* First try without blocking (we may be violating lock order),
				+	 * and with coalescing off so that we never recurse. */
				+	ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
				+	if (ecode == NTDB_SUCCESS) {
				+		/* For simplicity, the lock is always dropped when the
				+		 * caller cannot carry on iterating. */
				+		if (NTDB_OFF_IS_ERR(*protect))
				+			ntdb_unlock_free_bucket(ntdb, b_off);
				+	} else {
				+		/* Our lock has to go, so nothing stays stable. */
				+		ntdb->stats.alloc_coalesce_lockfail++;
				+		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);
				+
				+		/* While unlocked, mark the record NTDB_FTABLE_NONE so
				+		 * that nobody else tries to coalesce it. */
				+		rec.ftable_and_len = (NTDB_FTABLE_NONE
				+				      << (64 - NTDB_OFF_UPPER_STEAL))
				+			| merged_len;
				+		ecode = ntdb_write_off(ntdb,
				+				      off + offsetof(struct ntdb_free_record,
				+						     ftable_and_len),
				+				      rec.ftable_and_len);
				+		if (ecode != NTDB_SUCCESS) {
				+			goto err;
				+		}
				+
				+		ntdb_unlock_free_bucket(ntdb, b_off);
				+
				+		/* Now it is safe to wait for the target bucket. */
				+		ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
				+					false);
				+		if (ecode != NTDB_SUCCESS) {
				+			return NTDB_ERR_TO_OFF(ecode);
				+		}
				+	}
				+	ntdb->stats.alloc_coalesce_succeeded++;
				+
				+	/* Usable length of the enlarged record. */
				+	return merged_len;
				+
				+err:
				+	/* Single error exit: the bucket is *always* unlocked here. */
				+	ntdb_unlock_free_bucket(ntdb, b_off);
				+	return NTDB_ERR_TO_OFF(ecode);
				+}
			
 
				+
			
 
				+/* Coalesce records on the free list at b_off.  The list is locked on
				+ * entry and is unlocked by the time this returns.
				+ *
				+ * Up to 'limit' records are examined; the budget is topped up after
				+ * every coalesce() call according to the size it reports.  If the walk
				+ * stops before the end of the list, the list is rotated so that the
				+ * record we stopped at becomes the head and the records already
				+ * examined end up at the tail. */
				+static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
				+				    ntdb_off_t ftable_off,
				+				    ntdb_off_t b_off,
				+				    unsigned int limit)
				+{
				+	struct ntdb_free_record old_head, new_head, old_tail, new_tail;
				+	ntdb_off_t old_head_off, old_tail_off, new_tail_off;
				+	enum NTDB_ERROR ecode;
				+	ntdb_off_t cur;
				+
				+	cur = ntdb_read_off(ntdb, b_off);
				+	if (NTDB_OFF_IS_ERR(cur)) {
				+		ecode = NTDB_OFF_TO_ERR(cur);
				+		goto unlock_err;
				+	}
				+	/* Paranoia: the counter bits ought to be 0 already. */
				+	cur &= NTDB_OFF_MASK;
				+
				+	while (cur && limit--) {
				+		struct ntdb_free_record rec;
				+		ntdb_len_t merged;
				+		ntdb_off_t following;
				+
				+		ecode = ntdb_read_convert(ntdb, cur, &rec, sizeof(rec));
				+		if (ecode != NTDB_SUCCESS)
				+			goto unlock_err;
				+
				+		/* coalesce() turns 'following' into an error value if
				+		 * we cannot continue from it. */
				+		following = rec.next;
				+		merged = coalesce(ntdb, cur, b_off, frec_len(&rec),
				+				  &following);
				+		if (NTDB_OFF_IS_ERR(merged)) {
				+			/* coalesce() unlocked the list for us. */
				+			return NTDB_OFF_TO_ERR(merged);
				+		}
				+		if (NTDB_OFF_IS_ERR(following)) {
				+			/* The lock was dropped while coalescing: stop. */
				+			return NTDB_SUCCESS;
				+		}
				+		/* Good results earn a few more iterations. */
				+		limit += size_to_bucket(merged / 16 + NTDB_MIN_DATA_LEN);
				+		cur = following;
				+	}
				+
				+	/* Reached the end of the list: nothing to rotate. */
				+	if (!cur)
				+		goto out;
				+
				+	/* Otherwise move what we examined to the tail, so that a later
				+	 * pass starts with something else. */
				+
				+	/* The record we stopped at becomes the new head. */
				+	ecode = ntdb_read_convert(ntdb, cur, &new_head, sizeof(new_head));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	/* Its predecessor becomes the new tail. */
				+	new_tail_off = frec_prev(&new_head);
				+	ecode = ntdb_read_convert(ntdb, new_tail_off, &new_tail,
				+				 sizeof(new_tail));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	/* Fetch the current (old) head from the bucket. */
				+	old_head_off = ntdb_read_off(ntdb, b_off);
				+	if (NTDB_OFF_IS_ERR(old_head_off)) {
				+		ecode = NTDB_OFF_TO_ERR(old_head_off);
				+		goto unlock_err;
				+	}
				+
				+	/* Can happen if everything before us coalesced away. */
				+	if (old_head_off == cur)
				+		goto out;
				+
				+	ecode = ntdb_read_convert(ntdb, old_head_off, &old_head,
				+				 sizeof(old_head));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	/* The old head's prev names the old tail. */
				+	old_tail_off = frec_prev(&old_head);
				+	ecode = ntdb_read_convert(ntdb, old_tail_off, &old_tail,
				+				 sizeof(old_tail));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	/* Old tail is now followed by the old head... */
				+	old_tail.next = old_head_off;
				+
				+	/* ...whose prev refers back to the old tail... */
				+	old_head.magic_and_prev
				+		= (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
				+		| old_tail_off;
				+
				+	/* ...and the new tail terminates the list. */
				+	new_tail.next = 0;
				+
				+	/* Write back the three modified records. */
				+	ecode = ntdb_write_convert(ntdb, old_tail_off, &old_tail,
				+				  sizeof(old_tail));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	ecode = ntdb_write_convert(ntdb, old_head_off, &old_head,
				+				  sizeof(old_head));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	ecode = ntdb_write_convert(ntdb, new_tail_off, &new_tail,
				+				  sizeof(new_tail));
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+	/* Last of all, point the bucket at the new head. */
				+	ecode = ntdb_write_off(ntdb, b_off, cur);
				+	if (ecode != NTDB_SUCCESS)
				+		goto unlock_err;
				+
				+out:
				+	ntdb_unlock_free_bucket(ntdb, b_off);
				+	return NTDB_SUCCESS;
				+
				+unlock_err:
				+	ntdb_unlock_free_bucket(ntdb, b_off);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Put the region at 'off', len_with_header bytes long including its
				+ * record header, onto the matching bucket of the current free table.
				+ *
				+ * The bucket is locked with 'waitflag' and is unlocked again before
				+ * returning.  The caller must not already hold that list's lock when
				+ * coalesce_ok is set; in that case a coalescing pass over the list is
				+ * run whenever enqueue_in_free() asks for one. */
				+enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
				+			       ntdb_off_t off, ntdb_len_t len_with_header,
				+			       enum ntdb_lock_flags waitflag,
				+			       bool coalesce_ok)
				+{
				+	enum NTDB_ERROR ecode;
				+	ntdb_len_t payload;
				+	ntdb_off_t b_off;
				+
				+	assert(len_with_header >= sizeof(struct ntdb_free_record));
				+
				+	/* Buckets are selected by length without the header. */
				+	payload = len_with_header - sizeof(struct ntdb_used_record);
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(payload));
				+
				+	ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	/* coalesce_ok comes back true only if a pass is now due. */
				+	ecode = enqueue_in_free(ntdb, b_off, off, payload, &coalesce_ok);
				+	check_list(ntdb, b_off);
				+
				+	if (ecode || !coalesce_ok) {
				+		ntdb_unlock_free_bucket(ntdb, b_off);
				+		return ecode;
				+	}
				+
				+	/* coalesce_list() releases the bucket lock itself. */
				+	return coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
				+}
			
 
				+
			
 
				+/* Space to reserve for a key of keylen plus data of datalen bytes:
				+ * never less than NTDB_MIN_DATA_LEN, and rounded up to a multiple of
				+ * sizeof(uint64_t). */
				+static size_t adjust_size(size_t keylen, size_t datalen)
				+{
				+	const unsigned long long align_mask = sizeof(uint64_t) - 1ULL;
				+	size_t wanted = keylen + datalen;
				+
				+	wanted = (wanted < NTDB_MIN_DATA_LEN) ? NTDB_MIN_DATA_LEN : wanted;
				+
				+	/* Bump up to the next uint64_t boundary. */
				+	return (wanted + align_mask) & ~align_mask;
				+}
			
 
				+
			
 
				+/* How many bytes of a total_len-byte record could be split off as a
				+ * separate free record once key and data are stored in it?  With
				+ * want_extra, half the data length again is set aside first.  Returns
				+ * 0 when the remainder could not hold a struct ntdb_free_record. */
				+static size_t record_leftover(size_t keylen, size_t datalen,
				+			      bool want_extra, size_t total_len)
				+{
				+	const ssize_t smallest = (ssize_t)sizeof(struct ntdb_free_record);
				+	ssize_t spare;
				+
				+	if (want_extra)
				+		datalen += datalen / 2;
				+
				+	/* Signed on purpose: the subtraction may go below zero. */
				+	spare = total_len - adjust_size(keylen, datalen);
				+
				+	return (spare < smallest) ? 0 : spare;
				+}
			
 
				+
			
 
				+/* Try to allocate room for a key of keylen and data of datalen bytes
				+ * from one bucket ('bucket' within the free table at ftable_off).
				+ *
				+ * The bucket is locked for the duration.  The list is scanned for the
				+ * smallest record that is large enough, accepting a looser fit the
				+ * further the scan gets.  The chosen record is unlinked, given a used
				+ * header carrying 'magic', and any worthwhile remainder is returned to
				+ * the free lists.
				+ *
				+ * Returns the record's offset, 0 if the bucket had nothing suitable,
				+ * or an error packed with NTDB_ERR_TO_OFF(). */
				+static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
				+				ntdb_off_t ftable_off,
				+				ntdb_off_t bucket,
				+				size_t keylen, size_t datalen,
				+				bool want_extra,
				+				unsigned magic)
				+{
				+	ntdb_off_t cur, b_off, pick_off = 0;
				+	struct ntdb_free_record pick = { 0 };
				+	struct ntdb_used_record hdr;
				+	/* Callers expecting growth start with 50% slack. */
				+	double slack = want_extra ? 1.5 : 1.0;
				+	size_t need = adjust_size(keylen, datalen);
				+	size_t spare;
				+	enum NTDB_ERROR ecode;
				+
				+	ntdb->stats.allocs++;
				+	b_off = bucket_off(ftable_off, bucket);
				+
				+	/* FIXME: Try non-blocking wait first, to measure contention. */
				+	ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
				+	if (ecode != NTDB_SUCCESS) {
				+		return NTDB_ERR_TO_OFF(ecode);
				+	}
				+
				+	/* Seed the candidate with all bits set, so that the first
				+	 * record which fits replaces it. */
				+	pick.ftable_and_len = -1ULL;
				+
				+	cur = ntdb_read_off(ntdb, b_off);
				+	if (NTDB_OFF_IS_ERR(cur)) {
				+		ecode = NTDB_OFF_TO_ERR(cur);
				+		goto unlock_err;
				+	}
				+	cur &= NTDB_OFF_MASK;
				+
				+	/* Scan the list, growing less fussy with every step. */
				+	while (cur) {
				+		const struct ntdb_free_record *r;
				+		ntdb_off_t following;
				+
				+		r = ntdb_access_read(ntdb, cur, sizeof(*r), true);
				+		if (NTDB_PTR_IS_ERR(r)) {
				+			ecode = NTDB_PTR_ERR(r);
				+			goto unlock_err;
				+		}
				+
				+		if (frec_magic(r) != NTDB_FREE_MAGIC) {
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				+					   "lock_and_alloc:"
				+					   " %llu non-free 0x%llx",
				+					   (long long)cur,
				+					   (long long)r->magic_and_prev);
				+			ntdb_access_release(ntdb, r);
				+			goto unlock_err;
				+		}
				+
				+		/* Big enough, and tighter than the current candidate? */
				+		if (frec_len(r) >= need && frec_len(r) < frec_len(&pick)) {
				+			pick_off = cur;
				+			pick = *r;
				+		}
				+
				+		/* Stop once the candidate is within the allowed slack. */
				+		if (pick_off && frec_len(&pick) <= need * slack) {
				+			ntdb_access_release(ntdb, r);
				+			break;
				+		}
				+
				+		slack *= 1.01;
				+
				+		following = r->next;
				+		ntdb_access_release(ntdb, r);
				+		cur = following;
				+	}
				+
				+	/* Nothing in this bucket can hold the record. */
				+	if (!pick_off) {
				+		ntdb_unlock_free_bucket(ntdb, b_off);
				+		return 0;
				+	}
				+
				+	/* Whatever we found is what we use: unlink it. */
				+	ecode = remove_from_list(ntdb, b_off, pick_off, &pick);
				+	check_list(ntdb, b_off);
				+	if (ecode != NTDB_SUCCESS) {
				+		goto unlock_err;
				+	}
				+
				+	spare = record_leftover(keylen, datalen, want_extra,
				+				frec_len(&pick));
				+
				+	assert(keylen + datalen + spare <= frec_len(&pick));
				+	/* The used header must be on disk before the lock goes,
				+	 * otherwise coalesce() could try to merge this record! */
				+	ecode = set_header(ntdb, &hdr, magic, keylen, datalen,
				+			   frec_len(&pick) - spare);
				+	if (ecode != NTDB_SUCCESS) {
				+		goto unlock_err;
				+	}
				+
				+	ecode = ntdb_write_convert(ntdb, pick_off, &hdr, sizeof(hdr));
				+	if (ecode != NTDB_SUCCESS) {
				+		goto unlock_err;
				+	}
				+
				+	/* Futureproofing: a 0 byte goes into any unused space. */
				+	if (rec_extra_padding(&hdr)) {
				+		ecode = ntdb->io->twrite(ntdb, pick_off + sizeof(hdr)
				+					+ keylen + datalen, "", 1);
				+		if (ecode != NTDB_SUCCESS) {
				+			goto unlock_err;
				+		}
				+	}
				+
				+	/* The remainder's bucket is <= the current one, so nested
				+	 * locking is allowed. */
				+	if (spare) {
				+		ntdb->stats.alloc_leftover++;
				+		ecode = add_free_record(ntdb,
				+					pick_off + sizeof(hdr)
				+					+ frec_len(&pick) - spare,
				+					spare, NTDB_LOCK_WAIT, false);
				+		if (ecode != NTDB_SUCCESS) {
				+			pick_off = NTDB_ERR_TO_OFF(ecode);
				+		}
				+	}
				+	ntdb_unlock_free_bucket(ntdb, b_off);
				+
				+	return pick_off;
				+
				+unlock_err:
				+	ntdb_unlock_free_bucket(ntdb, b_off);
				+	return NTDB_ERR_TO_OFF(ecode);
				+}
			
 
				+
			
 
				+/* Allocate a block for keylen + datalen bytes from the free tables.
				+ *
				+ * Starts with the context's current free table and works through the
				+ * chain (wrapping to the first table) until it is back where it
				+ * started.  In each table, buckets are tried from the one matching the
				+ * requested size upwards.  The table that satisfied the request
				+ * becomes the context's current table.
				+ *
				+ * Returns the block's offset, 0 if no table had room, or an error
				+ * value as recognised by NTDB_OFF_IS_ERR(). */
				+static ntdb_off_t get_free(struct ntdb_context *ntdb,
				+			  size_t keylen, size_t datalen, bool want_extra,
				+			  unsigned magic)
				+{
				+	/* A record expected to grow is sized 50% larger, so that the
				+	 * search begins in a higher bucket. */
				+	const size_t sized_data = want_extra ? datalen + datalen / 2
				+					     : datalen;
				+	ntdb_off_t got, table_off, table_idx;
				+	ntdb_off_t first_b, b;
				+	bool wrapped = false;
				+
				+	first_b = size_to_bucket(adjust_size(keylen, sized_data));
				+
				+	table_off = ntdb->ftable_off;
				+	table_idx = ntdb->ftable;
				+	while (!wrapped || table_off != ntdb->ftable_off) {
				+		/* Begin at the exact-size bucket and search upwards. */
				+		for (b = find_free_head(ntdb, table_off, first_b);
				+		     b < NTDB_FREE_BUCKETS;
				+		     b = find_free_head(ntdb, table_off, b + 1)) {
				+			got = lock_and_alloc(ntdb, table_off,
				+					     b, keylen, datalen, want_extra,
				+					     magic);
				+			if (NTDB_OFF_IS_ERR(got))
				+				return got;
				+
				+			/* Nothing usable in this bucket: try the next. */
				+			if (got == 0)
				+				continue;
				+
				+			if (b == first_b)
				+				ntdb->stats.alloc_bucket_exact++;
				+			if (b == NTDB_FREE_BUCKETS - 1)
				+				ntdb->stats.alloc_bucket_max++;
				+			/* Success: keep using this table from now on. */
				+			ntdb->ftable_off = table_off;
				+			ntdb->ftable = table_idx;
				+			return got;
				+		}
				+
				+		/* The bucket search may have ended on an error. */
				+		if (NTDB_OFF_IS_ERR(b)) {
				+			return b;
				+		}
				+
				+		/* This table is exhausted; move on to the next one. */
				+		table_off = next_ftable(ntdb, table_off);
				+		if (NTDB_OFF_IS_ERR(table_off)) {
				+			return table_off;
				+		}
				+		table_idx++;
				+
				+		/* End of the chain: start again from the first table. */
				+		if (table_off == 0) {
				+			wrapped = true;
				+			table_off = first_ftable(ntdb);
				+			if (NTDB_OFF_IS_ERR(table_off)) {
				+				return table_off;
				+			}
				+			table_idx = 0;
				+		}
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Fill in the used-record header *rec for a record with the given
				+ * magic, key length, data length and total (actual) length.
				+ *
				+ * The padding (actuallen minus key and data lengths), a key-width
				+ * field and the magic are packed into magic_and_meta; both lengths
				+ * share key_and_data_len.  The header is decoded again afterwards, and
				+ * if any value fails to round-trip the result of logging NTDB_ERR_IO
				+ * is returned; otherwise NTDB_SUCCESS. */
				+enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
				+			  struct ntdb_used_record *rec,
				+			  unsigned magic, uint64_t keylen, uint64_t datalen,
				+			  uint64_t actuallen)
				+{
				+	const uint64_t padding = actuallen - (keylen + datalen);
				+	/* The key is given twice this many bits in key_and_data_len. */
				+	const uint64_t keybits = (fls64(keylen) + 1) / 2;
				+
				+	rec->magic_and_meta = padding << 11;
				+	rec->magic_and_meta |= keybits << 43;
				+	rec->magic_and_meta |= (uint64_t)magic << 48;
				+
				+	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
				+
				+	/* Everything survived the encoding: done. */
				+	if (rec_key_length(rec) == keylen
				+	    && rec_data_length(rec) == datalen
				+	    && rec_extra_padding(rec) == padding) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/* Encoding can fail on big values. */
				+	return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			  "Could not encode k=%llu,d=%llu,a=%llu",
				+			  (long long)keylen, (long long)datalen,
				+			  (long long)actuallen);
				+}
			
 
				+
			
 
				+/* You need 'size', this tells you how much you should expand by. */
			
 
				+ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
			
 
				+{
			
 
				+	ntdb_off_t new_size, top_size;
			
 
				+
			
 
				+	/* limit size in order to avoid using up huge amounts of memory for
			
 
				+	 * in memory tdbs if an oddball huge record creeps in */
			
 
				+	if (size > 100 * 1024) {
			
 
				+		top_size = map_size + size * 2;
			
 
				+	} else {
			
 
				+		top_size = map_size + size * 100;
			
 
				+	}
			
 
				+
			
 
				+	/* always make room for at least top_size more records, and at
			
 
				+	   least 25% more space. if the DB is smaller than 100MiB,
			
 
				+	   otherwise grow it by 10% only. */
			
 
				+	if (map_size > 100 * 1024 * 1024) {
			
 
				+		new_size = map_size * 1.10;
			
 
				+	} else {
			
 
				+		new_size = map_size * 1.25;
			
 
				+	}
			
 
				+
			
 
				+	if (new_size < top_size)
			
 
				+		new_size = top_size;
			
 
				+
			
 
				+	/* We always make the file a multiple of transaction page
			
 
				+	 * size.  This guarantees that the transaction recovery area
			
 
				+	 * is always aligned, otherwise the transaction code can overwrite
			
 
				+	 * itself. */
			
 
				+	new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
			
 
				+	return new_size - map_size;
			
 
				+}
			
 
				+
			
 
				+/* Expand the database. */
			
 
				+static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
			
 
				+{
			
 
				+	uint64_t old_size;
			
 
				+	ntdb_len_t wanted;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
			
 
				+	if (!(ntdb->flags & NTDB_NOLOCK)
			
 
				+	    && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
			
 
				+				  "ntdb_expand: must hold lock during expand");
			
 
				+	}
			
 
				+
			
 
				+	/* Only one person can expand file at a time. */
			
 
				+	ecode = ntdb_lock_expand(ntdb, F_WRLCK);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* Someone else may have expanded the file, so retry. */
			
 
				+	old_size = ntdb->file->map_size;
			
 
				+	ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
			
 
				+	if (ntdb->file->map_size != old_size) {
			
 
				+		ntdb_unlock_expand(ntdb, F_WRLCK);
			
 
				+		return NTDB_SUCCESS;
			
 
				+	}
			
 
				+
			
 
				+	/* We need room for the record header too. */
			
 
				+	size = adjust_size(0, sizeof(struct ntdb_used_record) + size);
			
 
				+	/* Overallocate. */
			
 
				+	wanted = ntdb_expand_adjust(old_size, size);
			
 
				+
			
 
				+	ecode = ntdb->io->expand_file(ntdb, wanted);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		ntdb_unlock_expand(ntdb, F_WRLCK);
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* We need to drop this lock before adding free record. */
			
 
				+	ntdb_unlock_expand(ntdb, F_WRLCK);
			
 
				+
			
 
				+	ntdb->stats.expands++;
			
 
				+	return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
			
 
				+}
			
 
				+
			
 
				+/* This won't fail: it will expand the database if it has to. */
			
 
				+ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
			
 
				+		 unsigned magic, bool growing)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+
			
 
				+	for (;;) {
			
 
				+		enum NTDB_ERROR ecode;
			
 
				+		off = get_free(ntdb, keylen, datalen, growing, magic);
			
 
				+		if (likely(off != 0))
			
 
				+			break;
			
 
				+
			
 
				+		ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return NTDB_ERR_TO_OFF(ecode);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return off;
			
 
				+}
			
--- a/ccan/ntdb/hash.c
+++ b/ccan/ntdb/hash.c
@@ -0,0 +1,624 @@
 
				+ /*
			
 
				+   Trivial Database 2: hash handling
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/hash/hash.h>
			
 
				+
			
 
				+/* Default hash function. */
			
 
				+uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
			
 
				+			  void *unused)
			
 
				+{
			
 
				+	return hash_stable((const unsigned char *)key, length, seed);
			
 
				+}
			
 
				+
			
 
				+uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
			
 
				+{
			
 
				+	return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
			
 
				+}
			
 
				+
			
 
				+static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
			
 
				+				 const struct ntdb_used_record *rec,
			
 
				+				 ntdb_off_t off,
			
 
				+				 const NTDB_DATA *key,
			
 
				+				 const char **rptr)
			
 
				+{
			
 
				+	ntdb_bool_err ret = false;
			
 
				+	const char *rkey;
			
 
				+
			
 
				+	if (rec_key_length(rec) != key->dsize) {
			
 
				+		ntdb->stats.compare_wrong_keylen++;
			
 
				+		return ret;
			
 
				+	}
			
 
				+
			
 
				+	rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
			
 
				+				key->dsize + rec_data_length(rec), false);
			
 
				+	if (NTDB_PTR_IS_ERR(rkey)) {
			
 
				+		return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
			
 
				+	}
			
 
				+	if (memcmp(rkey, key->dptr, key->dsize) == 0) {
			
 
				+		if (rptr) {
			
 
				+			*rptr = rkey;
			
 
				+		} else {
			
 
				+			ntdb_access_release(ntdb, rkey);
			
 
				+		}
			
 
				+		return true;
			
 
				+	}
			
 
				+	ntdb->stats.compare_wrong_keycmp++;
			
 
				+	ntdb_access_release(ntdb, rkey);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/* Does entry match? */
			
 
				+static ntdb_bool_err match(struct ntdb_context *ntdb,
			
 
				+			   uint32_t hash,
			
 
				+			   const NTDB_DATA *key,
			
 
				+			   ntdb_off_t val,
			
 
				+			   struct ntdb_used_record *rec,
			
 
				+			   const char **rptr)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	ntdb->stats.compares++;
			
 
				+
			
 
				+	/* Top bits of offset == next bits of hash. */
			
 
				+	if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
			
 
				+	    != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
			
 
				+		ntdb->stats.compare_wrong_offsetbits++;
			
 
				+		return false;
			
 
				+	}
			
 
				+
			
 
				+	off = val & NTDB_OFF_MASK;
			
 
				+	ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return (ntdb_bool_err)ecode;
			
 
				+	}
			
 
				+
			
 
				+	return key_matches(ntdb, rec, off, key, rptr);
			
 
				+}
			
 
				+
			
 
				+static bool is_chain(ntdb_off_t val)
			
 
				+{
			
 
				+	return val & (1ULL << NTDB_OFF_CHAIN_BIT);
			
 
				+}
			
 
				+
			
 
				+static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
			
 
				+{
			
 
				+	return base + sizeof(struct ntdb_used_record)
			
 
				+		+ idx * sizeof(ntdb_off_t);
			
 
				+}
			
 
				+
			
 
				+/* This is the core routine which searches the hashtable for an entry.
			
 
				+ * On error, no locks are held and -ve is returned.
			
 
				+ * Otherwise, hinfo is filled in.
			
 
				+ * If not found, the return value is 0.
			
 
				+ * If found, the return value is the offset, and *rec is the record. */
			
 
				+ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
			
 
				+			 NTDB_DATA key,
			
 
				+			 int ltype,
			
 
				+			 struct hash_info *h,
			
 
				+			 struct ntdb_used_record *rec,
			
 
				+			 const char **rptr)
			
 
				+{
			
 
				+	ntdb_off_t off, val;
			
 
				+	const ntdb_off_t *arr = NULL;
			
 
				+	ntdb_len_t i;
			
 
				+	bool found_empty;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	struct ntdb_used_record chdr;
			
 
				+	ntdb_bool_err berr;
			
 
				+
			
 
				+	h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
			
 
				+
			
 
				+	h->table = NTDB_HASH_OFFSET;
			
 
				+	h->table_size = 1 << ntdb->hash_bits;
			
 
				+	h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
			
 
				+	h->old_val = 0;
			
 
				+
			
 
				+	ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return NTDB_ERR_TO_OFF(ecode);
			
 
				+	}
			
 
				+
			
 
				+	off = hbucket_off(h->table, h->bucket);
			
 
				+	val = ntdb_read_off(ntdb, off);
			
 
				+	if (NTDB_OFF_IS_ERR(val)) {
			
 
				+		ecode = NTDB_OFF_TO_ERR(val);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	/* Directly in hash table? */
			
 
				+	if (!likely(is_chain(val))) {
			
 
				+		if (val) {
			
 
				+			berr = match(ntdb, h->h, &key, val, rec, rptr);
			
 
				+			if (berr < 0) {
			
 
				+				ecode = NTDB_OFF_TO_ERR(berr);
			
 
				+				goto fail;
			
 
				+			}
			
 
				+			if (berr) {
			
 
				+				return val & NTDB_OFF_MASK;
			
 
				+			}
			
 
				+			/* If you want to insert here, make a chain. */
			
 
				+			h->old_val = val;
			
 
				+		}
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	/* Nope?  Iterate through chain. */
			
 
				+	h->table = val & NTDB_OFF_MASK;
			
 
				+
			
 
				+	ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+				    NTDB_LOG_ERROR,
			
 
				+				    "find_and_lock:"
			
 
				+				    " corrupt record %#x at %llu",
			
 
				+				    rec_magic(&chdr), (long long)off);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
			
 
				+
			
 
				+	arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
			
 
				+			       rec_data_length(&chdr), true);
			
 
				+	if (NTDB_PTR_IS_ERR(arr)) {
			
 
				+		ecode = NTDB_PTR_ERR(arr);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	found_empty = false;
			
 
				+	for (i = 0; i < h->table_size; i++) {
			
 
				+		if (arr[i] == 0) {
			
 
				+			if (!found_empty) {
			
 
				+				h->bucket = i;
			
 
				+				found_empty = true;
			
 
				+			}
			
 
				+		} else {
			
 
				+			berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
			
 
				+			if (berr < 0) {
			
 
				+				ecode = NTDB_OFF_TO_ERR(berr);
			
 
				+				ntdb_access_release(ntdb, arr);
			
 
				+				goto fail;
			
 
				+			}
			
 
				+			if (berr) {
			
 
				+				/* We found it! */
			
 
				+				h->bucket = i;
			
 
				+				off = arr[i] & NTDB_OFF_MASK;
			
 
				+				ntdb_access_release(ntdb, arr);
			
 
				+				return off;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	if (!found_empty) {
			
 
				+		/* Set to any non-zero value */
			
 
				+		h->old_val = 1;
			
 
				+		h->bucket = i;
			
 
				+	}
			
 
				+
			
 
				+	ntdb_access_release(ntdb, arr);
			
 
				+	return 0;
			
 
				+
			
 
				+fail:
			
 
				+	ntdb_unlock_hash(ntdb, h->bucket, ltype);
			
 
				+	return NTDB_ERR_TO_OFF(ecode);
			
 
				+}
			
 
				+
			
 
				+static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
			
 
				+				ntdb_off_t new_off, uint32_t hash)
			
 
				+{
			
 
				+	ntdb_off_t extra;
			
 
				+
			
 
				+	assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
			
 
				+	assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
			
 
				+	/* We pack extra hash bits into the upper bits of the offset. */
			
 
				+	extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
			
 
				+	extra <<= (64 - NTDB_OFF_UPPER_STEAL);
			
 
				+
			
 
				+	return new_off | extra;
			
 
				+}
			
 
				+
			
 
				+/* Simply overwrite the hash entry we found before. */
			
 
				+enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
			
 
				+				const struct hash_info *h,
			
 
				+				ntdb_off_t new_off)
			
 
				+{
			
 
				+	return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
			
 
				+			      encode_offset(ntdb, new_off, h->h));
			
 
				+}
			
 
				+
			
 
				+enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
			
 
				+				 const struct hash_info *h)
			
 
				+{
			
 
				+	return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
			
 
				+			    const struct hash_info *h,
			
 
				+			    ntdb_off_t new_off)
			
 
				+{
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	ntdb_off_t chain;
			
 
				+	struct ntdb_used_record chdr;
			
 
				+	const ntdb_off_t *old;
			
 
				+	ntdb_off_t *new;
			
 
				+
			
 
				+	/* We hit an empty bucket during search?  That's where it goes. */
			
 
				+	if (!h->old_val) {
			
 
				+		return replace_in_hash(ntdb, h, new_off);
			
 
				+	}
			
 
				+
			
 
				+	/* Full at top-level?  Create a 2-element chain. */
			
 
				+	if (h->table == NTDB_HASH_OFFSET) {
			
 
				+		ntdb_off_t pair[2];
			
 
				+
			
 
				+		/* One element is old value, the other is the new value. */
			
 
				+		pair[0] = h->old_val;
			
 
				+		pair[1] = encode_offset(ntdb, new_off, h->h);
			
 
				+
			
 
				+		chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
			
 
				+		if (NTDB_OFF_IS_ERR(chain)) {
			
 
				+			return NTDB_OFF_TO_ERR(chain);
			
 
				+		}
			
 
				+		ecode = ntdb_write_convert(ntdb,
			
 
				+					   chain
			
 
				+					   + sizeof(struct ntdb_used_record),
			
 
				+					   pair, sizeof(pair));
			
 
				+		if (ecode == NTDB_SUCCESS) {
			
 
				+			ecode = ntdb_write_off(ntdb,
			
 
				+					       hbucket_off(h->table, h->bucket),
			
 
				+					       chain
			
 
				+					       | (1ULL << NTDB_OFF_CHAIN_BIT));
			
 
				+		}
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* Full bucket.  Expand. */
			
 
				+	ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
			
 
				+		/* Expand in place. */
			
 
				+		uint64_t dlen = rec_data_length(&chdr);
			
 
				+
			
 
				+		ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
			
 
				+				   dlen + sizeof(new_off),
			
 
				+				   dlen + rec_extra_padding(&chdr));
			
 
				+
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return ecode;
			
 
				+		}
			
 
				+		/* find_and_lock set up h to point to last bucket. */
			
 
				+		ecode = replace_in_hash(ntdb, h, new_off);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return ecode;
			
 
				+		}
			
 
				+		ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return ecode;
			
 
				+		}
			
 
				+		/* For futureproofing, we always make the first byte of padding
			
 
				+		 * a zero. */
			
 
				+		if (rec_extra_padding(&chdr)) {
			
 
				+			ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
			
 
				+						 + dlen + sizeof(new_off),
			
 
				+						 "", 1);
			
 
				+		}
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* We need to reallocate the chain. */
			
 
				+	chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
			
 
				+		      NTDB_CHAIN_MAGIC, true);
			
 
				+	if (NTDB_OFF_IS_ERR(chain)) {
			
 
				+		return NTDB_OFF_TO_ERR(chain);
			
 
				+	}
			
 
				+
			
 
				+	/* Map both and copy across old buckets. */
			
 
				+	old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
			
 
				+			       h->table_size*sizeof(ntdb_off_t), true);
			
 
				+	if (NTDB_PTR_IS_ERR(old)) {
			
 
				+		return NTDB_PTR_ERR(old);
			
 
				+	}
			
 
				+	new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
			
 
				+				(h->table_size + 1)*sizeof(ntdb_off_t), true);
			
 
				+	if (NTDB_PTR_IS_ERR(new)) {
			
 
				+		ntdb_access_release(ntdb, old);
			
 
				+		return NTDB_PTR_ERR(new);
			
 
				+	}
			
 
				+
			
 
				+	memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
			
 
				+	new[h->bucket] = encode_offset(ntdb, new_off, h->h);
			
 
				+	ntdb_access_release(ntdb, old);
			
 
				+
			
 
				+	ecode = ntdb_access_commit(ntdb, new);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* Free the old chain. */
			
 
				+	ecode = add_free_record(ntdb, h->table,
			
 
				+				sizeof(struct ntdb_used_record)
			
 
				+				+ rec_data_length(&chdr)
			
 
				+				+ rec_extra_padding(&chdr),
			
 
				+				NTDB_LOCK_WAIT, true);
			
 
				+
			
 
				+	/* Replace top-level to point to new chain */
			
 
				+	return ntdb_write_off(ntdb,
			
 
				+			      hbucket_off(NTDB_HASH_OFFSET,
			
 
				+					  bits_from(h->h, 0, ntdb->hash_bits)),
			
 
				+			      chain | (1ULL << NTDB_OFF_CHAIN_BIT));
			
 
				+}
			
 
				+
			
 
				+/* Traverse support: returns offset of record, or 0 or -ve error. */
			
 
				+static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
			
 
				+				ntdb_off_t val,
			
 
				+				struct hash_info *h)
			
 
				+{
			
 
				+	ntdb_off_t i;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	struct ntdb_used_record chdr;
			
 
				+
			
 
				+	/* First load up chain header. */
			
 
				+	h->table = val & NTDB_OFF_MASK;
			
 
				+	ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+				   NTDB_LOG_ERROR,
			
 
				+				   "get_table:"
			
 
				+				   " corrupt record %#x at %llu",
			
 
				+				   rec_magic(&chdr),
			
 
				+				   (long long)h->table);
			
 
				+	}
			
 
				+
			
 
				+	/* Chain length is implied by data length. */
			
 
				+	h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
			
 
				+
			
 
				+	i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
			
 
				+				  h->table_size);
			
 
				+	if (NTDB_OFF_IS_ERR(i)) {
			
 
				+		return i;
			
 
				+	}
			
 
				+
			
 
				+	if (i != h->table_size) {
			
 
				+		/* Return to next bucket. */
			
 
				+		h->bucket = i + 1;
			
 
				+		val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
			
 
				+		if (NTDB_OFF_IS_ERR(val)) {
			
 
				+			return val;
			
 
				+		}
			
 
				+		return val & NTDB_OFF_MASK;
			
 
				+	}
			
 
				+
			
 
				+	/* Go back up to hash table. */
			
 
				+	h->table = NTDB_HASH_OFFSET;
			
 
				+	h->table_size = 1 << ntdb->hash_bits;
			
 
				+	h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* Keeps hash locked unless returns 0 or error. */
			
 
				+static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
			
 
				+					struct hash_info *h)
			
 
				+{
			
 
				+	ntdb_off_t val, i;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	if (h->table != NTDB_HASH_OFFSET) {
			
 
				+		/* We're in a chain. */
			
 
				+		i = bits_from(h->h, 0, ntdb->hash_bits);
			
 
				+		ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return NTDB_ERR_TO_OFF(ecode);
			
 
				+		}
			
 
				+
			
 
				+		/* We dropped lock, bucket might have moved! */
			
 
				+		val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
			
 
				+		if (NTDB_OFF_IS_ERR(val)) {
			
 
				+			goto unlock;
			
 
				+		}
			
 
				+
			
 
				+		/* We don't remove chains: there should still be one there! */
			
 
				+		if (!val || !is_chain(val)) {
			
 
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+					    NTDB_LOG_ERROR,
			
 
				+					    "iterate_hash:"
			
 
				+					    " vanished hchain %llu at %llu",
			
 
				+					    (long long)val,
			
 
				+					    (long long)i);
			
 
				+			val = NTDB_ERR_TO_OFF(ecode);
			
 
				+			goto unlock;
			
 
				+		}
			
 
				+
			
 
				+		/* Find next bucket in the chain. */
			
 
				+		val = iterate_chain(ntdb, val, h);
			
 
				+		if (NTDB_OFF_IS_ERR(val)) {
			
 
				+			goto unlock;
			
 
				+		}
			
 
				+		if (val != 0) {
			
 
				+			return val;
			
 
				+		}
			
 
				+		ntdb_unlock_hash(ntdb, i, F_RDLCK);
			
 
				+
			
 
				+		/* OK, we've reset h back to top level. */
			
 
				+	}
			
 
				+
			
 
				+	/* We do this unlocked, then re-check. */
			
 
				+	for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
			
 
				+				       h->bucket, h->table_size);
			
 
				+	     i != h->table_size;
			
 
				+	     i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
			
 
				+				       i+1, h->table_size)) {
			
 
				+		ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			return NTDB_ERR_TO_OFF(ecode);
			
 
				+		}
			
 
				+
			
 
				+		val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
			
 
				+		if (NTDB_OFF_IS_ERR(val)) {
			
 
				+			goto unlock;
			
 
				+		}
			
 
				+
			
 
				+		/* Lost race, and it's empty? */
			
 
				+		if (!val) {
			
 
				+			ntdb->stats.traverse_val_vanished++;
			
 
				+			ntdb_unlock_hash(ntdb, i, F_RDLCK);
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (!is_chain(val)) {
			
 
				+			/* So caller knows what lock to free. */
			
 
				+			h->h = i;
			
 
				+			/* Return to next bucket. */
			
 
				+			h->bucket = i + 1;
			
 
				+			val &= NTDB_OFF_MASK;
			
 
				+			return val;
			
 
				+		}
			
 
				+
			
 
				+		/* Start at beginning of chain */
			
 
				+		h->bucket = 0;
			
 
				+		h->h = i;
			
 
				+
			
 
				+		val = iterate_chain(ntdb, val, h);
			
 
				+		if (NTDB_OFF_IS_ERR(val)) {
			
 
				+			goto unlock;
			
 
				+		}
			
 
				+		if (val != 0) {
			
 
				+			return val;
			
 
				+		}
			
 
				+
			
 
				+		/* Otherwise, bucket has been set to i+1 */
			
 
				+		ntdb_unlock_hash(ntdb, i, F_RDLCK);
			
 
				+	}
			
 
				+	return 0;
			
 
				+
			
 
				+unlock:
			
 
				+	ntdb_unlock_hash(ntdb, i, F_RDLCK);
			
 
				+	return val;
			
 
				+}
			
 
				+
			
 
				+/* Return success if we find something, NTDB_ERR_NOEXIST if none. */
			
 
				+enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
			
 
				+			     struct hash_info *h,
			
 
				+			     NTDB_DATA *kbuf, size_t *dlen)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	off = lock_and_iterate_hash(ntdb, h);
			
 
				+
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	} else if (off == 0) {
			
 
				+		return NTDB_ERR_NOEXIST;
			
 
				+	}
			
 
				+
			
 
				+	/* The hash for this key is still locked. */
			
 
				+	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+	if (rec_magic(&rec) != NTDB_USED_MAGIC) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
			
 
				+				    NTDB_LOG_ERROR,
			
 
				+				    "next_in_hash:"
			
 
				+				    " corrupt record at %llu",
			
 
				+				    (long long)off);
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	kbuf->dsize = rec_key_length(&rec);
			
 
				+
			
 
				+	/* They want data as well? */
			
 
				+	if (dlen) {
			
 
				+		*dlen = rec_data_length(&rec);
			
 
				+		kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
			
 
				+					     kbuf->dsize + *dlen);
			
 
				+	} else {
			
 
				+		kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
			
 
				+					     kbuf->dsize);
			
 
				+	}
			
 
				+	if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
			
 
				+		ecode = NTDB_PTR_ERR(kbuf->dptr);
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+	ecode = NTDB_SUCCESS;
			
 
				+
			
 
				+unlock:
			
 
				+	ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
			
 
				+	return ecode;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
			
 
				+			     struct hash_info *h,
			
 
				+			     NTDB_DATA *kbuf, size_t *dlen)
			
 
				+{
			
 
				+	h->table = NTDB_HASH_OFFSET;
			
 
				+	h->table_size = 1 << ntdb->hash_bits;
			
 
				+	h->bucket = 0;
			
 
				+
			
 
				+	return next_in_hash(ntdb, h, kbuf, dlen);
			
 
				+}
			
 
				+
			
 
				+/* Even if the entry isn't in this hash bucket, you'd have to lock this
			
 
				+ * bucket to find it. */
			
 
				+static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
			
 
				+				 const NTDB_DATA *key, int ltype)
			
 
				+{
			
 
				+	uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
			
 
				+
			
 
				+	return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
			
 
				+}
			
 
				+
			
 
				+/* lock/unlock one hash chain. This is meant to be used to reduce
			
 
				+   contention - it cannot guarantee how many records will be locked */
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	return chainlock(ntdb, &key, F_WRLCK);
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
			
 
				+
			
 
				+	ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
			
 
				+					     NTDB_DATA key)
			
 
				+{
			
 
				+	return chainlock(ntdb, &key, F_RDLCK);
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
			
 
				+
			
 
				+	ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
			
 
				+}
			
--- a/ccan/ntdb/io.c
+++ b/ccan/ntdb/io.c
@@ -0,0 +1,750 @@
 
				+ /*
			
 
				+   Unix SMB/CIFS implementation.
			
 
				+
			
 
				+   trivial database library
			
 
				+
			
 
				+   Copyright (C) Andrew Tridgell              1999-2005
			
 
				+   Copyright (C) Paul `Rusty' Russell		   2000
			
 
				+   Copyright (C) Jeremy Allison			   2000-2003
			
 
				+   Copyright (C) Rusty Russell			   2010
			
 
				+
			
 
				+     ** NOTE! The following LGPL license applies to the ntdb
			
 
				+     ** library. This does NOT imply that all of Samba is released
			
 
				+     ** under the LGPL
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/likely/likely.h>
			
 
				+
			
 
				+/* Dispose of every map queued on file->old_mmaps.  Must only be called
				+ * once no direct accessors remain (asserted).  Internal databases hand
				+ * the buffer to free_fn; file-backed ones munmap() it. */
				+static void free_old_mmaps(struct ntdb_context *ntdb)
				+{
				+	struct ntdb_old_mmap *old;
				+
				+	assert(ntdb->file->direct_count == 0);
				+
				+	for (;;) {
				+		old = ntdb->file->old_mmaps;
				+		if (old == NULL)
				+			break;
				+
				+		/* Unlink before releasing anything. */
				+		ntdb->file->old_mmaps = old->next;
				+
				+		if (!(ntdb->flags & NTDB_INTERNAL))
				+			munmap(old->map_ptr, old->map_size);
				+		else
				+			ntdb->free_fn(old->map_ptr, ntdb->alloc_data);
				+
				+		ntdb->free_fn(old, ntdb->alloc_data);
				+	}
				+}
			
 
				+
			
 
				+/* Record the current map (pointer and size) at the head of
				+ * file->old_mmaps so free_old_mmaps() can release it later.  Only valid
				+ * while direct accessors exist (asserted).  Returns NTDB_ERR_OOM (logged)
				+ * if the list node cannot be allocated, else NTDB_SUCCESS. */
				+static enum NTDB_ERROR save_old_map(struct ntdb_context *ntdb)
				+{
				+	struct ntdb_old_mmap *node;
				+
				+	assert(ntdb->file->direct_count);
				+
				+	node = ntdb->alloc_fn(ntdb->file, sizeof(*node), ntdb->alloc_data);
				+	if (node == NULL)
				+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
				+				   "save_old_map alloc failed");
				+
				+	node->map_ptr = ntdb->file->map_ptr;
				+	node->map_size = ntdb->file->map_size;
				+
				+	/* Push onto the front of the list. */
				+	node->next = ntdb->file->old_mmaps;
				+	ntdb->file->old_mmaps = node;
				+
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/* Drop the current memory mapping of the database file, if any.
				+ *
				+ * Nothing is done when the context has no file descriptor (fd == -1) or
				+ * no mapping.  While direct accessors still hold pointers into the
				+ * mapping it cannot be unmapped, so it is queued via save_old_map() and
				+ * released by free_old_mmaps() once direct_count drops to zero.
				+ *
				+ * On success map_ptr is always NULL afterwards.  Previously the deferred
				+ * case left map_ptr aimed at the queued mapping; a caller that failed
				+ * before remapping would then keep using it after it had been unmapped.
				+ *
				+ * Returns NTDB_SUCCESS, or the error from save_old_map() (in which case
				+ * the mapping is left untouched). */
				+enum NTDB_ERROR ntdb_munmap(struct ntdb_context *ntdb)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	if (ntdb->file->fd == -1) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	if (!ntdb->file->map_ptr) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/* We can't unmap now if there are accessors. */
				+	if (ntdb->file->direct_count) {
				+		ecode = save_old_map(ntdb);
				+		if (ecode != NTDB_SUCCESS) {
				+			return ecode;
				+		}
				+	} else {
				+		munmap(ntdb->file->map_ptr, ntdb->file->map_size);
				+	}
				+
				+	/* Either unmapped or owned by old_mmaps now: stop using it. */
				+	ntdb->file->map_ptr = NULL;
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/* Map the whole database file (map_size bytes, MAP_SHARED) and store the
				+ * result in file->map_ptr.
				+ *
				+ * Skipped for NTDB_INTERNAL databases and, unless HAVE_INCOHERENT_MMAP is
				+ * defined, for NTDB_NOMMAP ones.  The mapping is read-only when the file
				+ * was opened O_RDONLY.  On failure map_ptr is set to NULL; that is an
				+ * NTDB_ERR_IO error under HAVE_INCOHERENT_MMAP and otherwise only a
				+ * logged warning (NTDB_SUCCESS is still returned). */
				+enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
				+{
				+	int prot;
				+	void *map;
				+
				+	if (ntdb->flags & NTDB_INTERNAL)
				+		return NTDB_SUCCESS;
				+
				+#ifndef HAVE_INCOHERENT_MMAP
				+	if (ntdb->flags & NTDB_NOMMAP)
				+		return NTDB_SUCCESS;
				+#endif
				+
				+	prot = ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
				+		? PROT_READ
				+		: (PROT_READ | PROT_WRITE);
				+
				+	/* size_t can be smaller than off_t. */
				+	if ((size_t)ntdb->file->map_size != ntdb->file->map_size) {
				+		map = MAP_FAILED;
				+	} else {
				+		map = mmap(NULL, ntdb->file->map_size, prot,
				+			   MAP_SHARED, ntdb->file->fd, 0);
				+	}
				+
				+	/*
				+	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
				+	 */
				+	if (map != MAP_FAILED) {
				+		ntdb->file->map_ptr = map;
				+		return NTDB_SUCCESS;
				+	}
				+
				+	ntdb->file->map_ptr = NULL;
				+#ifdef HAVE_INCOHERENT_MMAP
				+	/* Incoherent mmap means everyone must mmap! */
				+	return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			  "ntdb_mmap failed for size %lld (%s)",
				+			  (long long)ntdb->file->map_size,
				+			  strerror(errno));
				+#else
				+	ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
				+		   "ntdb_mmap failed for size %lld (%s)",
				+		   (long long)ntdb->file->map_size, strerror(errno));
				+	return NTDB_SUCCESS;
				+#endif
				+}
			
 
				+
			
 
				+/* check for an out of bounds access - if it is out of bounds then
				+   see if the database has been expanded by someone else and expand
				+   if necessary
				+   note that "len" is the minimum length needed for the db.
				+
				+   If probe is true, len being too large isn't a failure.
				+
				+   Steps: reject off + len wrap-around; internal databases cannot have
				+   grown, so fail (or succeed when probing); otherwise fstat() the file
				+   under the expand read lock and, if it now covers off + len, unmap,
				+   adopt the new size and remap.  Returns NTDB_SUCCESS or an error code.
				+*/
				+static enum NTDB_ERROR ntdb_normal_oob(struct ntdb_context *ntdb,
				+				       ntdb_off_t off, ntdb_len_t len,
				+				       bool probe)
				+{
				+	struct stat st;
				+	enum NTDB_ERROR ecode;
				+
				+	/* Unsigned wrap-around of off + len. */
				+	if (len + off < len) {
				+		if (probe)
				+			return NTDB_SUCCESS;
				+
				+		/* %llu needs unsigned long long arguments. */
				+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+				  "ntdb_oob off %llu len %llu wrap\n",
				+				  (unsigned long long)off,
				+				  (unsigned long long)len);
				+	}
				+
				+	if (ntdb->flags & NTDB_INTERNAL) {
				+		if (probe)
				+			return NTDB_SUCCESS;
				+
				+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			   "ntdb_oob len %lld beyond internal"
				+			   " alloc size %lld",
				+			   (long long)(off + len),
				+			   (long long)ntdb->file->map_size);
				+		return NTDB_ERR_IO;
				+	}
				+
				+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	if (fstat(ntdb->file->fd, &st) != 0) {
				+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			   "Failed to fstat file: %s", strerror(errno));
				+		ntdb_unlock_expand(ntdb, F_RDLCK);
				+		return NTDB_ERR_IO;
				+	}
				+
				+	ntdb_unlock_expand(ntdb, F_RDLCK);
				+
				+	if (st.st_size < off + len) {
				+		if (probe)
				+			return NTDB_SUCCESS;
				+
				+		/* %llu needs unsigned long long arguments. */
				+		ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			   "ntdb_oob len %llu beyond eof at %llu",
				+			   (unsigned long long)(off + len),
				+			   (unsigned long long)st.st_size);
				+		return NTDB_ERR_IO;
				+	}
				+
				+	/* Unmap, update size, remap */
				+	ecode = ntdb_munmap(ntdb);
				+	if (ecode) {
				+		return ecode;
				+	}
				+
				+	ntdb->file->map_size = st.st_size;
				+	return ntdb_mmap(ntdb);
				+}
			
 
				+
			
 
				+/* Endian conversion: we only ever deal with 8 byte quantities.
				+ *
				+ * When NTDB_CONVERT is set and buf is non-NULL, byte-swaps each 64-bit
				+ * word of buf in place; size must be a multiple of 8 (asserted).
				+ * Always returns buf. */
				+void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
				+{
				+	uint64_t *word;
				+	uint64_t remaining;
				+
				+	assert(size % 8 == 0);
				+
				+	if (!unlikely(ntdb->flags & NTDB_CONVERT) || !buf)
				+		return buf;
				+
				+	word = (uint64_t *)buf;
				+	for (remaining = size / 8; remaining != 0; remaining--) {
				+		*word = bswap_64(*word);
				+		word++;
				+	}
				+	return buf;
				+}
			
 
				+
			
 
				+/* Return first non-zero offset in offset array, or end, or -ve error. */
			
 
				+/* FIXME: Return the off? */
			
 
				+uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
			
 
				+			      ntdb_off_t base, uint64_t start, uint64_t end)
			
 
				+{
			
 
				+	uint64_t i;
			
 
				+	const uint64_t *val;
			
 
				+
			
 
				+	/* Zero vs non-zero is the same unconverted: minor optimization. */
			
 
				+	val = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
			
 
				+			      (end - start) * sizeof(ntdb_off_t), false);
			
 
				+	if (NTDB_PTR_IS_ERR(val)) {
			
 
				+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < (end - start); i++) {
			
 
				+		if (val[i])
			
 
				+			break;
			
 
				+	}
			
 
				+	ntdb_access_release(ntdb, val);
			
 
				+	return start + i;
			
 
				+}
			
 
				+
			
 
				+/* Return first zero offset in num offset array, or num, or -ve error. */
			
 
				+uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
			
 
				+			   uint64_t num)
			
 
				+{
			
 
				+	uint64_t i;
			
 
				+	const uint64_t *val;
			
 
				+
			
 
				+	/* Zero vs non-zero is the same unconverted: minor optimization. */
			
 
				+	val = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
			
 
				+	if (NTDB_PTR_IS_ERR(val)) {
			
 
				+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < num; i++) {
			
 
				+		if (!val[i])
			
 
				+			break;
			
 
				+	}
			
 
				+	ntdb_access_release(ntdb, val);
			
 
				+	return i;
			
 
				+}
			
 
				+
			
 
				+/* Zero len bytes of the database starting at off.
				+ *
				+ * Uses memset() through a direct pointer when io->direct() supplies one;
				+ * otherwise writes a zeroed 8192-byte buffer through io->twrite() in
				+ * chunks.  The database must not be read-only (asserted).  Returns the
				+ * first error met, or NTDB_SUCCESS. */
				+enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
				+{
				+	char zeroes[8192] = { 0 };
				+	void *direct = ntdb->io->direct(ntdb, off, len, true);
				+	enum NTDB_ERROR ecode = NTDB_SUCCESS;
				+	unsigned chunk;
				+
				+	assert(!(ntdb->flags & NTDB_RDONLY));
				+
				+	if (NTDB_PTR_IS_ERR(direct))
				+		return NTDB_PTR_ERR(direct);
				+
				+	if (direct) {
				+		memset(direct, 0, len);
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/* No direct access: push zeroes out a buffer at a time. */
				+	for (; len != 0; len -= chunk, off += chunk) {
				+		chunk = len < sizeof(zeroes) ? len : sizeof(zeroes);
				+		ecode = ntdb->io->twrite(ntdb, off, zeroes, chunk);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+	}
				+	return ecode;
				+}
			
 
				+
			
 
				+/* write a lump of data at a specified offset
				+ *
				+ * Fails with NTDB_ERR_RDONLY on a read-only database, and with whatever
				+ * ntdb_oob() reports if [off, off + len) is out of bounds.  Copies into
				+ * the map when one exists; otherwise uses pwrite(), or returns
				+ * NTDB_ERR_IO when HAVE_INCOHERENT_MMAP is defined. */
				+static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
				+				const void *buf, ntdb_len_t len)
				+{
				+	enum NTDB_ERROR ecode;
				+#ifndef HAVE_INCOHERENT_MMAP
				+	ssize_t written;
				+#endif
				+
				+	if (ntdb->flags & NTDB_RDONLY)
				+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
				+				  "Write to read-only database");
				+
				+	ecode = ntdb_oob(ntdb, off, len, false);
				+	if (ecode != NTDB_SUCCESS)
				+		return ecode;
				+
				+	if (ntdb->file->map_ptr) {
				+		memcpy((char *)ntdb->file->map_ptr + off, buf, len);
				+		return NTDB_SUCCESS;
				+	}
				+
				+#ifdef HAVE_INCOHERENT_MMAP
				+	return NTDB_ERR_IO;
				+#else
				+	written = pwrite(ntdb->file->fd, buf, len, off);
				+	if (written == len)
				+		return NTDB_SUCCESS;
				+
				+	/* This shouldn't happen: we avoid sparse files. */
				+	if (written >= 0)
				+		errno = ENOSPC;
				+
				+	return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			  "ntdb_write: %zi at %zu len=%zu (%s)",
				+			  written, (size_t)off, (size_t)len,
				+			  strerror(errno));
				+#endif
				+}
			
 
				+
			
 
				+/* read a lump of data at a specified offset
				+ *
				+ * Fails with whatever ntdb_oob() reports if [off, off + len) is out of
				+ * bounds.  Copies from the map when one exists; otherwise uses pread(),
				+ * or returns NTDB_ERR_IO when HAVE_INCOHERENT_MMAP is defined. */
				+static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
				+			       void *buf, ntdb_len_t len)
				+{
				+	enum NTDB_ERROR ecode;
				+#ifndef HAVE_INCOHERENT_MMAP
				+	ssize_t got;
				+#endif
				+
				+	ecode = ntdb_oob(ntdb, off, len, false);
				+	if (ecode != NTDB_SUCCESS)
				+		return ecode;
				+
				+	if (ntdb->file->map_ptr) {
				+		memcpy(buf, (char *)ntdb->file->map_ptr + off, len);
				+		return NTDB_SUCCESS;
				+	}
				+
				+#ifdef HAVE_INCOHERENT_MMAP
				+	return NTDB_ERR_IO;
				+#else
				+	got = pread(ntdb->file->fd, buf, len, off);
				+	if (got == len)
				+		return NTDB_SUCCESS;
				+
				+	return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+			  "ntdb_read failed with %zi at %zu "
				+			  "len=%zu (%s) map_size=%zu",
				+			  got, (size_t)off, (size_t)len,
				+			  strerror(errno),
				+			  (size_t)ntdb->file->map_size);
				+#endif
				+}
			
 
				+
			
 
				+/* Write len bytes of rec at off through io->twrite().  With NTDB_CONVERT
				+ * set, a temporary copy is byte-swapped by ntdb_convert() and written
				+ * instead, leaving rec untouched.  Returns NTDB_ERR_OOM (logged) if the
				+ * copy cannot be allocated, else twrite()'s result. */
				+enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
				+				 const void *rec, size_t len)
				+{
				+	enum NTDB_ERROR ecode;
				+	void *swapped;
				+
				+	/* Common case: no conversion, write the caller's buffer. */
				+	if (!unlikely(ntdb->flags & NTDB_CONVERT))
				+		return ntdb->io->twrite(ntdb, off, rec, len);
				+
				+	swapped = ntdb->alloc_fn(ntdb, len, ntdb->alloc_data);
				+	if (swapped == NULL)
				+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
				+				  "ntdb_write: no memory converting"
				+				  " %zu bytes", len);
				+
				+	memcpy(swapped, rec, len);
				+	ntdb_convert(ntdb, swapped, len);
				+	ecode = ntdb->io->twrite(ntdb, off, swapped, len);
				+	ntdb->free_fn(swapped, ntdb->alloc_data);
				+
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Read len bytes at off into rec through io->tread(), then run
				+ * ntdb_convert() over rec (also when the read failed).  Returns
				+ * tread()'s result. */
				+enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
				+				void *rec, size_t len)
				+{
				+	enum NTDB_ERROR status;
				+
				+	status = ntdb->io->tread(ntdb, off, rec, len);
				+	ntdb_convert(ntdb, rec, len);
				+
				+	return status;
				+}
			
 
				+
			
 
				+/* Allocate prefix + len bytes and read len bytes from offset into the
				+ * buffer just past the prefix.  Returns the start of the allocation
				+ * (prefix included), or an NTDB_ERR_PTR() encoded error; the buffer is
				+ * freed again if the read fails. */
				+static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
				+			     ntdb_len_t len, unsigned int prefix)
				+{
				+	const ntdb_len_t total = prefix + len;
				+	unsigned char *mem;
				+	enum NTDB_ERROR ecode;
				+
				+	/* some systems don't like zero length malloc */
				+	mem = ntdb->alloc_fn(ntdb, total ? total : 1, ntdb->alloc_data);
				+	if (mem == NULL) {
				+		ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
				+			   "ntdb_alloc_read alloc failed len=%zu",
				+			   (size_t)total);
				+		return NTDB_ERR_PTR(NTDB_ERR_OOM);
				+	}
				+
				+	ecode = ntdb->io->tread(ntdb, offset, mem + prefix, len);
				+	if (likely(ecode == NTDB_SUCCESS))
				+		return mem;
				+
				+	ntdb->free_fn(mem, ntdb->alloc_data);
				+	return NTDB_ERR_PTR(ecode);
				+}
			
 
				+
			
 
				+/* read a lump of data, allocating the space for it
				+ *
				+ * Same as _ntdb_alloc_read() with no prefix: the result is either the
				+ * buffer holding the data or an NTDB_ERR_PTR() encoded error. */
				+void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
				+{
				+	void *data;
				+
				+	data = _ntdb_alloc_read(ntdb, offset, len, 0);
				+	return data;
				+}
			
 
				+
			
 
				+/* Write len bytes to the file starting at off by repeating the
				+ * size-byte pattern in buf with pwrite().  A short write is reported as
				+ * ENOSPC.  Returns NTDB_ERR_IO (logged) on the first failed write, else
				+ * NTDB_SUCCESS. */
				+static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
				+			   const void *buf, size_t size,
				+			   ntdb_off_t off, ntdb_len_t len)
				+{
				+	size_t chunk;
				+	ssize_t written;
				+
				+	for (; len != 0; len -= chunk, off += chunk) {
				+		chunk = len > size ? size : len;
				+		written = pwrite(ntdb->file->fd, buf, chunk, off);
				+		if (written == chunk)
				+			continue;
				+
				+		if (written >= 0)
				+			errno = ENOSPC;
				+
				+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+				  "fill failed:"
				+				  " %zi at %zu len=%zu (%s)",
				+				  written, (size_t)off, (size_t)len,
				+				  strerror(errno));
				+	}
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/* expand a file.  we prefer to use ftruncate, as that is what posix
				+  says to use for mmap expansion
				+
				+  Grows the database by addition bytes; map_size + addition must be a
				+  multiple of NTDB_PGSIZE (asserted).  Read-only databases are refused
				+  with NTDB_ERR_RDONLY.
				+
				+  Internal databases get a bigger buffer: through expand_fn when nobody
				+  holds a direct pointer, otherwise by allocating a new buffer, queuing
				+  the old one with save_old_map() and copying the contents across.
				+
				+  File-backed databases are unmapped, extended with ftruncate() (best
				+  effort), filled with 0x43 bytes so the file is not sparse, and
				+  remapped.  map_size is only updated once the expansion has succeeded. */
				+static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
				+				      ntdb_len_t addition)
				+{
				+	char buf[8192];
				+	enum NTDB_ERROR ecode;
				+
				+	assert((ntdb->file->map_size + addition) % NTDB_PGSIZE == 0);
				+	if (ntdb->flags & NTDB_RDONLY) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
				+				  "Expand on read-only database");
				+	}
				+
				+	if (ntdb->flags & NTDB_INTERNAL) {
				+		char *new;
				+
				+		/* Can't free it if we have direct accesses. */
				+		if (ntdb->file->direct_count) {
				+			/* Allocate the replacement BEFORE queuing the
				+			 * current buffer: if the allocation fails the
				+			 * live buffer must not be left on old_mmaps,
				+			 * where free_old_mmaps() would free it while
				+			 * map_ptr still points at it. */
				+			new = ntdb->alloc_fn(ntdb->file,
				+					     ntdb->file->map_size + addition,
				+					     ntdb->alloc_data);
				+			if (new) {
				+				ecode = save_old_map(ntdb);
				+				if (ecode) {
				+					ntdb->free_fn(new, ntdb->alloc_data);
				+					return ecode;
				+				}
				+				memcpy(new, ntdb->file->map_ptr,
				+				       ntdb->file->map_size);
				+			}
				+		} else {
				+			new = ntdb->expand_fn(ntdb->file->map_ptr,
				+					      ntdb->file->map_size + addition,
				+					      ntdb->alloc_data);
				+		}
				+		if (!new) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
				+					  "No memory to expand database");
				+		}
				+		ntdb->file->map_ptr = new;
				+		ntdb->file->map_size += addition;
				+		return NTDB_SUCCESS;
				+	} else {
				+		/* Unmap before trying to write; old NTDB claimed OpenBSD had
				+		 * problem with this otherwise. */
				+		ecode = ntdb_munmap(ntdb);
				+		if (ecode) {
				+			return ecode;
				+		}
				+
				+		/* If this fails, we try to fill anyway. */
				+		if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
				+			;
				+
				+		/* now fill the file with something. This ensures that the
				+		   file isn't sparse, which would be very bad if we ran out of
				+		   disk. This must be done with write, not via mmap */
				+		memset(buf, 0x43, sizeof(buf));
				+		ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
				+			     addition);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+		ntdb->file->map_size += addition;
				+		return ntdb_mmap(ntdb);
				+	}
				+}
			
 
				+
			
 
				+/* Obtain read access to len bytes at off.
				+ *
				+ * Without NTDB_CONVERT a direct pointer from io->direct() is returned
				+ * when available and direct_count is bumped.  Otherwise the data is read
				+ * into a fresh buffer headed by a struct ntdb_access_hdr that is linked
				+ * onto ntdb->access, and converted in place if convert is true.  Returns
				+ * the data pointer or an NTDB_ERR_PTR() encoded error; release with
				+ * ntdb_access_release(). */
				+const void *ntdb_access_read(struct ntdb_context *ntdb,
				+			    ntdb_off_t off, ntdb_len_t len, bool convert)
				+{
				+	struct ntdb_access_hdr *hdr;
				+	void *data;
				+
				+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
				+		data = ntdb->io->direct(ntdb, off, len, false);
				+		if (NTDB_PTR_IS_ERR(data))
				+			return data;
				+
				+		if (data) {
				+			/* Pointer into the map: count the accessor. */
				+			ntdb->file->direct_count++;
				+			return data;
				+		}
				+	}
				+
				+	/* No direct pointer: fall back to a private copy. */
				+	hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
				+	if (NTDB_PTR_IS_ERR(hdr))
				+		return hdr;
				+
				+	hdr->next = ntdb->access;
				+	ntdb->access = hdr;
				+
				+	data = hdr + 1;
				+	if (convert)
				+		ntdb_convert(ntdb, data, len);
				+
				+	return data;
				+}
			
 
				+
			
 
				+/* Obtain write access to len bytes at off.
				+ *
				+ * Refused with NTDB_ERR_RDONLY (logged) on a read-only database.
				+ * Without NTDB_CONVERT a direct pointer from io->direct() is returned
				+ * when available and direct_count is bumped.  Otherwise the current
				+ * contents are read into a buffer headed by a struct ntdb_access_hdr
				+ * (recording off, len and convert) linked onto ntdb->access, and
				+ * converted in place if convert is true.  Returns the data pointer or an
				+ * NTDB_ERR_PTR() encoded error. */
				+void *ntdb_access_write(struct ntdb_context *ntdb,
				+		       ntdb_off_t off, ntdb_len_t len, bool convert)
				+{
				+	struct ntdb_access_hdr *hdr;
				+	void *data;
				+
				+	if (ntdb->flags & NTDB_RDONLY) {
				+		ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
				+			   "Write to read-only database");
				+		return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
				+	}
				+
				+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
				+		data = ntdb->io->direct(ntdb, off, len, true);
				+		if (NTDB_PTR_IS_ERR(data))
				+			return data;
				+
				+		if (data) {
				+			/* Pointer into the map: count the accessor. */
				+			ntdb->file->direct_count++;
				+			return data;
				+		}
				+	}
				+
				+	/* No direct pointer: work on a private copy. */
				+	hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
				+	if (NTDB_PTR_IS_ERR(hdr))
				+		return hdr;
				+
				+	hdr->off = off;
				+	hdr->len = len;
				+	hdr->convert = convert;
				+	hdr->next = ntdb->access;
				+	ntdb->access = hdr;
				+
				+	data = hdr + 1;
				+	if (convert)
				+		ntdb_convert(ntdb, data, len);
				+
				+	return data;
				+}
			
 
				+
			
 
				+/* Look on ntdb->access for the header whose data area (the address just
				+ * past the header) is p.  Returns the address of the link that points
				+ * at that header, so the caller can unlink it, or NULL if p is not
				+ * there. */
				+static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
				+{
				+	struct ntdb_access_hdr **link = &ntdb->access;
				+
				+	while (*link) {
				+		if (*link + 1 == p)
				+			return link;
				+		link = &(*link)->next;
				+	}
				+	return NULL;
				+}
			
 
				+
			
 
				+/* Finish with a pointer from ntdb_access_read()/ntdb_access_write()
				+ * without writing anything back.  A buffer found on ntdb->access is
				+ * unlinked and freed; anything else is taken to be a direct pointer, so
				+ * direct_count is dropped and, when it reaches zero, the queued old maps
				+ * are released. */
				+void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
				+{
				+	struct ntdb_access_hdr **link = find_hdr(ntdb, p);
				+	struct ntdb_access_hdr *entry;
				+
				+	if (!link) {
				+		if (--ntdb->file->direct_count == 0)
				+			free_old_mmaps(ntdb);
				+		return;
				+	}
				+
				+	entry = *link;
				+	*link = entry->next;
				+	ntdb->free_fn(entry, ntdb->alloc_data);
				+}
			
 
				+
			
 
				+/* Finish with a pointer from ntdb_access_write(), storing the changes.
				+ * A buffer found on ntdb->access is written back at its recorded offset
				+ * (converted if it was requested that way), then unlinked and freed;
				+ * the write's result is returned.  Anything else is taken to be a direct
				+ * pointer: direct_count is dropped (releasing queued old maps at zero)
				+ * and NTDB_SUCCESS is returned. */
				+enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
				+{
				+	struct ntdb_access_hdr **link = find_hdr(ntdb, p);
				+	struct ntdb_access_hdr *entry;
				+	enum NTDB_ERROR ecode;
				+
				+	if (!link) {
				+		if (--ntdb->file->direct_count == 0)
				+			free_old_mmaps(ntdb);
				+		return NTDB_SUCCESS;
				+	}
				+
				+	entry = *link;
				+	ecode = entry->convert
				+		? ntdb_write_convert(ntdb, entry->off, p, entry->len)
				+		: ntdb_write(ntdb, entry->off, p, entry->len);
				+
				+	/* The buffer is dropped whether or not the write worked. */
				+	*link = entry->next;
				+	ntdb->free_fn(entry, ntdb->alloc_data);
				+
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Return a pointer straight into the map for [off, off + len).
				+ * Yields NULL when there is no map, or an NTDB_ERR_PTR() encoded error
				+ * when ntdb_oob() rejects the range.  write_mode is not consulted
				+ * here. */
				+static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
				+			bool write_mode)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	if (unlikely(!ntdb->file->map_ptr))
				+		return NULL;
				+
				+	ecode = ntdb_oob(ntdb, off, len, false);
				+	if (likely(ecode == NTDB_SUCCESS)) {
				+		/* Read map_ptr only now, after the bounds check. */
				+		return (char *)ntdb->file->map_ptr + off;
				+	}
				+	return NTDB_ERR_PTR(ecode);
				+}
			
 
				+
			
 
				+/* Read one ntdb_off_t at off without endian conversion: through a
				+ * direct pointer when ntdb_direct() provides one, else via ntdb_read().
				+ * Errors come back encoded with NTDB_ERR_TO_OFF(). */
				+static ntdb_off_t ntdb_read_normal_off(struct ntdb_context *ntdb,
				+				       ntdb_off_t off)
				+{
				+	ntdb_off_t value;
				+	ntdb_off_t *direct;
				+	enum NTDB_ERROR ecode;
				+
				+	direct = ntdb_direct(ntdb, off, sizeof(ntdb_off_t), false);
				+	if (NTDB_PTR_IS_ERR(direct))
				+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(direct));
				+
				+	if (unlikely(!direct)) {
				+		/* No map: go through the ordinary read path. */
				+		ecode = ntdb_read(ntdb, off, &value, sizeof(value));
				+		if (ecode != NTDB_SUCCESS)
				+			return NTDB_ERR_TO_OFF(ecode);
				+		return value;
				+	}
				+
				+	return *direct;
				+}
			
 
				+
			
 
				+/* Read one ntdb_off_t at off through ntdb_read_convert().  Errors come
				+ * back encoded with NTDB_ERR_TO_OFF(). */
				+static ntdb_off_t ntdb_read_convert_off(struct ntdb_context *ntdb,
				+					ntdb_off_t off)
				+{
				+	ntdb_off_t value;
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_read_convert(ntdb, off, &value, sizeof(value));
				+	if (ecode == NTDB_SUCCESS)
				+		return value;
				+
				+	return NTDB_ERR_TO_OFF(ecode);
				+}
			
 
				+
			
 
				+/* Store val at off without endian conversion: through a direct pointer
				+ * when ntdb_direct() provides one, else via ntdb_write(). */
				+static enum NTDB_ERROR ntdb_write_normal_off(struct ntdb_context *ntdb,
				+					     ntdb_off_t off, ntdb_off_t val)
				+{
				+	ntdb_off_t *direct;
				+
				+	direct = ntdb_direct(ntdb, off, sizeof(ntdb_off_t), true);
				+	if (NTDB_PTR_IS_ERR(direct))
				+		return NTDB_PTR_ERR(direct);
				+
				+	if (unlikely(!direct)) {
				+		/* No map: go through the ordinary write path. */
				+		return ntdb_write(ntdb, off, &val, sizeof(val));
				+	}
				+
				+	*direct = val;
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/* Store val at off through ntdb_write_convert(). */
				+static enum NTDB_ERROR ntdb_write_convert_off(struct ntdb_context *ntdb,
				+					      ntdb_off_t off, ntdb_off_t val)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_write_convert(ntdb, off, &val, sizeof(val));
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Bump the sequence number kept in the header's seqnum field, wrapping
				+ * it so it never becomes negative.
				+ *
				+ * Without NTDB_CONVERT the field is updated in place through a direct
				+ * pointer when one is available; otherwise it is read, incremented and
				+ * written back with ntdb_read_off()/ntdb_write_off().  The function
				+ * returns nothing, so failures simply leave the number unchanged. */
				+void ntdb_inc_seqnum(struct ntdb_context *ntdb)
				+{
				+	ntdb_off_t seq;
				+
				+	if (likely(!(ntdb->flags & NTDB_CONVERT))) {
				+		int64_t *direct;
				+
				+		direct = ntdb->io->direct(ntdb,
				+					 offsetof(struct ntdb_header, seqnum),
				+					 sizeof(*direct), true);
				+		/* direct() can hand back an encoded error as well as
				+		 * NULL; that must not be dereferenced. */
				+		if (unlikely(NTDB_PTR_IS_ERR(direct)))
				+			return;
				+		if (likely(direct)) {
				+			/* Don't let it go negative, even briefly.  The
				+			 * INT64_MAX test comes first so the addition
				+			 * can never overflow; the whole comparison sits
				+			 * inside unlikely() (it used to be applied to
				+			 * unlikely()'s 0/1 result and never fired). */
				+			if (unlikely(*direct == INT64_MAX ||
				+				     *direct + 1 < 0))
				+				*direct = 0;
				+			(*direct)++;
				+			return;
				+		}
				+	}
				+
				+	seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
				+	if (!NTDB_OFF_IS_ERR(seq)) {
				+		seq++;
				+		if (unlikely((int64_t)seq < 0))
				+			seq = 0;
				+		ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
				+	}
				+}
			
 
				+
			
 
				+static const struct ntdb_methods io_methods = {	/* table installed by ntdb_io_init() when NTDB_CONVERT is clear */
				+	ntdb_read,	/* positional initializer: order must match struct ntdb_methods (declared elsewhere) */
				+	ntdb_write,
				+	ntdb_normal_oob,
				+	ntdb_expand_file,
				+	ntdb_direct,
				+	ntdb_read_normal_off,	/* offset read without endian conversion */
				+	ntdb_write_normal_off,	/* offset write without endian conversion */
				+};
			
 
				+
			
 
				+static const struct ntdb_methods io_convert_methods = {	/* table installed by ntdb_io_init() when NTDB_CONVERT is set */
				+	ntdb_read,	/* positional initializer: order must match struct ntdb_methods (declared elsewhere) */
				+	ntdb_write,
				+	ntdb_normal_oob,
				+	ntdb_expand_file,
				+	ntdb_direct,
				+	ntdb_read_convert_off,	/* offset read via ntdb_read_convert() */
				+	ntdb_write_convert_off,	/* offset write via ntdb_write_convert() */
				+};
			
 
				+
			
 
				+/*
				+  initialise the default methods table: io_convert_methods when
				+  NTDB_CONVERT is set in flags, io_methods otherwise
				+*/
				+void ntdb_io_init(struct ntdb_context *ntdb)
				+{
				+	ntdb->io = (ntdb->flags & NTDB_CONVERT) ? &io_convert_methods
				+						: &io_methods;
				+}
			
--- a/ccan/ntdb/lock.c
+++ b/ccan/ntdb/lock.c
@@ -0,0 +1,886 @@
 
				+ /*
			
 
				+   Unix SMB/CIFS implementation.
			
 
				+
			
 
				+   trivial database library
			
 
				+
			
 
				+   Copyright (C) Andrew Tridgell              1999-2005
			
 
				+   Copyright (C) Paul `Rusty' Russell		   2000
			
 
				+   Copyright (C) Jeremy Allison			   2000-2003
			
 
				+
			
 
				+     ** NOTE! The following LGPL license applies to the ntdb
			
 
				+     ** library. This does NOT imply that all of Samba is released
			
 
				+     ** under the LGPL
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+
			
 
				+#include "private.h"
			
 
				+#include <ccan/build_assert/build_assert.h>
			
 
				+
			
 
				+/*
				+ * Log that the lock wanted by 'call' is already held through a different
				+ * ntdb context in this process.  There are no threads to wait on, so
				+ * this is reported as a failure; the result of ntdb_logerr() is passed
				+ * back to the caller.
				+ */
				+enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+			   "%s: lock owned by another ntdb in this process.",
				+			   call);
				+	return ecode;
				+}
			
 
				+
			
 
				+/*
				+ * After a fork() the locks recorded for the file are no longer really
				+ * ours.  Returns true when no locks are recorded at all, or when the
				+ * recorded locker pid is the current process; otherwise returns false,
				+ * logging the mismatch first if 'log' is set.  'call' names the caller
				+ * in the log message.
				+ */
				+bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
				+{
				+	const bool holds_locks = ntdb->file->allrecord_lock.count != 0
				+		|| ntdb->file->num_lockrecs != 0;
				+
				+	/* Nothing held, or held by this very pid: no problem. */
				+	if (!holds_locks || ntdb->file->locker == getpid())
				+		return true;
				+
				+	if (!log)
				+		return false;
				+
				+	ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+		    "%s: fork() detected after lock acquisition!"
				+		    " (%u vs %u)", call,
				+		    (unsigned int)ntdb->file->locker,
				+		    (unsigned int)getpid());
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * fcntl()-based lock routine taking the same arguments that lock()
				+ * hands to ntdb->lock_fn.  Applies a lock of type 'rw' to 'len' bytes at
				+ * 'off', with F_SETLKW when waitflag is true and F_SETLK otherwise.  A
				+ * call that fails with EINTR is retried.  The last argument is ignored.
				+ * Returns the result of fcntl().
				+ */
				+int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
				+		   void *unused)
				+{
				+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
				+	struct flock fl;
				+	int ret;
				+
				+	for (;;) {
				+		fl.l_type = rw;
				+		fl.l_whence = SEEK_SET;
				+		fl.l_start = off;
				+		fl.l_len = len;
				+
				+		ret = fcntl(fd, cmd, &fl);
				+		if (ret == 0 || errno != EINTR)
				+			break;
				+	}
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * fcntl()-based unlock routine taking the same arguments that unlock()
				+ * hands to ntdb->unlock_fn.  Releases 'len' bytes at 'off' with F_UNLCK
				+ * via F_SETLKW, retrying while the call fails with EINTR.  'rw' and the
				+ * last argument are not used.  Returns the result of fcntl().
				+ */
				+int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
				+{
				+	struct flock fl;
				+	int ret;
				+
				+	for (;;) {
				+		fl.l_type = F_UNLCK;
				+		fl.l_whence = SEEK_SET;
				+		fl.l_start = off;
				+		fl.l_len = len;
				+
				+		ret = fcntl(fd, F_SETLKW, &fl);
				+		if (ret == 0 || errno != EINTR)
				+			break;
				+	}
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * Lowest-level lock call: records the locking pid when this is the first
				+ * lock taken on the file, updates the lock statistics and invokes the
				+ * context's lock_fn callback.  Returns the callback's result.
				+ */
				+static int lock(struct ntdb_context *ntdb,
				+		      int rw, off_t off, off_t len, bool waitflag)
				+{
				+	const bool first_lock = ntdb->file->allrecord_lock.count == 0
				+		&& ntdb->file->num_lockrecs == 0;
				+	int ret;
				+
				+	/* Remember who took the first lock; check_lock_pid() compares
				+	 * against this later. */
				+	if (first_lock)
				+		ntdb->file->locker = getpid();
				+
				+	ntdb->stats.lock_lowlevel++;
				+	ret = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
				+			   ntdb->lock_data);
				+	if (waitflag)
				+		return ret;
				+
				+	/* Non-blocking attempt: count it, and count it again if it failed. */
				+	ntdb->stats.lock_nonblock++;
				+	if (ret != 0)
				+		ntdb->stats.lock_nonblock_fail++;
				+	return ret;
				+}
			
 
				+/* Lowest-level unlock call: hands the request straight to the context's
				+ * unlock_fn callback and returns its result.  Everything between #if 0
				+ * and #endif is compiled out. */
				+static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
				+{
				+#if 0 /* Check they matched up locks and unlocks correctly: disabled
				+       * debugging aid which scans /proc/locks for a POSIX lock of
				+       * this process starting at 'off' and aborts if its length or
				+       * type disagrees with the arguments, or if none is found. */
				+	char line[80];
				+	FILE *locks;
				+	bool found = false;
				+
				+	locks = fopen("/proc/locks", "r");
				+
				+	while (fgets(line, 80, locks)) {
				+		char *p;
				+		int type, start, l;
				+
				+		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
				+		p = strchr(line, ':') + 1;
				+		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
				+			continue;
				+		p += strlen(" FLOCK  ADVISORY  ");
				+		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
				+			type = F_RDLCK;
				+		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
				+			type = F_WRLCK;
				+		else
				+			abort();
				+		p += 6;
				+		if (atoi(p) != getpid())
				+			continue;
				+		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
				+		start = atoi(p);
				+		p = strchr(p, ' ') + 1;
				+		if (strncmp(p, "EOF", 3) == 0)
				+			l = 0;
				+		else
				+			l = atoi(p) - start + 1;
				+
				+		if (off == start) {
				+			if (len != l) {
				+				fprintf(stderr, "Len %u should be %u: %s",
				+					(int)len, l, line);
				+				abort();
				+			}
				+			if (type != rw) {
				+				fprintf(stderr, "Type %s wrong: %s",
				+					rw == F_RDLCK ? "READ" : "WRITE", line);
				+				abort();
				+			}
				+			found = true;
				+			break;
				+		}
				+	}
				+
				+	if (!found) {
				+		fprintf(stderr, "Unlock on %u@%u not found!",
				+			(int)off, (int)len);
				+		abort();
				+	}
				+
				+	fclose(locks);
				+#endif
				+
				+	return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
				+}
			
 
				+
			
 
				+/*
				+ * Byte-range locking primitive: lock 'len' bytes at 'offset' with a lock
				+ * of type rw_type.  A len of zero means "lock to end of file".
				+ *
				+ * Returns NTDB_SUCCESS when the lock was taken, and also (without
				+ * locking anything) when NTDB_NOLOCK is set.  A write lock on an
				+ * NTDB_RDONLY context and an offset+len that does not fit in size_t are
				+ * logged and rejected before any lock call.  If the lock call itself
				+ * fails, NTDB_ERR_LOCK is returned; that failure is logged unless
				+ * NTDB_LOCK_PROBE was given or errno is EAGAIN/EINTR.
				+ */
				+static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
				+				 int rw_type, ntdb_off_t offset, ntdb_off_t len,
				+				 enum ntdb_lock_flags flags)
				+{
				+	int ret;
				+
				+	if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
				+				  "Write lock attempted on read-only database");
				+	}
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/* A 32 bit system cannot open a 64-bit file, but it could have
				+	 * expanded since then: check here. */
				+	if ((size_t)(offset + len) != offset + len) {
				+		/* %llu takes an unsigned long long argument. */
				+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				+				  "ntdb_brlock: lock on giant offset %llu",
				+				  (unsigned long long)(offset + len));
				+	}
				+
				+	ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
				+	if (ret != 0) {
				+		/* Generic lock error. errno set by fcntl.
				+		 * EAGAIN is an expected return from non-blocking
				+		 * locks. */
				+		if (!(flags & NTDB_LOCK_PROBE)
				+		    && (errno != EAGAIN && errno != EINTR)) {
				+			ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				   "ntdb_brlock failed (fd=%d) at"
				+				   " offset %zu rw_type=%d flags=%d len=%zu:"
				+				   " %s",
				+				   ntdb->file->fd, (size_t)offset, rw_type,
				+				   flags, (size_t)len, strerror(errno));
				+		}
				+		return NTDB_ERR_LOCK;
				+	}
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Release the byte range [offset, offset + len) previously locked with
				+ * ntdb_brlock().  Does nothing and succeeds under NTDB_NOLOCK; refuses
				+ * (silently) with NTDB_ERR_LOCK when check_lock_pid() says the locks are
				+ * not ours; logs and returns the logged error if the unlock call
				+ * returns -1.
				+ */
				+static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
				+				   int rw_type, ntdb_off_t offset, size_t len)
				+{
				+	int ret;
				+
				+	if (ntdb->flags & NTDB_NOLOCK)
				+		return NTDB_SUCCESS;
				+
				+	if (!check_lock_pid(ntdb, "ntdb_brunlock", false)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+
				+	ret = unlock(ntdb, rw_type, offset, len);
				+	if (ret != -1)
				+		return NTDB_SUCCESS;
				+
				+	return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+			  "ntdb_brunlock failed (fd=%d) at offset %zu"
				+			  " rw_type=%d len=%zu: %s",
				+			  ntdb->file->fd, (size_t)offset, rw_type,
				+			  (size_t)len, strerror(errno));
				+}
			
 
				+
			
 
				+/*
				+ * Turn the (upgradable) allrecord read lock into a write lock by write
				+ * locking from 'start' to end of file.
				+ *
				+ * Some OSes (such as solaris) have overly conservative deadlock
				+ * detection and report EDEADLK even though progress is possible, so on
				+ * EDEADLK we nap briefly and retry, up to 1000 attempts in total.
				+ */
				+enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
				+{
				+	int attempts_left;
				+
				+	if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+
				+	/* Only a singly-held, still-upgradable lock owned by us qualifies. */
				+	if (ntdb->file->allrecord_lock.count != 1)
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_allrecord_upgrade failed:"
				+				  " count %u too high",
				+				  ntdb->file->allrecord_lock.count);
				+
				+	if (ntdb->file->allrecord_lock.off != 1)
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_allrecord_upgrade failed:"
				+				  " already upgraded?");
				+
				+	if (ntdb->file->allrecord_lock.owner != ntdb)
				+		return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
				+
				+	for (attempts_left = 1000; attempts_left > 0; attempts_left--) {
				+		struct timeval nap;
				+		enum NTDB_ERROR ecode;
				+
				+		ecode = ntdb_brlock(ntdb, F_WRLCK, start, 0,
				+				    NTDB_LOCK_WAIT|NTDB_LOCK_PROBE);
				+		if (ecode == NTDB_SUCCESS) {
				+			ntdb->file->allrecord_lock.ltype = F_WRLCK;
				+			ntdb->file->allrecord_lock.off = 0;
				+			return NTDB_SUCCESS;
				+		}
				+		if (errno != EDEADLK)
				+			break;
				+
				+		/* sleep for as short a time as we can - more portable than usleep() */
				+		nap.tv_sec = 0;
				+		nap.tv_usec = 1;
				+		select(0, NULL, NULL, NULL, &nap);
				+	}
				+
				+	if (!(errno == EAGAIN || errno == EINTR)) {
				+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+			   "ntdb_allrecord_upgrade failed");
				+	}
				+	return NTDB_ERR_LOCK;
				+}
			
 
				+
			
 
				+/*
				+ * Look up the lock record for 'offset' in the file's lockrecs array.
				+ * When 'owner' is non-NULL the record only counts if it belongs to that
				+ * context.  Returns the record, or NULL if there is none (or it has a
				+ * different owner).
				+ */
				+static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
				+				      const struct ntdb_context *owner)
				+{
				+	unsigned int idx;
				+
				+	for (idx = 0; idx < ntdb->file->num_lockrecs; idx++) {
				+		struct ntdb_lock *rec = &ntdb->file->lockrecs[idx];
				+
				+		if (rec->off != offset)
				+			continue;
				+
				+		/* First record at this offset decides the outcome. */
				+		if (owner == NULL || rec->owner == owner)
				+			return rec;
				+		return NULL;
				+	}
				+	return NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Take the allrecord write lock and the open lock (both blocking and
				+ * without recovery checks), run ntdb_transaction_recover(), then release
				+ * both again.  Returns the first failure, or the result of the recovery.
				+ */
				+enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+
				+	ecode = ntdb_allrecord_lock(ntdb, F_WRLCK,
				+				    NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK, false);
				+	if (ecode != NTDB_SUCCESS)
				+		return ecode;
				+
				+	ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
				+	if (ecode == NTDB_SUCCESS) {
				+		/* Both locks held: recover, then drop the open lock. */
				+		ecode = ntdb_transaction_recover(ntdb);
				+		ntdb_unlock_open(ntdb, F_WRLCK);
				+	}
				+
				+	/* The allrecord lock is released on every path from here. */
				+	ntdb_allrecord_unlock(ntdb, F_WRLCK);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* lock an offset in the database.
				+ *
				+ * Takes a one-byte lock of type 'ltype' at 'offset' and records it in
				+ * ntdb->file->lockrecs.  If a record for the offset already exists and
				+ * belongs to this context, only its count is bumped.  Unless
				+ * NTDB_LOCK_NOCHECK is given, the first recorded lock also triggers a
				+ * check for (and, if needed, execution of) recovery.
				+ * Returns NTDB_SUCCESS immediately when NTDB_NOLOCK is set. */
				+static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
				+				    ntdb_off_t offset, int ltype,
				+				    enum ntdb_lock_flags flags)
				+{
				+	struct ntdb_lock *new_lck;
				+	enum NTDB_ERROR ecode;
				+	/* Sanity bound on the lock offset, derived from the hash size and
				+	 * the mapped file size. */
				+	assert(offset <= (NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
				+			  + ntdb->file->map_size / 8));
				+
				+	if (ntdb->flags & NTDB_NOLOCK)
				+		return NTDB_SUCCESS;
				+
				+	if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+
				+	ntdb->stats.locks++;
				+	/* NULL owner: find the record whoever holds it, so a conflict
				+	 * with another context can be reported below. */
				+	new_lck = find_nestlock(ntdb, offset, NULL);
				+	if (new_lck) {
				+		if (new_lck->owner != ntdb) {
				+			return owner_conflict(ntdb, "ntdb_nest_lock");
				+		}
				+		/* A recorded read lock is never promoted here. */
				+		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+					  "ntdb_nest_lock:"
				+					  " offset %zu has read lock",
				+					  (size_t)offset);
				+		}
				+		/* Just increment the struct, posix locks don't stack. */
				+		new_lck->count++;
				+		return NTDB_SUCCESS;
				+	}
				+
				+#if 0
				+	if (ntdb->file->num_lockrecs
				+	    && offset >= NTDB_HASH_LOCK_START
				+	    && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_nest_lock: already have a hash lock?");
				+	}
				+#endif
				+	if (ntdb->file->lockrecs == NULL) {	/* no array yet: room for one record */
				+		new_lck = ntdb->alloc_fn(ntdb->file, sizeof(*ntdb->file->lockrecs),
				+				     ntdb->alloc_data);
				+	} else {	/* grow to hold one more record than currently used */
				+		new_lck = (struct ntdb_lock *)ntdb->expand_fn(
				+			ntdb->file->lockrecs,
				+			sizeof(*ntdb->file->lockrecs)
				+			* (ntdb->file->num_lockrecs+1),
				+			ntdb->alloc_data);
				+	}
				+	if (new_lck == NULL) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
				+				  "ntdb_nest_lock:"
				+				  " unable to allocate %zu lock struct",
				+				  ntdb->file->num_lockrecs + 1);
				+	}
				+	ntdb->file->lockrecs = new_lck;	/* slot exists before the lock is taken */
				+
				+	/* Since fcntl locks don't nest, we do a lock for the first one,
				+	   and simply bump the count for future ones */
				+	ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	/* First time we grab a lock, perhaps someone died in commit? */
				+	if (!(flags & NTDB_LOCK_NOCHECK)
				+	    && ntdb->file->num_lockrecs == 0) {
				+		ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
				+		if (berr != false) {
				+			ntdb_brunlock(ntdb, ltype, offset, 1);
				+			/* Lock dropped: negative berr is an error code;
				+			 * otherwise recover, then take the lock again. */
				+			if (berr < 0)
				+				return NTDB_OFF_TO_ERR(berr);
				+			ecode = ntdb_lock_and_recover(ntdb);
				+			if (ecode == NTDB_SUCCESS) {
				+				ecode = ntdb_brlock(ntdb, ltype, offset, 1,
				+						   flags);
				+			}
				+			if (ecode != NTDB_SUCCESS) {
				+				return ecode;
				+			}
				+		}
				+	}
				+	/* Record the new lock in the slot reserved above. */
				+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].owner = ntdb;
				+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].off = offset;
				+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].count = 1;
				+	ntdb->file->lockrecs[ntdb->file->num_lockrecs].ltype = ltype;
				+	ntdb->file->num_lockrecs++;
				+
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Drop one reference on the lock this context holds at 'off'.  While
				+ * the record's count is above one only the count is decremented; the
				+ * last reference releases the byte in the kernel and removes the record
				+ * from the lockrecs array.  Succeeds without doing anything under
				+ * NTDB_NOLOCK.
				+ */
				+static enum NTDB_ERROR ntdb_nest_unlock(struct ntdb_context *ntdb,
				+				      ntdb_off_t off, int ltype)
				+{
				+	struct ntdb_lock *rec;
				+	enum NTDB_ERROR ecode;
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	rec = find_nestlock(ntdb, off, ntdb);
				+	if (rec == NULL || rec->count == 0)
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_nest_unlock: no lock for %zu",
				+				  (size_t)off);
				+
				+	if (rec->count > 1) {
				+		rec->count--;
				+		return NTDB_SUCCESS;
				+	}
				+
				+	/*
				+	 * Exactly one reference left, so the kernel lock goes too.  The
				+	 * record's count is not touched: the record is about to be
				+	 * overwritten by the last array element anyway.
				+	 */
				+	ecode = ntdb_brunlock(ntdb, ltype, off, 1);
				+
				+	/* Shrink the array: move the last element into the freed slot. */
				+	ntdb->file->num_lockrecs--;
				+	*rec = ntdb->file->lockrecs[ntdb->file->num_lockrecs];
				+
				+	return ecode;
				+}
			
 
				+
			
 
				+/*
				+ * Acquire the transaction lock (NTDB_TRANSACTION_LOCK) with the given
				+ * lock type, passing NTDB_LOCK_WAIT.
				+ */
				+enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_nest_lock(ntdb, NTDB_TRANSACTION_LOCK, ltype,
				+			       NTDB_LOCK_WAIT);
				+	return ecode;
				+}
			
 
				+
			
 
				+/*
				+ * Give the transaction lock (NTDB_TRANSACTION_LOCK) back.  The result of
				+ * the underlying unlock is deliberately not propagated.
				+ */
				+void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype)
				+{
				+	(void)ntdb_nest_unlock(ntdb, NTDB_TRANSACTION_LOCK, ltype);
				+}
			
 
				+
			
 
				+/*
				+ * Lock the range [off, off + len) piecewise.  Individual bytes would be
				+ * enough, but Linux merges consecutive locks, so contiguous ranges are
				+ * used: first the whole range is tried without NTDB_LOCK_WAIT; if that
				+ * yields NTDB_ERR_LOCK the range is halved and each half is locked
				+ * recursively with the caller's flags.  A failure on the second half
				+ * releases the first half again.  len must not be zero.
				+ */
				+static enum NTDB_ERROR ntdb_lock_gradual(struct ntdb_context *ntdb,
				+				       int ltype, enum ntdb_lock_flags flags,
				+				       ntdb_off_t off, ntdb_off_t len)
				+{
				+	const enum ntdb_lock_flags nb_flags = (flags & ~NTDB_LOCK_WAIT);
				+	const ntdb_off_t first_half = len / 2;
				+	enum NTDB_ERROR ecode;
				+
				+	if (len <= 1) {
				+		/* 0 would mean to end-of-file... */
				+		assert(len != 0);
				+		/* Single hash.  Just do blocking lock. */
				+		return ntdb_brlock(ntdb, ltype, off, len, flags);
				+	}
				+
				+	/* Optimistic attempt on the whole range, without waiting. */
				+	ecode = ntdb_brlock(ntdb, ltype, off, len, nb_flags);
				+	if (ecode != NTDB_ERR_LOCK)
				+		return ecode;
				+
				+	/* Contended: lock the lower half, then the upper half. */
				+	ecode = ntdb_lock_gradual(ntdb, ltype, flags, off, first_half);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+
				+	ecode = ntdb_lock_gradual(ntdb, ltype, flags,
				+				  off + first_half, len - first_half);
				+	if (ecode == NTDB_SUCCESS)
				+		return ecode;
				+
				+	/* Upper half failed: undo the lower half. */
				+	ntdb_brunlock(ntdb, ltype, off, first_half);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* lock/unlock entire database.  It can only be upgradable if you have some
				+ * other way of guaranteeing exclusivity (ie. transaction write lock).
				+ *
				+ * Nested calls by the owning context only bump the count, provided the
				+ * request is a read lock or the held lock is recorded as a write lock.
				+ * A fresh lock covers the hash lock range and everything after it to end
				+ * of file.  Unless NTDB_LOCK_NOCHECK is given, a pending recovery is
				+ * detected afterwards: the lock is dropped, recovery is run and the
				+ * lock is taken again.  Returns NTDB_SUCCESS at once under NTDB_NOLOCK. */
				+enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
				+				  enum ntdb_lock_flags flags, bool upgradable)
				+{
				+	enum NTDB_ERROR ecode;
				+	ntdb_bool_err berr;
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	if (!check_lock_pid(ntdb, "ntdb_allrecord_lock", true)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+	/* Already held: nest if compatible, otherwise refuse. */
				+	if (ntdb->file->allrecord_lock.count) {
				+		if (ntdb->file->allrecord_lock.owner != ntdb) {
				+			return owner_conflict(ntdb, "ntdb_allrecord_lock");
				+		}
				+
				+		if (ltype == F_RDLCK
				+		    || ntdb->file->allrecord_lock.ltype == F_WRLCK) {
				+			ntdb->file->allrecord_lock.count++;
				+			return NTDB_SUCCESS;
				+		}
				+
				+		/* a global lock of a different type exists */
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+				  "ntdb_allrecord_lock: already have %s lock",
				+				  ntdb->file->allrecord_lock.ltype == F_RDLCK
				+				  ? "read" : "write");
				+	}
				+
				+	if (ntdb_has_hash_locks(ntdb)) {
				+		/* can't combine global and chain locks */
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+				  "ntdb_allrecord_lock:"
				+				  " already have chain lock");
				+	}
				+
				+	if (upgradable && ltype != F_RDLCK) {
				+		/* ntdb error: you can't upgrade a write lock! */
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_allrecord_lock:"
				+				  " can't upgrade a write lock");
				+	}
				+
				+	ntdb->stats.locks++;	/* counted once, even if we loop via 'again' */
				+again:
				+	/* Lock hashes, gradually. */
				+	ecode = ntdb_lock_gradual(ntdb, ltype, flags, NTDB_HASH_LOCK_START,
				+				  1 << ntdb->hash_bits);
				+	if (ecode != NTDB_SUCCESS)
				+		return ecode;
				+
				+	/* Lock free tables: there to end of file. */
				+	ecode = ntdb_brlock(ntdb, ltype,
				+			    NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits),
				+			    0, flags);
				+	if (ecode != NTDB_SUCCESS) {	/* undo the hash range taken above */
				+		ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START,
				+			      1 << ntdb->hash_bits);
				+		return ecode;
				+	}
				+
				+	ntdb->file->allrecord_lock.owner = ntdb;
				+	ntdb->file->allrecord_lock.count = 1;
				+	/* If it's upgradable, it's actually exclusive so we can treat
				+	 * it as a write lock. */
				+	ntdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
				+	ntdb->file->allrecord_lock.off = upgradable;	/* 'off' doubles as the upgradable marker */
				+
				+	/* Now check for needing recovery. */
				+	if (flags & NTDB_LOCK_NOCHECK)
				+		return NTDB_SUCCESS;
				+
				+	berr = ntdb_needs_recovery(ntdb);
				+	if (likely(berr == false))
				+		return NTDB_SUCCESS;
				+	/* Recovery needed or check failed: release before acting on it. */
				+	ntdb_allrecord_unlock(ntdb, ltype);
				+	if (berr < 0)
				+		return NTDB_OFF_TO_ERR(berr);
				+	ecode = ntdb_lock_and_recover(ntdb);
				+	if (ecode != NTDB_SUCCESS) {
				+		return ecode;
				+	}
				+	goto again;	/* recovered: take the lock afresh and re-check */
				+}
			
 
				+
			
 
				+/* Acquire the open lock (NTDB_OPEN_LOCK) with the given type and flags. */
				+enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
				+			     int ltype, enum ntdb_lock_flags flags)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_nest_lock(ntdb, NTDB_OPEN_LOCK, ltype, flags);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Release the open lock (NTDB_OPEN_LOCK); the unlock result is discarded. */
				+void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype)
				+{
				+	(void)ntdb_nest_unlock(ntdb, NTDB_OPEN_LOCK, ltype);
				+}
			
 
				+
			
 
				+/*
				+ * True when this context has a recorded lock at NTDB_OPEN_LOCK.  Always
				+ * false under NTDB_NOLOCK.
				+ */
				+bool ntdb_has_open_lock(struct ntdb_context *ntdb)
				+{
				+	if (ntdb->flags & NTDB_NOLOCK)
				+		return false;
				+
				+	return find_nestlock(ntdb, NTDB_OPEN_LOCK, ntdb) != NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Acquire the expansion lock (NTDB_EXPANSION_LOCK), waiting for it.
				+ * This lock doesn't protect data, so the recovery check is skipped (with
				+ * the check enabled we would recurse).
				+ */
				+enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype)
				+{
				+	const enum ntdb_lock_flags flags = NTDB_LOCK_WAIT | NTDB_LOCK_NOCHECK;
				+
				+	return ntdb_nest_lock(ntdb, NTDB_EXPANSION_LOCK, ltype, flags);
				+}
			
 
				+
			
 
				+/* Release the expansion lock (NTDB_EXPANSION_LOCK); the unlock result is
				+ * discarded. */
				+void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype)
				+{
				+	(void)ntdb_nest_unlock(ntdb, NTDB_EXPANSION_LOCK, ltype);
				+}
			
 
				+
			
 
				+/*
				+ * Drop one reference on the whole-database lock.  Misuse (not locked,
				+ * locked by another context, wrong lock type) is logged and otherwise
				+ * ignored.  When the last reference goes, the range from
				+ * NTDB_HASH_LOCK_START to end of file is unlocked.  No-op under
				+ * NTDB_NOLOCK.
				+ */
				+void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype)
				+{
				+	bool type_differs, upgradable_read;
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return;
				+	}
				+
				+	if (ntdb->file->allrecord_lock.count == 0) {
				+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+			   "ntdb_allrecord_unlock: not locked!");
				+		return;
				+	}
				+
				+	if (ntdb->file->allrecord_lock.owner != ntdb) {
				+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+			   "ntdb_allrecord_unlock: not locked by us!");
				+		return;
				+	}
				+
				+	/* Upgradable locks are marked as write locks, so releasing one
				+	 * as a read lock is the single tolerated type mismatch. */
				+	type_differs = (ntdb->file->allrecord_lock.ltype != ltype);
				+	upgradable_read = (ntdb->file->allrecord_lock.off
				+			   && ltype == F_RDLCK);
				+	if (type_differs && !upgradable_read) {
				+		ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+			   "ntdb_allrecord_unlock: have %s lock",
				+			   ntdb->file->allrecord_lock.ltype == F_RDLCK
				+			   ? "read" : "write");
				+		return;
				+	}
				+
				+	/* count is at least 1 here; only the last reference unlocks. */
				+	ntdb->file->allrecord_lock.count--;
				+	if (ntdb->file->allrecord_lock.count != 0)
				+		return;
				+
				+	ntdb->file->allrecord_lock.ltype = 0;
				+
				+	ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START, 0);
				+}
			
 
				+
			
 
				+/* True when this context has a recorded lock at NTDB_EXPANSION_LOCK. */
				+bool ntdb_has_expansion_lock(struct ntdb_context *ntdb)
				+{
				+	const struct ntdb_lock *rec;
				+
				+	rec = find_nestlock(ntdb, NTDB_EXPANSION_LOCK, ntdb);
				+	return rec != NULL;
				+}
			
 
				+
			
 
				+/*
				+ * True when any recorded lock (whoever owns it) lies in the hash lock
				+ * range, i.e. at an offset from NTDB_HASH_LOCK_START up to, but not
				+ * including, NTDB_HASH_LOCK_START + (1 << hash_bits).
				+ */
				+bool ntdb_has_hash_locks(struct ntdb_context *ntdb)
				+{
				+	unsigned int idx = 0;
				+
				+	while (idx < ntdb->file->num_lockrecs) {
				+		if (ntdb->file->lockrecs[idx].off < NTDB_HASH_LOCK_START) {
				+			idx++;
				+			continue;
				+		}
				+		if (ntdb->file->lockrecs[idx].off < (NTDB_HASH_LOCK_START
				+						      + (1 << ntdb->hash_bits))) {
				+			return true;
				+		}
				+		idx++;
				+	}
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * True when any recorded lock sits above the hash lock range, i.e. at an
				+ * offset greater than NTDB_HASH_LOCK_START + (1 << hash_bits).  Always
				+ * false under NTDB_NOLOCK.
				+ */
				+static bool ntdb_has_free_lock(struct ntdb_context *ntdb)
				+{
				+	unsigned int idx;
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return false;
				+	}
				+
				+	idx = 0;
				+	while (idx < ntdb->file->num_lockrecs) {
				+		if (ntdb->file->lockrecs[idx].off
				+		    > NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)) {
				+			return true;
				+		}
				+		idx++;
				+	}
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * Lock hash chain 'h' with the given lock type.  When an allrecord lock
				+ * is held by this context it already covers the chain, provided the
				+ * request is a read lock or matches the held type.  Otherwise a nested
				+ * lock at NTDB_HASH_LOCK_START + h is taken, which is refused while a
				+ * free-bucket lock or the expansion lock is held.
				+ */
				+enum NTDB_ERROR ntdb_lock_hash(struct ntdb_context *ntdb,
				+			       unsigned int h,
				+			       int ltype)
				+{
				+	unsigned lock_off = NTDB_HASH_LOCK_START + h;
				+
				+	assert(h < (1 << ntdb->hash_bits));
				+
				+	if (ntdb->file->allrecord_lock.count == 0) {
				+		/* No allrecord lock: take a per-chain lock. */
				+		if (ntdb_has_free_lock(ntdb))
				+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+					  "ntdb_lock_hashes: already have free lock");
				+
				+		if (ntdb_has_expansion_lock(ntdb))
				+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+					  "ntdb_lock_hashes:"
				+					  " already have expansion lock");
				+
				+		return ntdb_nest_lock(ntdb, lock_off, ltype, NTDB_LOCK_WAIT);
				+	}
				+
				+	/* a allrecord lock allows us to avoid per chain locks */
				+	if (!check_lock_pid(ntdb, "ntdb_lock_hashes", true)) {
				+		return NTDB_ERR_LOCK;
				+	}
				+
				+	if (ntdb->file->allrecord_lock.owner != ntdb) {
				+		return owner_conflict(ntdb, "ntdb_lock_hashes");
				+	}
				+
				+	if (ltype != F_RDLCK && ltype != ntdb->file->allrecord_lock.ltype) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+				  "ntdb_lock_hashes:"
				+				  " already have %s allrecordlock",
				+				  ntdb->file->allrecord_lock.ltype == F_RDLCK
				+				  ? "read" : "write");
				+	}
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Undo ntdb_lock_hash() for chain 'h' (masked to the hash size).  While
				+ * an allrecord lock is held no per-chain lock exists, so only the
				+ * request is validated; otherwise the nested lock is released.  Returns
				+ * 0 without doing anything under NTDB_NOLOCK.
				+ */
				+enum NTDB_ERROR ntdb_unlock_hash(struct ntdb_context *ntdb,
				+				 unsigned int h, int ltype)
				+{
				+	const unsigned bucket = h & ((1 << ntdb->hash_bits)-1);
				+	unsigned lock_off = NTDB_HASH_LOCK_START + bucket;
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return 0;
				+	}
				+
				+	if (ntdb->file->allrecord_lock.count == 0) {
				+		return ntdb_nest_unlock(ntdb, lock_off, ltype);
				+	}
				+
				+	/* a allrecord lock allows us to avoid per chain locks */
				+	if (ntdb->file->allrecord_lock.ltype == F_RDLCK && ltype == F_WRLCK)
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_unlock_hashes RO allrecord!");
				+
				+	if (ntdb->file->allrecord_lock.owner != ntdb)
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
				+				  "ntdb_unlock_hashes:"
				+				  " not locked by us!");
				+
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Map a free-bucket offset to the offset of its lock byte.
				+ *
				+ * Hash locks occupy NTDB_HASH_LOCK_START plus the number of hash
				+ * entries; free-bucket locks start right after.  Bucket offsets are
				+ * sizeof(ntdb_off_t) apart, so dividing gives consecutive lock bytes.
				+ * As a result, 32 bit systems don't use lock values > 2^31 on files
				+ * that are less than 4GB.
				+ */
				+static ntdb_off_t free_lock_off(const struct ntdb_context *ntdb,
				+				ntdb_off_t b_off)
				+{
				+	const ntdb_off_t bucket_index = b_off / sizeof(ntdb_off_t);
				+
				+	return NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
				+		+ bucket_index;
				+}
			
 
				+
			
 
				+/*
				+ * Write-lock the free bucket at b_off.  An allrecord write lock held by
				+ * this context already covers it; an allrecord read lock is an error.
				+ * Otherwise a nested write lock is taken at the bucket's lock offset,
				+ * using the caller's wait flags.  Returns 0 at once under NTDB_NOLOCK.
				+ */
				+enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
				+				    enum ntdb_lock_flags waitflag)
				+{
				+	assert(b_off >= sizeof(struct ntdb_header));
				+
				+	if (ntdb->flags & NTDB_NOLOCK) {
				+		return 0;
				+	}
				+
				+	/* a allrecord lock allows us to avoid per chain locks */
				+	if (ntdb->file->allrecord_lock.count) {
				+		if (!check_lock_pid(ntdb, "ntdb_lock_free_bucket", true)) {
				+			return NTDB_ERR_LOCK;
				+		}
				+
				+		if (ntdb->file->allrecord_lock.owner != ntdb)
				+			return owner_conflict(ntdb, "ntdb_lock_free_bucket");
				+
				+		if (ntdb->file->allrecord_lock.ltype != F_WRLCK) {
				+			return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+					  "ntdb_lock_free_bucket with"
				+					  " read-only allrecordlock!");
				+		}
				+		return 0;
				+	}
				+
				+#if 0 /* FIXME */
				+	if (ntdb_has_expansion_lock(ntdb)) {
				+		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				+				  "ntdb_lock_free_bucket:"
				+				  " already have expansion lock");
				+	}
				+#endif
				+
				+	return ntdb_nest_lock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK,
				+			      waitflag);
				+}
			
 
				+
			
 
				+/*
				+ * Release the write lock on the free bucket at b_off.  Nothing is done
				+ * while an allrecord lock is held, since no per-bucket lock was taken in
				+ * that case.
				+ */
				+void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off)
				+{
				+	if (ntdb->file->allrecord_lock.count == 0) {
				+		ntdb_nest_unlock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK);
				+	}
				+}
			
 
				+
			
 
				+/* Public entry point: take the (non-upgradable) allrecord write lock,
				+ * passing NTDB_LOCK_WAIT. */
				+_PUBLIC_ enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT, false);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Public entry point: release the allrecord write lock taken by
				+ * ntdb_lockall(). */
				+_PUBLIC_ void ntdb_unlockall(struct ntdb_context *ntdb)
				+{
				+	const int ltype = F_WRLCK;
				+
				+	ntdb_allrecord_unlock(ntdb, ltype);
				+}
			
 
				+
			
 
				+/* Public entry point: take the (non-upgradable) allrecord read lock,
				+ * passing NTDB_LOCK_WAIT. */
				+_PUBLIC_ enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb)
				+{
				+	enum NTDB_ERROR ecode;
				+
				+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
				+	return ecode;
				+}
			
 
				+
			
 
				+/* Public entry point: release the allrecord read lock taken by
				+ * ntdb_lockall_read(). */
				+_PUBLIC_ void ntdb_unlockall_read(struct ntdb_context *ntdb)
				+{
				+	const int ltype = F_RDLCK;
				+
				+	ntdb_allrecord_unlock(ntdb, ltype);
				+}
			
 
				+
			
 
				+/*
				+ * Release every lock this context still holds: all references on the
				+ * allrecord lock (if we own it) and every nested lock record owned by
				+ * us.  Does nothing when check_lock_pid() reports the locks are not
				+ * ours.
				+ */
				+void ntdb_lock_cleanup(struct ntdb_context *ntdb)
				+{
				+	unsigned int idx;
				+
				+	/* We don't want to warn: they're allowed to close ntdb after fork. */
				+	if (!check_lock_pid(ntdb, "ntdb_close", false)) {
				+		return;
				+	}
				+
				+	while (ntdb->file->allrecord_lock.count
				+	       && ntdb->file->allrecord_lock.owner == ntdb)
				+		ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
				+
				+	idx = 0;
				+	while (idx < ntdb->file->num_lockrecs) {
				+		if (ntdb->file->lockrecs[idx].owner != ntdb) {
				+			idx++;
				+			continue;
				+		}
				+		/* Stay on this slot: the unlock either lowered the count
				+		 * or moved a different record in here. */
				+		ntdb_nest_unlock(ntdb,
				+				 ntdb->file->lockrecs[idx].off,
				+				 ntdb->file->lockrecs[idx].ltype);
				+	}
				+}
			
--- a/ccan/ntdb/man/ntdb.3.xml
+++ b/ccan/ntdb/man/ntdb.3.xml
@@ -0,0 +1,132 @@
 
				+<?xml version="1.0"?>
			
 
				+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
			
 
				+<refentry>
			
 
				+  <refmeta>
			
 
				+    <refentrytitle>ntdb</refentrytitle>
			
 
				+    <manvolnum>3</manvolnum>
			
 
				+    <refmiscinfo class="source">Samba</refmiscinfo>
			
 
				+    <refmiscinfo class="manual">System Administration tools</refmiscinfo>
			
 
				+    <refmiscinfo class="version">4.1</refmiscinfo>
			
 
				+  </refmeta>
			
 
				+  <refnamediv>
			
 
				+    <refname>ntdb</refname>
			
 
				+<refpurpose>A not-so trivial keyword/data database system</refpurpose>
			
 
				+  </refnamediv>
			
 
				+  <refsynopsisdiv>
			
 
				+<synopsis>#include &lt;ntdb.h&gt;</synopsis>
			
 
				+  </refsynopsisdiv>
			
 
				+  <refsect1><title>DESCRIPTION</title>
			
 
				+    <para>
			
 
				+      If you have previously used the tdb library from Samba, much of
			
 
				+      this will seem familiar, but there are some API changes which a
			
 
				+      compiler will warn you about if you simply replace 'tdb' with
			
 
				+      'ntdb' in your code!  The on-disk format for ntdb is
			
 
				+      incompatible with tdb.
			
 
				+    </para>
			
 
				+    <para>
			
 
				+      tdb's API was based on gdbm, and ntdb continues this tradition,
			
 
				+      with enhancements.  A differences guide is available in the text
			
 
				+      file <filename>lib/ntdb/doc/TDB_porting.txt</filename> in the
			
 
				+      SAMBA source tree.
			
 
				+    </para>
			
 
				+  </refsect1>
			
 
				+  <refsect1><title>NTDB API OVERVIEW</title>
			
 
				+    <para>
			
 
				+      The complete API is documented in the ntdb.h header, which is
			
 
				+      kept up-to-date and recommended reading.
			
 
				+    </para>
			
 
				+    <para>
			
 
				+      Normal usage is to call ntdb_open() to create or open an ntdb
			
 
				+      file.  ntdb_store() is used to add records, ntdb_fetch() is used
			
 
				+      to fetch them.  Traversals are supported via callback
			
 
				+      (ntdb_traverse()) or iteration (ntdb_firstkey() and
			
 
				+      ntdb_nextkey()).  Transactions are supported for batching
			
 
				+      updates or reads atomically, using ntdb_transaction_start() and
			
 
				+      ntdb_transaction_commit().
			
 
				+    </para>
			
 
				+    <refsect2><title>Use With Talloc</title>
			
 
				+      <para>
			
 
				+	ntdb_open() takes an optional linked list of attributes:
			
 
				+	in particular you can specify an alternate allocator (such as
			
 
				+	talloc):
			
 
				+      </para>
			
 
				+      <programlisting>
			
 
				+#include &lt;talloc.h&gt;
			
 
				+#include &lt;ntdb.h&gt;
			
 
				+
			
 
				+static void *my_alloc(const void *owner, size_t len, void *priv)
			
 
				+{
			
 
				+    return talloc_size(owner, len);
			
 
				+}
			
 
				+
			
 
				+static void *my_expand(void *old, size_t newlen, void *priv)
			
 
				+{
			
 
				+    return talloc_realloc_size(NULL, old, newlen);
			
 
				+}
			
 
				+
			
 
				+static void my_free(void *old, void *priv)
			
 
				+{
			
 
				+    talloc_free(old);
			
 
				+}
			
 
				+
			
 
				+/* This opens an ntdb file as a talloc object with given parent. */
			
 
				+struct ntdb_context *ntdb_open_talloc(const void *parent,
			
 
				+                                      const char *filename)
			
 
				+{
			
 
				+     struct ntdb_context *ntdb;
			
 
				+     union ntdb_attribute alloc;
			
 
				+
			
 
				+     alloc.base.attr = NTDB_ATTRIBUTE_ALLOCATOR;
			
 
				+     alloc.base.next = NULL;
			
 
				+     alloc.alloc.alloc = my_alloc;
			
 
				+     alloc.alloc.expand = my_expand;
			
 
				+     alloc.alloc.free = my_free;
			
 
				+
			
 
				+     ntdb = ntdb_open(filename, NTDB_DEFAULT, O_RDWR|O_CREAT, 0600,
			
 
				+                      &amp;alloc);
			
 
				+     if (ntdb) {
			
 
				+         talloc_steal(parent, ntdb);
			
 
				+         talloc_set_name(ntdb, "%s", filename);
			
 
				+     }
			
 
				+     return ntdb;
			
 
				+}
			
 
				+</programlisting>
			
 
				+    </refsect2>
			
 
				+  </refsect1>
			
 
				+  <refsect1><title>SEE ALSO</title>
			
 
				+    <para>
			
 
				+      <ulink url="http://tdb.samba.org/"/>
			
 
				+    </para>
			
 
				+  </refsect1>
			
 
				+
			
 
				+  <refsect1><title>AUTHOR</title>
			
 
				+    <para> The original tdb software was created by Andrew Tridgell, and
			
 
				+    is now developed by the
			
 
				+      Samba Team as an Open Source project similar to the way the
			
 
				+      Linux kernel is developed.  ntdb was derived from tdb, but mostly
			
 
				+      rewritten by Rusty Russell.
			
 
				+    </para>
			
 
				+  </refsect1>
			
 
				+
			
 
				+  <refsect1><title>COPYRIGHT/LICENSE</title>
			
 
				+    <para>
			
 
				+      Copyright (C) Rusty Russell 2013, IBM Corporation
			
 
				+    </para>
			
 
				+    <para>
			
 
				+      This program is free software; you can redistribute it and/or modify
			
 
				+      it under the terms of the GNU Lesser General Public License as
			
 
				+      published by the Free Software Foundation; either version 3 of the
			
 
				+      License, or (at your option) any later version.
			
 
				+    </para>
			
 
				+    <para>
			
 
				+      This program is distributed in the hope that it will be useful, but
			
 
				+      WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+      Lesser General Public License for more details.
			
 
				+    </para>
			
 
				+    <para>
			
 
				+      You should have received a copy of the GNU Lesser General Public License
			
 
				+      along with this program; if not, see http://www.gnu.org/licenses/.
			
 
				+    </para>
			
 
				+  </refsect1>
			
 
				+</refentry>
			
--- a/ccan/ntdb/man/ntdbbackup.8.xml
+++ b/ccan/ntdb/man/ntdbbackup.8.xml
@@ -0,0 +1,150 @@
 
				+<?xml version="1.0" encoding="iso-8859-1"?>
			
 
				+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
			
 
				+<refentry id="ntdbbackup.8">
			
 
				+
			
 
				+<refmeta>
			
 
				+	<refentrytitle>ntdbbackup</refentrytitle>
			
 
				+	<manvolnum>8</manvolnum>
			
 
				+	<refmiscinfo class="source">Samba</refmiscinfo>
			
 
				+	<refmiscinfo class="manual">System Administration tools</refmiscinfo>
			
 
				+	<refmiscinfo class="version">4.1</refmiscinfo>
			
 
				+</refmeta>
			
 
				+
			
 
				+
			
 
				+<refnamediv>
			
 
				+	<refname>ntdbbackup</refname>
			
 
				+	<refpurpose>tool for backing up and for validating the integrity of samba .ntdb files</refpurpose>
			
 
				+</refnamediv>
			
 
				+
			
 
				+<refsynopsisdiv>
			
 
				+	<cmdsynopsis>
			
 
				+		<command>ntdbbackup</command>
			
 
				+		<arg choice="opt">-s suffix</arg>
			
 
				+		<arg choice="opt">-v</arg>
			
 
				+		<arg choice="opt">-h</arg>
			
 
				+	</cmdsynopsis>
			
 
				+</refsynopsisdiv>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>DESCRIPTION</title>
			
 
				+
			
 
				+	<para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
			
 
				+	<manvolnum>1</manvolnum></citerefentry> suite.</para>
			
 
				+
			
 
				+	<para><command>ntdbbackup</command> is a tool that may be used to backup samba .ntdb
			
 
				+	files. This tool may also be used to verify the integrity of the .ntdb files prior
			
 
				+	to samba startup or during normal operation. If it finds file damage and it finds
			
 
				+	a prior backup the backup file will be restored.
			
 
				+	</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>OPTIONS</title>
			
 
				+
			
 
				+	<variablelist>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>-h</term>
			
 
				+		<listitem><para>
			
 
				+		Get help information.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>-s suffix</term>
			
 
				+		<listitem><para>
			
 
				+		The <command>-s</command> option allows the administrator to specify a file
			
 
				+		backup extension. This way it is possible to keep a history of ntdb backup
			
 
				+		files by using a new suffix for each backup.
			
 
				+		</para> </listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>-v</term>
			
 
				+		<listitem><para>
			
 
				+		The <command>-v</command> option checks the database for damage (corrupt data);
			
 
				+		if any is detected, the backup is restored.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+	</variablelist>
			
 
				+</refsect1>
			
 
				+
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>COMMANDS</title>
			
 
				+
			
 
				+	<para><emphasis>GENERAL INFORMATION</emphasis></para>
			
 
				+
			
 
				+	<para>
			
 
				+	The <command>ntdbbackup</command> utility can safely be run at any time. It was designed so
			
 
				+	that it can be used at any time to validate the integrity of ntdb files, even during Samba
			
 
				+	operation. Typical usage for the command will be:
			
 
				+	</para>
			
 
				+
			
 
				+	<para>ntdbbackup [-s suffix] *.ntdb</para>
			
 
				+
			
 
				+	<para>
			
 
				+	Before restarting samba the following command may be run to validate .ntdb files:
			
 
				+	</para>
			
 
				+
			
 
				+	<para>ntdbbackup -v [-s suffix] *.ntdb</para>
			
 
				+
			
 
				+	<para>
			
 
				+	Note that Samba 4 can use .tdb files instead, so you should
			
 
				+	use <command>tdbbackup</command> on those files.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>
			
 
				+	Samba .tdb and .ntdb files are stored in various locations; be sure to back up all
			
 
				+	.(n)tdb files on the system. Important files include:
			
 
				+	</para>
			
 
				+
			
 
				+	<itemizedlist>
			
 
				+		<listitem><para>
			
 
				+		<command>secrets.(n)tdb</command> - usual location is in the /usr/local/samba/private
			
 
				+		directory, or on some systems in /etc/samba.
			
 
				+		</para></listitem>
			
 
				+
			
 
				+		<listitem><para>
			
 
				+		<command>passdb.(n)tdb</command> - usual location is in the /usr/local/samba/private
			
 
				+		directory, or on some systems in /etc/samba.
			
 
				+		</para></listitem>
			
 
				+
			
 
				+		<listitem><para>
			
 
				+		<command>*.tdb</command> and <command>*.ntdb</command> located in the /usr/local/samba/var directory or on some
			
 
				+		systems in the /var/cache or /var/lib/samba directories.
			
 
				+		</para></listitem>
			
 
				+	</itemizedlist>
			
 
				+
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>VERSION</title>
			
 
				+
			
 
				+	<para>This man page is correct for version 4 of the Samba suite.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>SEE ALSO</title>
			
 
				+
			
 
				+    <para>
			
 
				+      tdbbackup(8), ntdbrestore(8)
			
 
				+    </para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>AUTHOR</title>
			
 
				+
			
 
				+	<para>
			
 
				+	The original Samba software and related utilities were created by Andrew Tridgell.
			
 
				+	Samba is now developed by the Samba Team as an Open Source project similar to the way
			
 
				+	the Linux kernel is developed.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>The ntdbbackup man page was written by Rusty Russell,
			
 
				+	based on the tdbbackup man page by John H Terpstra.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+</refentry>
			
--- a/ccan/ntdb/man/ntdbdump.8.xml
+++ b/ccan/ntdb/man/ntdbdump.8.xml
@@ -0,0 +1,93 @@
 
				+<?xml version="1.0" encoding="iso-8859-1"?>
			
 
				+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
			
 
				+<refentry id="ntdbdump.8">
			
 
				+
			
 
				+<refmeta>
			
 
				+	<refentrytitle>ntdbdump</refentrytitle>
			
 
				+	<manvolnum>8</manvolnum>
			
 
				+	<refmiscinfo class="source">Samba</refmiscinfo>
			
 
				+	<refmiscinfo class="manual">System Administration tools</refmiscinfo>
			
 
				+	<refmiscinfo class="version">4.1</refmiscinfo>
			
 
				+</refmeta>
			
 
				+
			
 
				+
			
 
				+<refnamediv>
			
 
				+	<refname>ntdbdump</refname>
			
 
				+	<refpurpose>tool for printing the contents of an NTDB file</refpurpose>
			
 
				+</refnamediv>
			
 
				+
			
 
				+<refsynopsisdiv>
			
 
				+	<cmdsynopsis>
			
 
				+		<command>ntdbdump</command>
			
 
				+		<arg choice="opt">-k <replaceable>keyname</replaceable></arg>
			
 
				+		<arg choice="opt">-e</arg>
			
 
				+		<arg choice="opt">-h</arg>
			
 
				+		<arg choice="req">filename</arg>
			
 
				+	</cmdsynopsis>
			
 
				+</refsynopsisdiv>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>DESCRIPTION</title>
			
 
				+
			
 
				+	<para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
			
 
				+	<manvolnum>1</manvolnum></citerefentry> suite.</para>
			
 
				+
			
 
				+	<para><command>ntdbdump</command> is a very simple utility that 'dumps' the
			
 
				+		contents of a NTDB (New Trivial DataBase) file to standard output in a
			
 
				+		human-readable format.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>This tool can be used when debugging problems with NTDB files. It is
			
 
				+		intended for those who are somewhat familiar with Samba internals.
			
 
				+	</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>OPTIONS</title>
			
 
				+
			
 
				+	<variablelist>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>-h</term>
			
 
				+		<listitem><para>
			
 
				+		Get help information.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>-k <replaceable>keyname</replaceable></term>
			
 
				+		<listitem><para>
			
 
				+		The <command>-k</command> option restricts dumping to a single key, if found.
			
 
				+		</para> </listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+	</variablelist>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>SEE ALSO</title>
			
 
				+
			
 
				+    <para>
			
 
				+      tdbdump(8), ntdbtool(8)
			
 
				+    </para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>VERSION</title>
			
 
				+
			
 
				+	<para>This man page is correct for version 4 of the Samba suite.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>AUTHOR</title>
			
 
				+
			
 
				+	<para>
			
 
				+	The original Samba software and related utilities were created by Andrew Tridgell.
			
 
				+	Samba is now developed by the Samba Team as an Open Source project similar to the way
			
 
				+	the Linux kernel is developed.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>The ntdbdump man page was written by Rusty Russell, based on the tdbdump man page by Jelmer Vernooij.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+</refentry>
			
--- a/ccan/ntdb/man/ntdbrestore.8.xml
+++ b/ccan/ntdb/man/ntdbrestore.8.xml
@@ -0,0 +1,74 @@
 
				+<?xml version="1.0" encoding="iso-8859-1"?>
			
 
				+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
			
 
				+<refentry id="ntdbrestore.8">
			
 
				+
			
 
				+<refmeta>
			
 
				+	<refentrytitle>ntdbrestore</refentrytitle>
			
 
				+	<manvolnum>8</manvolnum>
			
 
				+	<refmiscinfo class="source">Samba</refmiscinfo>
			
 
				+	<refmiscinfo class="manual">System Administration tools</refmiscinfo>
			
 
				+	<refmiscinfo class="version">4.1</refmiscinfo>
			
 
				+</refmeta>
			
 
				+
			
 
				+
			
 
				+<refnamediv>
			
 
				+	<refname>ntdbrestore</refname>
			
 
				+	<refpurpose>tool for creating a NTDB file out of a ntdbdump output</refpurpose>
			
 
				+</refnamediv>
			
 
				+
			
 
				+<refsynopsisdiv>
			
 
				+	<cmdsynopsis>
			
 
				+		<command>ntdbrestore</command>
			
 
				+		<arg choice="req">ntdbfilename</arg>
			
 
				+	</cmdsynopsis>
			
 
				+</refsynopsisdiv>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>DESCRIPTION</title>
			
 
				+
			
 
				+	<para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
			
 
				+	<manvolnum>1</manvolnum></citerefentry> suite.</para>
			
 
				+
			
 
				+	<para><command>ntdbrestore</command> is a very simple utility that 'restores' the
			
 
				+		contents of a dump file into an NTDB (New Trivial DataBase) file. The dump file is obtained from the ntdbdump or tdbdump
			
 
				+		commands.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>This tool reads the content of the dump from standard input and writes the ntdb to the file named by the ntdbfilename
			
 
				+  parameter.
			
 
				+	</para>
			
 
				+	<para>This tool can be used to translate between ntdb and tdb files by dumping and restoring.
			
 
				+	</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>VERSION</title>
			
 
				+
			
 
				+	<para>This man page is correct for version 4 of the Samba suite.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>SEE ALSO</title>
			
 
				+
			
 
				+    <para>
			
 
				+      ntdbdump(8), tdbrestore(8)
			
 
				+    </para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>AUTHOR</title>
			
 
				+
			
 
				+	<para>
			
 
				+	The original Samba software and related utilities were created by Andrew Tridgell.
			
 
				+	Samba is now developed by the Samba Team as an Open Source project similar to the way
			
 
				+	the Linux kernel is developed.
			
 
				+
			
 
				+        ntdbrestore was written by Rusty Russell based on tdbrestore, which was initially written by Volker Lendecke based on an
			
 
				+        idea by Simon McVittie.
			
 
				+	</para>
			
 
				+
			
 
				+	<para>The ntdbrestore man page was written by Rusty Russell, based on the tdbrestore man page by Matthieu Patou.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+</refentry>
			
--- a/ccan/ntdb/man/ntdbtool.8.xml
+++ b/ccan/ntdb/man/ntdbtool.8.xml
@@ -0,0 +1,247 @@
 
				+<?xml version="1.0" encoding="iso-8859-1"?>
			
 
				+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
			
 
				+<refentry id="ntdbtool.8">
			
 
				+
			
 
				+<refmeta>
			
 
				+	<refentrytitle>ntdbtool</refentrytitle>
			
 
				+	<manvolnum>8</manvolnum>
			
 
				+	<refmiscinfo class="source">Samba</refmiscinfo>
			
 
				+	<refmiscinfo class="manual">System Administration tools</refmiscinfo>
			
 
				+	<refmiscinfo class="version">4.1</refmiscinfo>
			
 
				+</refmeta>
			
 
				+
			
 
				+
			
 
				+<refnamediv>
			
 
				+	<refname>ntdbtool</refname>
			
 
				+	<refpurpose>manipulate the contents of NTDB files</refpurpose>
			
 
				+</refnamediv>
			
 
				+
			
 
				+<refsynopsisdiv>
			
 
				+
			
 
				+	<cmdsynopsis>
			
 
				+		<command>ntdbtool</command>
			
 
				+		<arg choice="plain">
			
 
				+		<replaceable>NTDBFILE</replaceable>
			
 
				+		</arg>
			
 
				+		<arg rep="repeat" choice="opt">
			
 
				+		<replaceable>COMMANDS</replaceable>
			
 
				+		</arg>
			
 
				+	</cmdsynopsis>
			
 
				+
			
 
				+</refsynopsisdiv>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>DESCRIPTION</title>
			
 
				+
			
 
				+	<para>This tool is part of the
			
 
				+	<citerefentry><refentrytitle>samba</refentrytitle>
			
 
				+	<manvolnum>1</manvolnum></citerefentry> suite.</para>
			
 
				+
			
 
				+	<para><command>ntdbtool</command> is a tool for displaying and
			
 
				+	altering the contents of Samba NTDB (New Trivial DataBase) files. Each
			
 
				+	of the commands listed below can be entered interactively or
			
 
				+	provided on the command line.</para>
			
 
				+
			
 
				+</refsect1>
			
 
				+
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>COMMANDS</title>
			
 
				+
			
 
				+	<variablelist>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>create</option>
			
 
				+		<replaceable>NTDBFILE</replaceable></term>
			
 
				+		<listitem><para>Create a new database named
			
 
				+		<replaceable>NTDBFILE</replaceable>.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>open</option>
			
 
				+		<replaceable>NTDBFILE</replaceable></term>
			
 
				+		<listitem><para>Open an existing database named
			
 
				+		<replaceable>NTDBFILE</replaceable>.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>erase</option></term>
			
 
				+		<listitem><para>Erase the current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>dump</option></term>
			
 
				+		<listitem><para>Dump the current database as strings.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>cdump</option></term>
			
 
				+		<listitem><para>Dump the current database as connection records.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>keys</option></term>
			
 
				+		<listitem><para>Dump the current database keys as strings.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>hexkeys</option></term>
			
 
				+		<listitem><para>Dump the current database keys as hex values.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>info</option></term>
			
 
				+		<listitem><para>Print summary information about the
			
 
				+		current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>insert</option>
			
 
				+		<replaceable>KEY</replaceable>
			
 
				+		<replaceable>DATA</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Insert a record into the
			
 
				+		current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>move</option>
			
 
				+		<replaceable>KEY</replaceable>
			
 
				+		<replaceable>NTDBFILE</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Move a record from the
			
 
				+		current database into <replaceable>NTDBFILE</replaceable>.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>store</option>
			
 
				+		<replaceable>KEY</replaceable>
			
 
				+		<replaceable>DATA</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Store (replace) a record in the
			
 
				+		current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>show</option>
			
 
				+		<replaceable>KEY</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Show a record by key.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>delete</option>
			
 
				+		<replaceable>KEY</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Delete a record by key.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>list</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Print the current database hash table and free list.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>free</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Print the current database and free list.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term><option>!</option>
			
 
				+		<replaceable>COMMAND</replaceable>
			
 
				+		</term>
			
 
				+		<listitem><para>Execute the given system command.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>
			
 
				+		<option>first</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Print the first record in the current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>
			
 
				+		<option>next</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Print the next record in the current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>
			
 
				+		<option>check</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Check the integrity of the current database.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>
			
 
				+		<option>repack</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Repack a database using a temporary file to remove fragmentation.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+		<varlistentry>
			
 
				+		<term>
			
 
				+		<option>quit</option>
			
 
				+		</term>
			
 
				+		<listitem><para>Exit <command>ntdbtool</command>.
			
 
				+		</para></listitem>
			
 
				+		</varlistentry>
			
 
				+
			
 
				+	</variablelist>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>SEE ALSO</title>
			
 
				+
			
 
				+    <para>
			
 
				+      tdbtool(8)
			
 
				+    </para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>CAVEATS</title>
			
 
				+	<para>The contents of the Samba NTDB files are private
			
 
				+	to the implementation and should not be altered with
			
 
				+	<command>ntdbtool</command>.
			
 
				+	</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>VERSION</title>
			
 
				+	<para>This man page is correct for version 4.0 of the Samba suite.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+<refsect1>
			
 
				+	<title>AUTHOR</title>
			
 
				+
			
 
				+	<para> The original Samba software and related utilities were
			
 
				+	created by Andrew Tridgell.  Samba is now developed by the
			
 
				+	Samba Team as an Open Source project similar to the way the
			
 
				+	Linux kernel is developed.</para>
			
 
				+</refsect1>
			
 
				+
			
 
				+</refentry>
			
--- a/ccan/ntdb/ntdb.c
+++ b/ccan/ntdb/ntdb.c
@@ -0,0 +1,601 @@
 
				+ /*
			
 
				+   Trivial Database 2: fetch, store and misc routines.
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#ifndef HAVE_LIBREPLACE
			
 
				+#include <stdarg.h>
			
 
				+#endif
			
 
				+
			
 
				+static enum NTDB_ERROR update_rec_hdr(struct ntdb_context *ntdb,
			
 
				+				     ntdb_off_t off,
			
 
				+				     ntdb_len_t keylen,
			
 
				+				     ntdb_len_t datalen,
			
 
				+				     struct ntdb_used_record *rec)
			
 
				+{
			
 
				+	uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	ecode = set_header(ntdb, rec, NTDB_USED_MAGIC, keylen, datalen,
			
 
				+			   keylen + dataroom);
			
 
				+	if (ecode == NTDB_SUCCESS) {
			
 
				+		ecode = ntdb_write_convert(ntdb, off, rec, sizeof(*rec));
			
 
				+	}
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR replace_data(struct ntdb_context *ntdb,
			
 
				+				   struct hash_info *h,
			
 
				+				   NTDB_DATA key, NTDB_DATA dbuf,
			
 
				+				   ntdb_off_t old_off, ntdb_len_t old_room,
			
 
				+				   bool growing)
			
 
				+{
			
 
				+	ntdb_off_t new_off;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	/* Allocate a new record. */
			
 
				+	new_off = alloc(ntdb, key.dsize, dbuf.dsize, NTDB_USED_MAGIC, growing);
			
 
				+	if (NTDB_OFF_IS_ERR(new_off)) {
			
 
				+		return NTDB_OFF_TO_ERR(new_off);
			
 
				+	}
			
 
				+
			
 
				+	/* We didn't like the existing one: remove it. */
			
 
				+	if (old_off) {
			
 
				+		ntdb->stats.frees++;
			
 
				+		ecode = add_free_record(ntdb, old_off,
			
 
				+					sizeof(struct ntdb_used_record)
			
 
				+					+ key.dsize + old_room,
			
 
				+					NTDB_LOCK_WAIT, true);
			
 
				+		if (ecode == NTDB_SUCCESS)
			
 
				+			ecode = replace_in_hash(ntdb, h, new_off);
			
 
				+	} else {
			
 
				+		ecode = add_to_hash(ntdb, h, new_off);
			
 
				+	}
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	new_off += sizeof(struct ntdb_used_record);
			
 
				+	ecode = ntdb->io->twrite(ntdb, new_off, key.dptr, key.dsize);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	new_off += key.dsize;
			
 
				+	ecode = ntdb->io->twrite(ntdb, new_off, dbuf.dptr, dbuf.dsize);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	if (ntdb->flags & NTDB_SEQNUM)
			
 
				+		ntdb_inc_seqnum(ntdb);
			
 
				+
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR update_data(struct ntdb_context *ntdb,
			
 
				+				  ntdb_off_t off,
			
 
				+				  NTDB_DATA dbuf,
			
 
				+				  ntdb_len_t extra)
			
 
				+{
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	ecode = ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize);
			
 
				+	if (ecode == NTDB_SUCCESS && extra) {
			
 
				+		/* Put a zero in; future versions may append other data. */
			
 
				+		ecode = ntdb->io->twrite(ntdb, off + dbuf.dsize, "", 1);
			
 
				+	}
			
 
				+	if (ntdb->flags & NTDB_SEQNUM)
			
 
				+		ntdb_inc_seqnum(ntdb);
			
 
				+
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_store(struct ntdb_context *ntdb,
			
 
				+			 NTDB_DATA key, NTDB_DATA dbuf, int flag)
			
 
				+{
			
 
				+	struct hash_info h;
			
 
				+	ntdb_off_t off;
			
 
				+	ntdb_len_t old_room = 0;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	}
			
 
				+
			
 
				+	/* Now we have lock on this hash bucket. */
			
 
				+	if (flag == NTDB_INSERT) {
			
 
				+		if (off) {
			
 
				+			ecode = NTDB_ERR_EXISTS;
			
 
				+			goto out;
			
 
				+		}
			
 
				+	} else {
			
 
				+		if (off) {
			
 
				+			old_room = rec_data_length(&rec)
			
 
				+				+ rec_extra_padding(&rec);
			
 
				+			if (old_room >= dbuf.dsize) {
			
 
				+				/* Can modify in-place.  Easy! */
			
 
				+				ecode = update_rec_hdr(ntdb, off,
			
 
				+						       key.dsize, dbuf.dsize,
			
 
				+						       &rec);
			
 
				+				if (ecode != NTDB_SUCCESS) {
			
 
				+					goto out;
			
 
				+				}
			
 
				+				ecode = update_data(ntdb,
			
 
				+						    off + sizeof(rec)
			
 
				+						    + key.dsize, dbuf,
			
 
				+						    old_room - dbuf.dsize);
			
 
				+				if (ecode != NTDB_SUCCESS) {
			
 
				+					goto out;
			
 
				+				}
			
 
				+				ntdb_unlock_hash(ntdb, h.h, F_WRLCK);
			
 
				+				return NTDB_SUCCESS;
			
 
				+			}
			
 
				+		} else {
			
 
				+			if (flag == NTDB_MODIFY) {
			
 
				+				/* if the record doesn't exist and we
			
 
				+				   are in NTDB_MODIFY mode then we should fail
			
 
				+				   the store */
			
 
				+				ecode = NTDB_ERR_NOEXIST;
			
 
				+				goto out;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* If we didn't use the old record, this implies we're growing. */
			
 
				+	ecode = replace_data(ntdb, &h, key, dbuf, off, old_room, off);
			
 
				+out:
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_WRLCK);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_append(struct ntdb_context *ntdb,
			
 
				+			  NTDB_DATA key, NTDB_DATA dbuf)
			
 
				+{
			
 
				+	struct hash_info h;
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	ntdb_len_t old_room = 0, old_dlen;
			
 
				+	unsigned char *newdata;
			
 
				+	NTDB_DATA new_dbuf;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	}
			
 
				+
			
 
				+	if (off) {
			
 
				+		old_dlen = rec_data_length(&rec);
			
 
				+		old_room = old_dlen + rec_extra_padding(&rec);
			
 
				+
			
 
				+		/* Fast path: can append in place. */
			
 
				+		if (rec_extra_padding(&rec) >= dbuf.dsize) {
			
 
				+			ecode = update_rec_hdr(ntdb, off, key.dsize,
			
 
				+					       old_dlen + dbuf.dsize, &rec);
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				goto out;
			
 
				+			}
			
 
				+
			
 
				+			off += sizeof(rec) + key.dsize + old_dlen;
			
 
				+			ecode = update_data(ntdb, off, dbuf,
			
 
				+					    rec_extra_padding(&rec));
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				+		/* Slow path. */
			
 
				+		newdata = ntdb->alloc_fn(ntdb, key.dsize + old_dlen + dbuf.dsize,
			
 
				+				     ntdb->alloc_data);
			
 
				+		if (!newdata) {
			
 
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+					   "ntdb_append:"
			
 
				+					   " failed to allocate %zu bytes",
			
 
				+					   (size_t)(key.dsize + old_dlen
			
 
				+						    + dbuf.dsize));
			
 
				+			goto out;
			
 
				+		}
			
 
				+		ecode = ntdb->io->tread(ntdb, off + sizeof(rec) + key.dsize,
			
 
				+				       newdata, old_dlen);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto out_free_newdata;
			
 
				+		}
			
 
				+		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
			
 
				+		new_dbuf.dptr = newdata;
			
 
				+		new_dbuf.dsize = old_dlen + dbuf.dsize;
			
 
				+	} else {
			
 
				+		newdata = NULL;
			
 
				+		new_dbuf = dbuf;
			
 
				+	}
			
 
				+
			
 
				+	/* If they're using ntdb_append(), it implies they're growing record. */
			
 
				+	ecode = replace_data(ntdb, &h, key, new_dbuf, off, old_room, true);
			
 
				+
			
 
				+out_free_newdata:
			
 
				+	ntdb->free_fn(newdata, ntdb->alloc_data);
			
 
				+out:
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_WRLCK);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_fetch(struct ntdb_context *ntdb, NTDB_DATA key,
			
 
				+				    NTDB_DATA *data)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	struct hash_info h;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	const char *keyp;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_RDLCK, &h, &rec, &keyp);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	}
			
 
				+
			
 
				+	if (!off) {
			
 
				+		ecode = NTDB_ERR_NOEXIST;
			
 
				+	} else {
			
 
				+		data->dsize = rec_data_length(&rec);
			
 
				+		data->dptr = ntdb->alloc_fn(ntdb, data->dsize, ntdb->alloc_data);
			
 
				+		if (unlikely(!data->dptr)) {
			
 
				+			ecode = NTDB_ERR_OOM;
			
 
				+		} else {
			
 
				+			memcpy(data->dptr, keyp + key.dsize, data->dsize);
			
 
				+			ecode = NTDB_SUCCESS;
			
 
				+		}
			
 
				+		ntdb_access_release(ntdb, keyp);
			
 
				+	}
			
 
				+
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_RDLCK);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ bool ntdb_exists(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	struct hash_info h;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_RDLCK, &h, &rec, NULL);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return false;
			
 
				+	}
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_RDLCK);
			
 
				+
			
 
				+	return off ? true : false;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_delete(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	struct hash_info h;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	}
			
 
				+
			
 
				+	if (!off) {
			
 
				+		ecode = NTDB_ERR_NOEXIST;
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	ecode = delete_from_hash(ntdb, &h);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	/* Free the deleted entry. */
			
 
				+	ntdb->stats.frees++;
			
 
				+	ecode = add_free_record(ntdb, off,
			
 
				+				sizeof(struct ntdb_used_record)
			
 
				+				+ rec_key_length(&rec)
			
 
				+				+ rec_data_length(&rec)
			
 
				+				+ rec_extra_padding(&rec),
			
 
				+				NTDB_LOCK_WAIT, true);
			
 
				+
			
 
				+	if (ntdb->flags & NTDB_SEQNUM)
			
 
				+		ntdb_inc_seqnum(ntdb);
			
 
				+
			
 
				+unlock:
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_WRLCK);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ unsigned int ntdb_get_flags(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb->flags;
			
 
				+}
			
 
				+
			
 
				+static bool inside_transaction(const struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb->transaction != NULL;
			
 
				+}
			
 
				+
			
 
				+static bool readonly_changable(struct ntdb_context *ntdb, const char *caller)
			
 
				+{
			
 
				+	if (inside_transaction(ntdb)) {
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			    "%s: can't change"
			
 
				+			    " NTDB_RDONLY inside transaction",
			
 
				+			    caller);
			
 
				+		return false;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_add_flag(struct ntdb_context *ntdb, unsigned flag)
			
 
				+{
			
 
				+	if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			    "ntdb_add_flag: internal db");
			
 
				+		return;
			
 
				+	}
			
 
				+	switch (flag) {
			
 
				+	case NTDB_NOLOCK:
			
 
				+		ntdb->flags |= NTDB_NOLOCK;
			
 
				+		break;
			
 
				+	case NTDB_NOMMAP:
			
 
				+		if (ntdb->file->direct_count) {
			
 
				+			ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+				    "ntdb_add_flag: Can't get NTDB_NOMMAP from"
			
 
				+				    " ntdb_parse_record!");
			
 
				+			return;
			
 
				+		}
			
 
				+		ntdb->flags |= NTDB_NOMMAP;
			
 
				+#ifndef HAVE_INCOHERENT_MMAP
			
 
				+		ntdb_munmap(ntdb);
			
 
				+#endif
			
 
				+		break;
			
 
				+	case NTDB_NOSYNC:
			
 
				+		ntdb->flags |= NTDB_NOSYNC;
			
 
				+		break;
			
 
				+	case NTDB_SEQNUM:
			
 
				+		ntdb->flags |= NTDB_SEQNUM;
			
 
				+		break;
			
 
				+	case NTDB_ALLOW_NESTING:
			
 
				+		ntdb->flags |= NTDB_ALLOW_NESTING;
			
 
				+		break;
			
 
				+	case NTDB_RDONLY:
			
 
				+		if (readonly_changable(ntdb, "ntdb_add_flag"))
			
 
				+			ntdb->flags |= NTDB_RDONLY;
			
 
				+		break;
			
 
				+	default:
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			    "ntdb_add_flag: Unknown flag %u", flag);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_remove_flag(struct ntdb_context *ntdb, unsigned flag)
			
 
				+{
			
 
				+	if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			    "ntdb_remove_flag: internal db");
			
 
				+		return;
			
 
				+	}
			
 
				+	switch (flag) {
			
 
				+	case NTDB_NOLOCK:
			
 
				+		ntdb->flags &= ~NTDB_NOLOCK;
			
 
				+		break;
			
 
				+	case NTDB_NOMMAP:
			
 
				+		ntdb->flags &= ~NTDB_NOMMAP;
			
 
				+#ifndef HAVE_INCOHERENT_MMAP
			
 
				+		/* If mmap incoherent, we were mmaping anyway. */
			
 
				+		ntdb_mmap(ntdb);
			
 
				+#endif
			
 
				+		break;
			
 
				+	case NTDB_NOSYNC:
			
 
				+		ntdb->flags &= ~NTDB_NOSYNC;
			
 
				+		break;
			
 
				+	case NTDB_SEQNUM:
			
 
				+		ntdb->flags &= ~NTDB_SEQNUM;
			
 
				+		break;
			
 
				+	case NTDB_ALLOW_NESTING:
			
 
				+		ntdb->flags &= ~NTDB_ALLOW_NESTING;
			
 
				+		break;
			
 
				+	case NTDB_RDONLY:
			
 
				+		if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY) {
			
 
				+			ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+				    "ntdb_remove_flag: can't"
			
 
				+				    " remove NTDB_RDONLY on ntdb"
			
 
				+				    " opened with O_RDONLY");
			
 
				+			break;
			
 
				+		}
			
 
				+		if (readonly_changable(ntdb, "ntdb_remove_flag"))
			
 
				+			ntdb->flags &= ~NTDB_RDONLY;
			
 
				+		break;
			
 
				+	default:
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			    "ntdb_remove_flag: Unknown flag %u",
			
 
				+			    flag);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ const char *ntdb_errorstr(enum NTDB_ERROR ecode)
			
 
				+{
			
 
				+	/* Gcc warns if you miss a case in the switch, so use that. */
			
 
				+	switch (NTDB_ERR_TO_OFF(ecode)) {
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_SUCCESS): return "Success";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT): return "Corrupt database";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_IO): return "IO Error";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK): return "Locking error";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_OOM): return "Out of memory";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS): return "Record exists";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL): return "Invalid parameter";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST): return "Record does not exist";
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_RDONLY): return "write not permitted";
			
 
				+	}
			
 
				+	return "Invalid error code";
			
 
				+}
			
 
				+
			
 
				+enum NTDB_ERROR COLD ntdb_logerr(struct ntdb_context *ntdb,
			
 
				+			       enum NTDB_ERROR ecode,
			
 
				+			       enum ntdb_log_level level,
			
 
				+			       const char *fmt, ...)
			
 
				+{
			
 
				+	char *message;
			
 
				+	va_list ap;
			
 
				+	size_t len;
			
 
				+	/* ntdb_open paths care about errno, so save it. */
			
 
				+	int saved_errno = errno;
			
 
				+
			
 
				+	if (!ntdb->log_fn)
			
 
				+		return ecode;
			
 
				+
			
 
				+	va_start(ap, fmt);
			
 
				+	len = vsnprintf(NULL, 0, fmt, ap);
			
 
				+	va_end(ap);
			
 
				+
			
 
				+	message = ntdb->alloc_fn(ntdb, len + 1, ntdb->alloc_data);
			
 
				+	if (!message) {
			
 
				+		ntdb->log_fn(ntdb, NTDB_LOG_ERROR, NTDB_ERR_OOM,
			
 
				+			    "out of memory formatting message:", ntdb->log_data);
			
 
				+		ntdb->log_fn(ntdb, level, ecode, fmt, ntdb->log_data);
			
 
				+	} else {
			
 
				+		va_start(ap, fmt);
			
 
				+		vsnprintf(message, len+1, fmt, ap);
			
 
				+		va_end(ap);
			
 
				+		ntdb->log_fn(ntdb, level, ecode, message, ntdb->log_data);
			
 
				+		ntdb->free_fn(message, ntdb->alloc_data);
			
 
				+	}
			
 
				+	errno = saved_errno;
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_parse_record_(struct ntdb_context *ntdb,
			
 
				+				 NTDB_DATA key,
			
 
				+				 enum NTDB_ERROR (*parse)(NTDB_DATA k,
			
 
				+							 NTDB_DATA d,
			
 
				+							 void *data),
			
 
				+				 void *data)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	struct hash_info h;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	const char *keyp;
			
 
				+
			
 
				+	off = find_and_lock(ntdb, key, F_RDLCK, &h, &rec, &keyp);
			
 
				+	if (NTDB_OFF_IS_ERR(off)) {
			
 
				+		return NTDB_OFF_TO_ERR(off);
			
 
				+	}
			
 
				+
			
 
				+	if (!off) {
			
 
				+		ecode = NTDB_ERR_NOEXIST;
			
 
				+	} else {
			
 
				+		unsigned int old_flags;
			
 
				+		NTDB_DATA d = ntdb_mkdata(keyp + key.dsize,
			
 
				+					  rec_data_length(&rec));
			
 
				+
			
 
				+		/*
			
 
				+		 * Make sure they don't try to write db, since they
			
 
				+		 * have read lock!  They can if they've done
			
 
				+		 * ntdb_lockall(): if it was ntdb_lockall_read, that'll
			
 
				+		 * stop them doing a write operation anyway.
			
 
				+		 */
			
 
				+		old_flags = ntdb->flags;
			
 
				+		if (!ntdb->file->allrecord_lock.count &&
			
 
				+		    !(ntdb->flags & NTDB_NOLOCK)) {
			
 
				+			ntdb->flags |= NTDB_RDONLY;
			
 
				+		}
			
 
				+		ecode = parse(key, d, data);
			
 
				+		ntdb->flags = old_flags;
			
 
				+		ntdb_access_release(ntdb, keyp);
			
 
				+	}
			
 
				+
			
 
				+	ntdb_unlock_hash(ntdb, h.h, F_RDLCK);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ const char *ntdb_name(const struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb->name;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ int64_t ntdb_get_seqnum(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
			
 
				+}
			
 
				+
			
 
				+
			
 
				+_PUBLIC_ int ntdb_fd(const struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb->file->fd;
			
 
				+}
			
 
				+
			
 
				+struct traverse_state {
			
 
				+	enum NTDB_ERROR error;
			
 
				+	struct ntdb_context *dest_db;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+  traverse function for repacking
			
 
				+ */
			
 
				+static int repack_traverse(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA data,
			
 
				+			   struct traverse_state *state)
			
 
				+{
			
 
				+	state->error = ntdb_store(state->dest_db, key, data, NTDB_INSERT);
			
 
				+	if (state->error != NTDB_SUCCESS) {
			
 
				+		return -1;
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_repack(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	struct ntdb_context *tmp_db;
			
 
				+	struct traverse_state state;
			
 
				+
			
 
				+	state.error = ntdb_transaction_start(ntdb);
			
 
				+	if (state.error != NTDB_SUCCESS) {
			
 
				+		return state.error;
			
 
				+	}
			
 
				+
			
 
				+	tmp_db = ntdb_open("tmpdb", NTDB_INTERNAL, O_RDWR|O_CREAT, 0, NULL);
			
 
				+	if (tmp_db == NULL) {
			
 
				+		state.error = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+					 __location__
			
 
				+					 " Failed to create tmp_db");
			
 
				+		ntdb_transaction_cancel(ntdb);
			
 
				+		return state.error;
			
 
				+	}
			
 
				+
			
 
				+	state.dest_db = tmp_db;
			
 
				+	if (ntdb_traverse(ntdb, repack_traverse, &state) < 0) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	state.error = ntdb_wipe_all(ntdb);
			
 
				+	if (state.error != NTDB_SUCCESS) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	state.dest_db = ntdb;
			
 
				+	if (ntdb_traverse(tmp_db, repack_traverse, &state) < 0) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	ntdb_close(tmp_db);
			
 
				+	return ntdb_transaction_commit(ntdb);
			
 
				+
			
 
				+fail:
			
 
				+	ntdb_transaction_cancel(ntdb);
			
 
				+	ntdb_close(tmp_db);
			
 
				+	return state.error;
			
 
				+}
			
--- a/ccan/ntdb/ntdb.h
+++ b/ccan/ntdb/ntdb.h
@@ -0,0 +1,947 @@
 
				+#ifndef CCAN_NTDB_H
			
 
				+#define CCAN_NTDB_H
			
 
				+
			
 
				+/*
			
 
				+   NTDB: trivial database library version 2
			
 
				+
			
 
				+   Copyright (C) Andrew Tridgell 1999-2004
			
 
				+   Copyright (C) Rusty Russell 2010-2012
			
 
				+
			
 
				+     ** NOTE! The following LGPL license applies to the ntdb
			
 
				+     ** library. This does NOT imply that all of Samba is released
			
 
				+     ** under the LGPL
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+
			
 
				+#ifdef  __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+#ifdef HAVE_LIBREPLACE
			
 
				+#include <replace.h>
			
 
				+#include <system/filesys.h>
			
 
				+#else
			
 
				+#if HAVE_FILE_OFFSET_BITS
			
 
				+#define _FILE_OFFSET_BITS 64
			
 
				+#endif
			
 
				+
			
 
				+#ifndef _PUBLIC_
			
 
				+#ifdef HAVE_VISIBILITY_ATTR
			
 
				+#define _PUBLIC_ __attribute__((visibility("default")))
			
 
				+#else
			
 
				+#define _PUBLIC_
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+/* For mode_t */
			
 
				+#include <sys/types.h>
			
 
				+/* For O_* flags. */
			
 
				+#include <sys/stat.h>
			
 
				+/* For sig_atomic_t. */
			
 
				+#include <signal.h>
			
 
				+/* For uint64_t */
			
 
				+#include <stdint.h>
			
 
				+/* For bool */
			
 
				+#include <stdbool.h>
			
 
				+/* For memcmp */
			
 
				+#include <string.h>
			
 
				+#endif
			
 
				+
			
 
				+#if HAVE_CCAN
			
 
				+#include <ccan/compiler/compiler.h>
			
 
				+#include <ccan/typesafe_cb/typesafe_cb.h>
			
 
				+#include <ccan/cast/cast.h>
			
 
				+#else
			
 
				+#ifndef typesafe_cb_preargs
			
 
				+/* Failing to have CCAN just means less type-safe protection, etc. */
			
 
				+#define typesafe_cb_preargs(rtype, atype, fn, arg, ...)	\
			
 
				+	((rtype (*)(__VA_ARGS__, atype))(fn))
			
 
				+#endif
			
 
				/*
				 * cast_const(type, expr) - cast away const, yielding a value of @type.
				 *
				 * Going through intptr_t (when available) avoids the warning some
				 * compilers give for dropping a qualifier.  The plain fallback must
				 * produce the same type as the intptr_t variant, i.e. (type), not
				 * (type *): callers pass the full pointer type as @type.
				 */
				#ifndef cast_const
				#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
				#define cast_const(type, expr) ((type)((intptr_t)(expr)))
				#else
				#define cast_const(type, expr) ((type)(expr))
				#endif
				#endif
			
 
				+#endif /* !HAVE_CCAN */
			
 
				+
			
 
				+union ntdb_attribute;
			
 
				+struct ntdb_context;
			
 
				+
			
 
				+/**
			
 
				+ * struct TDB_DATA - (n)tdb data blob
			
 
				+ *
			
 
				+ * To ease compatibility, we use 'struct TDB_DATA' from tdb.h, so if
			
 
				+ * you want to include both tdb.h and ntdb.h, you need to #include
			
 
				+ * tdb.h first.
			
 
				+ */
			
 
				#ifndef __TDB_H__
				/* Only defined here when tdb.h has not already provided it. */
				struct TDB_DATA {
					/* Start of the bytes. */
					unsigned char *dptr;
					/* Number of bytes at dptr. */
					size_t dsize;
				};
				#endif
			
 
				+
			
 
				+typedef struct TDB_DATA NTDB_DATA;
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_open - open a database file
			
 
				+ * @name: the file name (or database name if flags contains NTDB_INTERNAL)
			
 
				+ * @ntdb_flags: options for this database
			
 
				+ * @open_flags: flags argument for ntdb's open() call.
			
 
				+ * @mode: mode argument for ntdb's open() call.
			
 
				+ * @attributes: linked list of extra attributes for this ntdb.
			
 
				+ *
			
 
				+ * This call opens (and potentially creates) a database file.
			
 
				+ * Multiple processes can have the NTDB file open at once.
			
 
				+ *
			
 
				+ * On failure it will return NULL, and set errno: it may also call
			
 
				+ * any log attribute found in @attributes.
			
 
				+ *
			
 
				+ * See also:
			
 
				+ *	union ntdb_attribute
			
 
				+ */
			
 
				+struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
			
 
				+			       int open_flags, mode_t mode,
			
 
				+			       union ntdb_attribute *attributes);
			
 
				+
			
 
				+
			
 
				+/* flags for ntdb_open() */
			
 
				+#define NTDB_DEFAULT 0 /* just a readability place holder */
			
 
				+#define NTDB_INTERNAL 2 /* don't store on disk */
			
 
				+#define NTDB_NOLOCK   4 /* don't do any locking */
			
 
				+#define NTDB_NOMMAP   8 /* don't use mmap */
			
 
				+#define NTDB_CONVERT 16 /* convert endian */
			
 
				+#define NTDB_NOSYNC   64 /* don't use synchronous transactions */
			
 
				+#define NTDB_SEQNUM   128 /* maintain a sequence number */
			
 
				+#define NTDB_ALLOW_NESTING   256 /* fake nested transactions */
			
 
				+#define NTDB_RDONLY   512 /* implied by O_RDONLY */
			
 
				+#define NTDB_CANT_CHECK  2048 /* has a feature which we don't understand */
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_close - close and free a ntdb.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This always succeeds, in that @ntdb is unusable after this call.  But if
			
 
				+ * some unexpected error occurred while closing, it will return non-zero
			
 
				+ * (the only clue as to cause will be via the log attribute).
			
 
				+ */
			
 
				+int ntdb_close(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * enum NTDB_ERROR - error returns for NTDB
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_errorstr()
			
 
				+ */
			
 
				enum NTDB_ERROR {
					/* No error. */
					NTDB_SUCCESS = 0,
					/* We read the db, and it was bogus. */
					NTDB_ERR_CORRUPT = -1,
					/* We couldn't read/write the db. */
					NTDB_ERR_IO = -2,
					/* Locking failed. */
					NTDB_ERR_LOCK = -3,
					/* Out of Memory. */
					NTDB_ERR_OOM = -4,
					/* The key already exists. */
					NTDB_ERR_EXISTS = -5,
					/* The key does not exist. */
					NTDB_ERR_NOEXIST = -6,
					/* You're using it wrong. */
					NTDB_ERR_EINVAL = -7,
					/* The database is read-only. */
					NTDB_ERR_RDONLY = -8,
					/* Alias for the last (most negative) error code. */
					NTDB_ERR_LAST = NTDB_ERR_RDONLY
				};
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_store - store a key/value pair in a ntdb.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key
			
 
				+ * @dbuf: the data to associate with the key.
			
 
				+ * @flag: NTDB_REPLACE, NTDB_INSERT or NTDB_MODIFY.
			
 
				+ *
			
 
				+ * This inserts (or overwrites) a key/value pair in the NTDB.  If flag
			
 
				+ * is NTDB_REPLACE, it doesn't matter whether the key exists or not;
			
 
				+ * NTDB_INSERT means it must not exist (returns NTDB_ERR_EXISTS otherwise),
			
 
				+ * and NTDB_MODIFY means it must exist (returns NTDB_ERR_NOEXIST otherwise).
			
 
				+ *
			
 
				+ * On success, this returns NTDB_SUCCESS.
			
 
				+ *
			
 
				+ * See also:
			
 
				+ *	ntdb_fetch, ntdb_transaction_start, ntdb_append, ntdb_delete.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_store(struct ntdb_context *ntdb,
			
 
				+			   NTDB_DATA key,
			
 
				+			   NTDB_DATA dbuf,
			
 
				+			   int flag);
			
 
				+
			
 
				+/* flags to ntdb_store() */
			
 
				+#define NTDB_REPLACE 1		/* A readability place holder */
			
 
				+#define NTDB_INSERT 2 		/* Don't overwrite an existing entry */
			
 
				+#define NTDB_MODIFY 3		/* Don't create a new entry */
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_fetch - fetch a value from a ntdb.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key
			
 
				+ * @data: pointer to data.
			
 
				+ *
			
 
				+ * This looks up a key in the database and sets it in @data.
			
 
				+ *
			
 
				+ * If it returns NTDB_SUCCESS, the key was found: it is your
			
 
				+ * responsibility to call free() on @data->dptr.
			
 
				+ *
			
 
				+ * Otherwise, it returns an error (usually, NTDB_ERR_NOEXIST) and @data is
			
 
				+ * undefined.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_fetch(struct ntdb_context *ntdb, NTDB_DATA key,
			
 
				+			   NTDB_DATA *data);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_errorstr - map the ntdb error onto a constant readable string
			
 
				+ * @ecode: the enum NTDB_ERROR to map.
			
 
				+ *
			
 
				+ * This is useful for displaying errors to users.
			
 
				+ */
			
 
				+const char *ntdb_errorstr(enum NTDB_ERROR ecode);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_append - append a value to a key/value pair in a ntdb.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key
			
 
				+ * @dbuf: the data to append.
			
 
				+ *
			
 
				+ * This is equivalent to fetching a record, reallocating .dptr to add the
			
 
				+ * data, and writing it back, only it's much more efficient.  If the key
			
 
				+ * doesn't exist, it's equivalent to ntdb_store (with an additional hint that
			
 
				+ * you expect to expand the record in future).
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_fetch(), ntdb_store()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_append(struct ntdb_context *ntdb,
			
 
				+			    NTDB_DATA key, NTDB_DATA dbuf);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_delete - delete a key from a ntdb.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to delete.
			
 
				+ *
			
 
				+ * Returns NTDB_SUCCESS on success, or an error (usually NTDB_ERR_NOEXIST).
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_fetch(), ntdb_store()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_delete(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_exists - does a key exist in the database?
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to search for.
			
 
				+ *
			
 
				+ * Returns true if it exists, or false if it doesn't or on any other error.
			
 
				+ */
			
 
				+bool ntdb_exists(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_deq - are NTDB_DATA equal?
			
 
				+ * @a: one NTDB_DATA
			
 
				+ * @b: another NTDB_DATA
			
 
				+ */
			
 
				+static inline bool ntdb_deq(NTDB_DATA a, NTDB_DATA b)
			
 
				+{
			
 
				+	return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_mkdata - make a NTDB_DATA from const data
			
 
				+ * @p: the constant pointer
			
 
				+ * @len: the length
			
 
				+ *
			
 
				+ * As the dptr member of NTDB_DATA is not constant, you need to
			
 
				+ * cast it.  This function keeps those casts in one place, as well as
			
 
				+ * suppressing the warning some compilers give when casting away a
			
 
				+ * qualifier (eg. gcc with -Wcast-qual)
			
 
				+ */
			
 
				+static inline NTDB_DATA ntdb_mkdata(const void *p, size_t len)
			
 
				+{
			
 
				+	NTDB_DATA d;
			
 
				+	d.dptr = cast_const(void *, p);
			
 
				+	d.dsize = len;
			
 
				+	return d;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_transaction_start - start a transaction
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This begins a series of atomic operations.  Other processes will be able
			
 
				+ * to read the ntdb, but not alter it (they will block), nor will they see
			
 
				+ * any changes until ntdb_transaction_commit() is called.
			
 
				+ *
			
 
				+ * Note that if the NTDB_ALLOW_NESTING flag is set, a ntdb_transaction_start()
			
 
				+ * within a transaction will succeed, but it's not a real transaction:
			
 
				+ * (1) An inner transaction which is committed is not actually committed until
			
 
				+ *     the outer transaction is; if the outer transaction is cancelled, the
			
 
				+ *     inner ones are discarded.
			
 
				+ * (2) ntdb_transaction_cancel() marks the outer transaction as having an error,
			
 
				+ *     so the final ntdb_transaction_commit() will fail.
			
 
				+ * (3) the outer transaction will see the results of the inner transaction.
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_transaction_cancel, ntdb_transaction_commit.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_transaction_start(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_transaction_cancel - abandon a transaction
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This aborts a transaction, discarding any changes which were made.
			
 
				+ * ntdb_close() does this implicitly.
			
 
				+ */
			
 
				+void ntdb_transaction_cancel(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_transaction_commit - commit a transaction
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This completes a transaction, writing any changes which were made.
			
 
				+ *
			
 
				+ * fsync() is used to commit the transaction (unless NTDB_NOSYNC is set),
			
 
				+ * making it robust against machine crashes, but very slow compared to
			
 
				+ * other NTDB operations.
			
 
				+ *
			
 
				+ * A failure can only be caused by unexpected errors (eg. I/O or
			
 
				+ * memory); there is no point looping on transaction failure.
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_transaction_prepare_commit()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_transaction_commit(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_transaction_prepare_commit - prepare to commit a transaction
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This ensures we have the resources to commit a transaction (using
			
 
				+ * ntdb_transaction_commit): if this succeeds then a transaction will only
			
 
				+ * fail if the write() or fsync() calls fail.
			
 
				+ *
			
 
				+ * If this fails you must still call ntdb_transaction_cancel() to cancel
			
 
				+ * the transaction.
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_transaction_commit()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_transaction_prepare_commit(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_traverse - traverse a NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @fn: the function to call for every key/value pair (or NULL)
			
 
				+ * @p: the pointer to hand to @fn
			
 
				+ *
			
 
				+ * This walks the NTDB until all the keys have been traversed, or @fn
			
 
				+ * returns non-zero.  If the traverse function or other processes are
			
 
				+ * changing data or adding or deleting keys, the traverse may be
			
 
				+ * unreliable: keys may be skipped or (rarely) visited twice.
			
 
				+ *
			
 
				+ * There is one specific exception: the special case of deleting the
			
 
				+ * current key does not undermine the reliability of the traversal.
			
 
				+ *
			
 
				+ * On success, returns the number of keys iterated.  On error returns
			
 
				+ * a negative enum NTDB_ERROR value.
			
 
				+ */
			
 
				+#define ntdb_traverse(ntdb, fn, p)					\
			
 
				+	ntdb_traverse_(ntdb, typesafe_cb_preargs(int, void *, (fn), (p), \
			
 
				+						 struct ntdb_context *,	\
			
 
				+						 NTDB_DATA, NTDB_DATA), (p))
			
 
				+
			
 
				+int64_t ntdb_traverse_(struct ntdb_context *ntdb,
			
 
				+		       int (*fn)(struct ntdb_context *,
			
 
				+				 NTDB_DATA, NTDB_DATA, void *), void *p);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_parse_record - operate directly on data in the database.
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key whose record we should hand to @parse
			
 
				+ * @parse: the function to call for the data
			
 
				+ * @data: the private pointer to hand to @parse (types must match).
			
 
				+ *
			
 
				+ * This avoids a copy for many cases, by handing you a pointer into
			
 
				+ * the memory-mapped database.  It also locks the record to prevent
			
 
				+ * other accesses at the same time, so it won't change.
			
 
				+ *
			
 
				+ * Within the @parse callback you can perform read operations on the
			
 
				+ * database, but no write operations: no ntdb_store() or
			
 
				+ * ntdb_delete(), for example.  The exception is if you call
			
 
				+ * ntdb_lockall() before ntdb_parse_record().
			
 
				+ *
			
 
				+ * Never alter the data handed to parse()!
			
 
				+ */
			
 
				+#define ntdb_parse_record(ntdb, key, parse, data)			\
			
 
				+	ntdb_parse_record_((ntdb), (key),				\
			
 
				+			   typesafe_cb_preargs(enum NTDB_ERROR, void *,	\
			
 
				+					       (parse), (data),		\
			
 
				+					       NTDB_DATA, NTDB_DATA), (data))
			
 
				+
			
 
				+enum NTDB_ERROR ntdb_parse_record_(struct ntdb_context *ntdb,
			
 
				+				   NTDB_DATA key,
			
 
				+				   enum NTDB_ERROR (*parse)(NTDB_DATA k,
			
 
				+							    NTDB_DATA d,
			
 
				+							    void *data),
			
 
				+				   void *data);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_get_seqnum - get a database sequence number
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This returns a sequence number: any change to the database from a
			
 
				+ * ntdb context opened with the NTDB_SEQNUM flag will cause that number
			
 
				+ * to increment.  Note that the incrementing is unreliable (it is done
			
 
				+ * without locking), so this is only useful as an optimization.
			
 
				+ *
			
 
				+ * For example, you may have a regular database backup routine which
			
 
				+ * does not operate if the sequence number is unchanged.  In the
			
 
				+ * unlikely event of a failed increment, it will be backed up next
			
 
				+ * time anyway.
			
 
				+ *
			
 
				+ * Returns an enum NTDB_ERROR (ie. negative) on error.
			
 
				+ */
			
 
				+int64_t ntdb_get_seqnum(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_firstkey - get the "first" key in a NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: pointer to key.
			
 
				+ *
			
 
				+ * This returns an arbitrary key in the database; with ntdb_nextkey() it allows
			
 
				+ * open-coded traversal of the database, though it is slightly less efficient
			
 
				+ * than ntdb_traverse.
			
 
				+ *
			
 
				+ * It is your responsibility to free @key->dptr on success.
			
 
				+ *
			
 
				+ * Returns NTDB_ERR_NOEXIST if the database is empty.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_firstkey(struct ntdb_context *ntdb, NTDB_DATA *key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_nextkey - get the "next" key in a NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: a key returned by ntdb_firstkey() or ntdb_nextkey().
			
 
				+ *
			
 
				+ * This returns another key in the database; it will free @key->dptr for
			
 
				+ * your convenience.
			
 
				+ *
			
 
				+ * Returns NTDB_ERR_NOEXIST if there are no more keys.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_nextkey(struct ntdb_context *ntdb, NTDB_DATA *key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_chainlock - lock a record in the NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to lock.
			
 
				+ *
			
 
				+ * This prevents any access occurring to a group of keys including @key,
			
 
				+ * even if @key does not exist.  This allows primitive atomic updates of
			
 
				+ * records without using transactions.
			
 
				+ *
			
 
				+ * You cannot begin a transaction while holding a ntdb_chainlock(), nor can
			
 
				+ * you do any operations on any other keys in the database.  This also means
			
 
				+ * that you cannot hold more than one ntdb_chainlock() at a time.
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_chainunlock()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_chainunlock - unlock a record in the NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to unlock.
			
 
				+ *
			
 
				+ * The key must have previously been locked by ntdb_chainlock().
			
 
				+ */
			
 
				+void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_chainlock_read - lock a record in the NTDB, for reading
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to lock.
			
 
				+ *
			
 
				+ * This prevents any changes from occurring to a group of keys including @key,
			
 
				+ * even if @key does not exist.  This allows primitive atomic updates of
			
 
				+ * records without using transactions.
			
 
				+ *
			
 
				+ * You cannot begin a transaction while holding a ntdb_chainlock_read(), nor can
			
 
				+ * you do any operations on any other keys in the database.  This also means
			
 
				+ * that you cannot hold more than one ntdb_chainlock()/read() at a time.
			
 
				+ *
			
 
				+ * See Also:
			
 
				+ *	ntdb_chainlock()
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_chainunlock_read - unlock a record in the NTDB for reading
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @key: the key to unlock.
			
 
				+ *
			
 
				+ * The key must have previously been locked by ntdb_chainlock_read().
			
 
				+ */
			
 
				+void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_lockall - lock the entire NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * You cannot hold a ntdb_chainlock while calling this.  It nests, so you
			
 
				+ * must call ntdb_unlockall as many times as you call ntdb_lockall.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_unlockall - unlock the entire NTDB
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ */
			
 
				+void ntdb_unlockall(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_lockall_read - lock the entire NTDB for reading
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This prevents others writing to the database, eg. ntdb_delete, ntdb_store,
			
 
				+ * ntdb_append, but not ntdb_fetch.
			
 
				+ *
			
 
				+ * You cannot hold a ntdb_chainlock while calling this.  It nests, so you
			
 
				+ * must call ntdb_unlockall_read as many times as you call ntdb_lockall_read.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_unlockall_read - unlock the entire NTDB for reading
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ */
			
 
				+void ntdb_unlockall_read(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_wipe_all - wipe the database clean
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * Completely erase the database.  This is faster than iterating through
			
 
				+ * each key and doing ntdb_delete.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_wipe_all(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_repack - repack the database
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This repacks the database; if it is suffering from a great deal of
			
 
				+ * fragmentation this might help.  However, it can take twice the
			
 
				+ * memory of the existing NTDB.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_repack(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_check - check a NTDB for consistency
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @check: function to check each key/data pair (or NULL)
			
 
				+ * @data: argument for @check, must match type.
			
 
				+ *
			
 
				+ * This performs a consistency check of the open database, optionally calling
			
 
				+ * a check() function on each record so you can do your own data consistency
			
 
				+ * checks as well.  If check() returns an error, that is returned from
			
 
				+ * ntdb_check().
			
 
				+ *
			
 
				+ * Note that if the NTDB uses a feature which we don't understand which
			
 
				+ * indicates we can't run ntdb_check(), this will log a warning to that
			
 
				+ * effect and return NTDB_SUCCESS.  You can detect this condition by
			
 
				+ * looking for NTDB_CANT_CHECK in ntdb_get_flags().
			
 
				+ *
			
 
				+ * Returns NTDB_SUCCESS or an error.
			
 
				+ */
			
 
				+#define ntdb_check(ntdb, check, data)					\
			
 
				+	ntdb_check_((ntdb), typesafe_cb_preargs(enum NTDB_ERROR, void *, \
			
 
				+						(check), (data),	\
			
 
				+						NTDB_DATA,		\
			
 
				+						NTDB_DATA),		\
			
 
				+		    (data))
			
 
				+
			
 
				+enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
			
 
				+			    enum NTDB_ERROR (*check)(NTDB_DATA k,
			
 
				+						     NTDB_DATA d,
			
 
				+						     void *data),
			
 
				+			    void *data);
			
 
				+
			
 
				+/**
			
 
				+ * enum ntdb_summary_flags - flags for ntdb_summary.
			
 
				+ */
			
 
				+enum ntdb_summary_flags {
			
 
				+	NTDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_summary - return a string describing the NTDB state
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @flags: flags to control the summary output.
			
 
				+ * @summary: pointer to string to allocate.
			
 
				+ *
			
 
				+ * This returns a developer-readable string describing the overall
			
 
				+ * state of the ntdb, such as the percentage used and sizes of records.
			
 
				+ * It is designed to provide information about the ntdb at a glance
			
 
				+ * without displaying any keys or data in the database.
			
 
				+ *
			
 
				+ * On success, sets @summary to point to a malloc()'ed nul-terminated
			
 
				+ * multi-line string.  It is your responsibility to free() it.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_summary(struct ntdb_context *ntdb,
			
 
				+			     enum ntdb_summary_flags flags,
			
 
				+			     char **summary);
			
 
				+
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_get_flags - return the flags for a ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This returns the flags on the current ntdb.  Some of these are caused by
			
 
				+ * the flags argument to ntdb_open(), others (such as NTDB_CONVERT) are
			
 
				+ * intuited.
			
 
				+ */
			
 
				+unsigned int ntdb_get_flags(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_add_flag - set a flag for a ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @flag: one of NTDB_NOLOCK, NTDB_NOMMAP, NTDB_NOSYNC or NTDB_ALLOW_NESTING.
			
 
				+ *
			
 
				+ * You can use this to set a flag on the NTDB.  You cannot set these flags
			
 
				+ * on a NTDB_INTERNAL ntdb.
			
 
				+ */
			
 
				+void ntdb_add_flag(struct ntdb_context *ntdb, unsigned flag);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_remove_flag - unset a flag for a ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @flag: one of NTDB_NOLOCK, NTDB_NOMMAP, NTDB_NOSYNC or NTDB_ALLOW_NESTING.
			
 
				+ *
			
 
				+ * You can use this to clear a flag on the NTDB.  You cannot clear flags
			
 
				+ * on a NTDB_INTERNAL ntdb.
			
 
				+ */
			
 
				+void ntdb_remove_flag(struct ntdb_context *ntdb, unsigned flag);
			
 
				+
			
 
				+/**
			
 
				+ * enum ntdb_attribute_type - discriminator for union ntdb_attribute.
			
 
				+ */
			
 
				+enum ntdb_attribute_type {
			
 
				+	NTDB_ATTRIBUTE_LOG = 0,
			
 
				+	NTDB_ATTRIBUTE_HASH = 1,
			
 
				+	NTDB_ATTRIBUTE_SEED = 2,
			
 
				+	NTDB_ATTRIBUTE_STATS = 3,
			
 
				+	NTDB_ATTRIBUTE_OPENHOOK = 4,
			
 
				+	NTDB_ATTRIBUTE_FLOCK = 5,
			
 
				+	NTDB_ATTRIBUTE_ALLOCATOR = 6,
			
 
				+	NTDB_ATTRIBUTE_HASHSIZE = 7
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_get_attribute - get an attribute for an existing ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @attr: the union ntdb_attribute to fill in.
			
 
				+ *
			
 
				+ * This gets an attribute from a NTDB which has previously been set (or
			
 
				+ * may return the default values).  Set @attr.base.attr to the
			
 
				+ * attribute type you want to get.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
			
 
				+				   union ntdb_attribute *attr);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_set_attribute - set an attribute for an existing ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @attr: the union ntdb_attribute to set.
			
 
				+ *
			
 
				+ * This sets an attribute on a NTDB, overriding any previous attribute
			
 
				+ * of the same type.  It returns NTDB_ERR_EINVAL if the attribute is
			
 
				+ * unknown or invalid.
			
 
				+ *
			
 
				+ * Note that NTDB_ATTRIBUTE_HASH, NTDB_ATTRIBUTE_SEED, and
			
 
				+ * NTDB_ATTRIBUTE_OPENHOOK cannot currently be set after ntdb_open.
			
 
				+ */
			
 
				+enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
			
 
				+				   const union ntdb_attribute *attr);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_unset_attribute - reset an attribute for an existing ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ * @type: the attribute type to unset.
			
 
				+ *
			
 
				+ * This unsets an attribute on a NTDB, returning it to the defaults
			
 
				+ * (where applicable).
			
 
				+ *
			
 
				+ * Note that it only makes sense for NTDB_ATTRIBUTE_LOG and NTDB_ATTRIBUTE_FLOCK
			
 
				+ * to be unset.
			
 
				+ */
			
 
				+void ntdb_unset_attribute(struct ntdb_context *ntdb,
			
 
				+			  enum ntdb_attribute_type type);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_name - get the name of a ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This returns a copy of the name string, made at ntdb_open() time.
			
 
				+ *
			
 
				+ * This is mostly useful for logging.
			
 
				+ */
			
 
				+const char *ntdb_name(const struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_fd - get the file descriptor of a ntdb
			
 
				+ * @ntdb: the ntdb context returned from ntdb_open()
			
 
				+ *
			
 
				+ * This returns the file descriptor for the underlying database file, or -1
			
 
				+ * for NTDB_INTERNAL.
			
 
				+ */
			
 
				+int ntdb_fd(const struct ntdb_context *ntdb);
			
 
				+
			
 
				+/**
			
 
				+ * ntdb_foreach - iterate through every open NTDB.
			
 
				+ * @fn: the function to call for every NTDB
			
 
				+ * @p: the pointer to hand to @fn
			
 
				+ *
			
 
				+ * NTDB internally keeps track of all open TDBs; this function allows you to
			
 
				+ * iterate through them.  If @fn returns non-zero, traversal stops.
			
 
				+ */
			
 
				+#define ntdb_foreach(fn, p)						\
			
 
				+	ntdb_foreach_(typesafe_cb_preargs(int, void *, (fn), (p),	\
			
 
				+					  struct ntdb_context *), (p))
			
 
				+
			
 
				+void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p);
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_base - common fields for all ntdb attributes.
			
 
				+ */
			
 
				+struct ntdb_attribute_base {
			
 
				+	enum ntdb_attribute_type attr;
			
 
				+	union ntdb_attribute *next;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * enum ntdb_log_level - log levels for ntdb_attribute_log
			
 
				+ * @NTDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
			
 
				+ *		   or internal consistency failures.
			
 
				+ * @NTDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
			
 
				+ *		   or writing to a read-only database.
			
 
				+ * @NTDB_LOG_WARNING: used for informational messages on issues which
			
 
				+ *		     are unusual but handled by NTDB internally, such
			
 
				+ *		     as a failure to mmap or failure to open /dev/urandom.
			
 
				+ *		     It's also used when ntdb_open() fails without O_CREAT
			
 
				+ *		     because a file does not exist.
			
 
				+ */
			
 
				+enum ntdb_log_level {
			
 
				+	NTDB_LOG_ERROR,
			
 
				+	NTDB_LOG_USE_ERROR,
			
 
				+	NTDB_LOG_WARNING
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_log - log function attribute
			
 
				+ *
			
 
				+ * This attribute provides a hook for you to log errors.
			
 
				+ */
			
 
				+struct ntdb_attribute_log {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_LOG */
			
 
				+	void (*fn)(struct ntdb_context *ntdb,
			
 
				+		   enum ntdb_log_level level,
			
 
				+		   enum NTDB_ERROR ecode,
			
 
				+		   const char *message,
			
 
				+		   void *data);
			
 
				+	void *data;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_hash - hash function attribute
			
 
				+ *
			
 
				+ * This attribute allows you to provide an alternative hash function.
			
 
				+ * This hash function will be handed keys from the database; it will also
			
 
				+ * be handed the 8-byte NTDB_HASH_MAGIC value for checking the header (the
			
 
				+ * ntdb_open() will fail if the hash value doesn't match the header).
			
 
				+ *
			
 
				+ * Note that if your hash function gives different results on
			
 
				+ * different machine endians, your ntdb will no longer work across
			
 
				+ * different architectures!
			
 
				+ */
			
 
				+struct ntdb_attribute_hash {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_HASH */
			
 
				+	uint32_t (*fn)(const void *key, size_t len, uint32_t seed,
			
 
				+		       void *data);
			
 
				+	void *data;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_seed - hash function seed attribute
			
 
				+ *
			
 
				+ * The hash function seed is normally taken from /dev/urandom (or equivalent)
			
 
				+ * but can be set manually here.  This is mainly for testing purposes.
			
 
				+ */
			
 
				+struct ntdb_attribute_seed {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_SEED */
			
 
				+	uint64_t seed;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_stats - ntdb operational statistics
			
 
				+ *
			
 
				+ * This attribute records statistics of various low-level NTDB operations.
			
 
				+ * This can be used to assist performance evaluation.  This is only
			
 
				+ * useful for ntdb_get_attribute().
			
 
				+ *
			
 
				+ * New fields will be added at the end, hence the "size" argument which
			
 
				+ * indicates how large your structure is: it must be filled in before
			
 
				+ * calling ntdb_get_attribute(), which will overwrite it with the size
			
 
				+ * ntdb knows about.
			
 
				+ */
			
 
				+struct ntdb_attribute_stats {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_STATS */
			
 
				+	size_t size; /* = sizeof(struct ntdb_attribute_stats) */
			
 
				+	uint64_t allocs;
			
 
				+	uint64_t   alloc_subhash;
			
 
				+	uint64_t   alloc_chain;
			
 
				+	uint64_t   alloc_bucket_exact;
			
 
				+	uint64_t   alloc_bucket_max;
			
 
				+	uint64_t   alloc_leftover;
			
 
				+	uint64_t   alloc_coalesce_tried;
			
 
				+	uint64_t     alloc_coalesce_iterate_clash;
			
 
				+	uint64_t     alloc_coalesce_lockfail;
			
 
				+	uint64_t     alloc_coalesce_race;
			
 
				+	uint64_t     alloc_coalesce_succeeded;
			
 
				+	uint64_t       alloc_coalesce_num_merged;
			
 
				+	uint64_t compares;
			
 
				+	uint64_t   compare_wrong_offsetbits;
			
 
				+	uint64_t   compare_wrong_keylen;
			
 
				+	uint64_t   compare_wrong_rechash;
			
 
				+	uint64_t   compare_wrong_keycmp;
			
 
				+	uint64_t transactions;
			
 
				+	uint64_t   transaction_cancel;
			
 
				+	uint64_t   transaction_nest;
			
 
				+	uint64_t   transaction_expand_file;
			
 
				+	uint64_t   transaction_read_direct;
			
 
				+	uint64_t      transaction_read_direct_fail;
			
 
				+	uint64_t   transaction_write_direct;
			
 
				+	uint64_t      transaction_write_direct_fail;
			
 
				+	uint64_t traverses;
			
 
				+	uint64_t	traverse_val_vanished;
			
 
				+	uint64_t expands;
			
 
				+	uint64_t frees;
			
 
				+	uint64_t locks;
			
 
				+	uint64_t   lock_lowlevel;
			
 
				+	uint64_t   lock_nonblock;
			
 
				+	uint64_t     lock_nonblock_fail;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_openhook - ntdb special effects hook for open
			
 
				+ *
			
 
				+ * This attribute contains a function to call once we have the OPEN_LOCK
			
 
				+ * for the ntdb, but before we've examined its contents.  If this succeeds,
			
 
				+ * the ntdb will be populated if it's then zero-length.
			
 
				+ *
			
 
				+ * This is a hack to allow support for TDB-style TDB_CLEAR_IF_FIRST
			
 
				+ * behaviour.
			
 
				+ */
			
 
				+struct ntdb_attribute_openhook {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_OPENHOOK */
			
 
				+	enum NTDB_ERROR (*fn)(int fd, void *data);
			
 
				+	void *data;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_flock - ntdb special effects hook for file locking
			
 
				+ *
			
 
				+ * This attribute contains functions to call to place locks on a file; it can
			
 
				+ * be used to support non-blocking operations or lock proxying.
			
 
				+ *
			
 
				+ * They should return 0 on success, -1 on failure and set errno.
			
 
				+ *
			
 
				+ * An error will be logged on error if errno is neither EAGAIN nor EINTR
			
 
				+ * (normally it would only return EAGAIN if waitflag is false, and
			
 
				+ * loop internally on EINTR).
			
 
				+ */
			
 
				+struct ntdb_attribute_flock {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_FLOCK */
			
 
				+	int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
			
 
				+	int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
			
 
				+	void *data;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_hashsize - ntdb hashsize setting.
			
 
				+ *
			
 
				+ * This attribute is only settable on ntdb_open; it indicates that we create
			
 
				+ * a hashtable of the given size, rather than the default.
			
 
				+ */
			
 
				+struct ntdb_attribute_hashsize {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_HASHSIZE */
			
 
				+	uint32_t size;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct ntdb_attribute_allocator - allocator for ntdb to use.
			
 
				+ *
			
 
				+ * You can replace malloc/free with your own allocation functions.
			
 
				+ * The allocator takes an "owner" pointer, which is either NULL (for
			
 
				+ * the initial struct ntdb_context and struct ntdb_file), or a
			
 
				+ * previously allocated pointer.  This is useful for relationship
			
 
				+ * tracking, such as the talloc library.
			
 
				+ *
			
 
				+ * The expand function is realloc, but only ever used to expand an
			
 
				+ * existing allocation.
			
 
				+ *
			
 
				+ * Be careful mixing allocators: two ntdb_contexts which have the same file
			
 
				+ * open will share the same struct ntdb_file.  This may be allocated by one
			
 
				+ * ntdb's allocator, and freed by the other.
			
 
				+ */
			
 
				+struct ntdb_attribute_allocator {
			
 
				+	struct ntdb_attribute_base base; /* .attr = NTDB_ATTRIBUTE_ALLOCATOR */
			
 
				+	void *(*alloc)(const void *owner, size_t len, void *priv_data);
			
 
				+	void *(*expand)(void *old, size_t newlen, void *priv_data);
			
 
				+	void (*free)(void *old, void *priv_data);
			
 
				+	void *priv_data;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * union ntdb_attribute - ntdb attributes.
			
 
				+ *
			
 
				+ * This represents all the known attributes.
			
 
				+ *
			
 
				+ * See also:
			
 
				+ *	struct ntdb_attribute_log, struct ntdb_attribute_hash,
			
 
				+ *	struct ntdb_attribute_seed, struct ntdb_attribute_stats,
			
 
				+ *	struct ntdb_attribute_openhook, struct ntdb_attribute_flock,
			
 
				+ *	struct ntdb_attribute_allocator, struct ntdb_attribute_hashsize.
			
 
				+ */
			
 
				+union ntdb_attribute {
			
 
				+	struct ntdb_attribute_base base;
			
 
				+	struct ntdb_attribute_log log;
			
 
				+	struct ntdb_attribute_hash hash;
			
 
				+	struct ntdb_attribute_seed seed;
			
 
				+	struct ntdb_attribute_stats stats;
			
 
				+	struct ntdb_attribute_openhook openhook;
			
 
				+	struct ntdb_attribute_flock flock;
			
 
				+	struct ntdb_attribute_allocator alloc;
			
 
				+	struct ntdb_attribute_hashsize hashsize;
			
 
				+};
			
 
				+
			
 
				+#ifdef  __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* ntdb.h */
			
--- a/ccan/ntdb/ntdb.pc.in
+++ b/ccan/ntdb/ntdb.pc.in
@@ -0,0 +1,11 @@
 
				+prefix=@prefix@
			
 
				+exec_prefix=@exec_prefix@
			
 
				+libdir=@libdir@
			
 
				+includedir=@includedir@
			
 
				+
			
 
				+Name: ntdb
			
 
				+Description: A (not-so) trivial database
			
 
				+Version: @PACKAGE_VERSION@
			
 
				+Libs: @LIB_RPATH@ -L${libdir} -lntdb
			
 
				+Cflags: -I${includedir}
			
 
				+URL: http://tdb.samba.org/
			
--- a/ccan/ntdb/open.c
+++ b/ccan/ntdb/open.c
@@ -0,0 +1,911 @@
 
				+ /*
			
 
				+   Trivial Database 2: opening and closing TDBs
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/build_assert/build_assert.h>
			
 
				+
			
 
				+/* all open ntdbs, to detect double-opens (fcntl file locks don't nest!) */
			
 
				+static struct ntdb_context *tdbs = NULL;
			
 
				+
			
 
				+static struct ntdb_file *find_file(dev_t device, ino_t ino)
			
 
				+{
			
 
				+	struct ntdb_context *i;
			
 
				+
			
 
				+	for (i = tdbs; i; i = i->next) {
			
 
				+		if (i->file->device == device && i->file->inode == ino) {
			
 
				+			i->file->refcnt++;
			
 
				+			return i->file;
			
 
				+		}
			
 
				+	}
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+static bool read_all(int fd, void *buf, size_t len)
			
 
				+{
			
 
				+	while (len) {
			
 
				+		ssize_t ret;
			
 
				+		ret = read(fd, buf, len);
			
 
				+		if (ret < 0)
			
 
				+			return false;
			
 
				+		if (ret == 0) {
			
 
				+			/* ETOOSHORT? */
			
 
				+			errno = EWOULDBLOCK;
			
 
				+			return false;
			
 
				+		}
			
 
				+		buf = (char *)buf + ret;
			
 
				+		len -= ret;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+static uint32_t random_number(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int fd;
			
 
				+	uint32_t ret = 0;
			
 
				+	struct timeval now;
			
 
				+
			
 
				+	fd = open("/dev/urandom", O_RDONLY);
			
 
				+	if (fd >= 0) {
			
 
				+		if (read_all(fd, &ret, sizeof(ret))) {
			
 
				+			close(fd);
			
 
				+			return ret;
			
 
				+		}
			
 
				+		close(fd);
			
 
				+	}
			
 
				+	/* FIXME: Untested!  Based on Wikipedia protocol description! */
			
 
				+	fd = open("/dev/egd-pool", O_RDWR);
			
 
				+	if (fd >= 0) {
			
 
				+		/* Command is 1, next byte is size we want to read. */
			
 
				+		char cmd[2] = { 1, sizeof(uint32_t) };
			
 
				+		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
			
 
				+			char reply[1 + sizeof(uint32_t)];
			
 
				+			int r = read(fd, reply, sizeof(reply));
			
 
				+			if (r > 1) {
			
 
				+				/* Copy at least some bytes. */
			
 
				+				memcpy(&ret, reply+1, r - 1);
			
 
				+				if (reply[0] == sizeof(uint32_t)
			
 
				+				    && r == sizeof(reply)) {
			
 
				+					close(fd);
			
 
				+					return ret;
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		close(fd);
			
 
				+	}
			
 
				+
			
 
				+	/* Fallback: pid and time. */
			
 
				+	gettimeofday(&now, NULL);
			
 
				+	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
			
 
				+	ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
			
 
				+		   "ntdb_open: random from getpid and time");
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
/*
 * Reset the per-context runtime state: set up the I/O layer via
 * ntdb_io_init() and clear the transaction and access pointers.
 */
static void ntdb_context_init(struct ntdb_context *ntdb)
{
	/* Initialize the NTDB fields here */
	ntdb_io_init(ntdb);
	/* No transaction in progress and no outstanding accesses yet. */
	ntdb->transaction = NULL;
	ntdb->access = NULL;
}
			
 
				+
			
 
				+/* initialise a new database:
			
 
				+ *
			
 
				+ *	struct ntdb_header;
			
 
				+ *	struct {
			
 
				+ *		struct ntdb_used_record hash_header;
			
 
				+ *		ntdb_off_t hash_buckets[1 << ntdb->hash_bits];
			
 
				+ *	} hash;
			
 
				+ *	struct ntdb_freetable ftable;
			
 
				+ *	struct {
			
 
				+ *		struct ntdb_free_record free_header;
			
 
				+ *		char forty_three[...];
			
 
				+ *	} remainder;
			
 
				+ */
			
 
				+#define NEW_DATABASE_HDR_SIZE(hbits)					\
			
 
				+	(sizeof(struct ntdb_header)					\
			
 
				+	 + sizeof(struct ntdb_used_record) + (sizeof(ntdb_off_t) << hbits) \
			
 
				+	 + sizeof(struct ntdb_freetable)				\
			
 
				+	 + sizeof(struct ntdb_free_record))
			
 
				+
			
 
				+static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
			
 
				+					 struct ntdb_attribute_seed *seed,
			
 
				+					 struct ntdb_header *rhdr)
			
 
				+{
			
 
				+	/* We make it up in memory, then write it out if not internal */
			
 
				+	struct ntdb_freetable *ftable;
			
 
				+	struct ntdb_used_record *htable;
			
 
				+	struct ntdb_header *hdr;
			
 
				+	struct ntdb_free_record *remainder;
			
 
				+	char *mem;
			
 
				+	unsigned int magic_len;
			
 
				+	ssize_t rlen;
			
 
				+	size_t dbsize, hashsize, hdrsize, remaindersize;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	hashsize = sizeof(ntdb_off_t) << ntdb->hash_bits;
			
 
				+
			
 
				+	/* Always make db a multiple of NTDB_PGSIZE */
			
 
				+	hdrsize = NEW_DATABASE_HDR_SIZE(ntdb->hash_bits);
			
 
				+	dbsize = (hdrsize + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
			
 
				+
			
 
				+	mem = ntdb->alloc_fn(ntdb, dbsize, ntdb->alloc_data);
			
 
				+	if (!mem) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_new_database: failed to allocate");
			
 
				+	}
			
 
				+
			
 
				+	hdr = (void *)mem;
			
 
				+	htable = (void *)(mem + sizeof(*hdr));
			
 
				+	ftable = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize);
			
 
				+	remainder = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize
			
 
				+			     + sizeof(*ftable));
			
 
				+
			
 
				+	/* Fill in the header */
			
 
				+	hdr->version = NTDB_VERSION;
			
 
				+	if (seed)
			
 
				+		hdr->hash_seed = seed->seed;
			
 
				+	else
			
 
				+		hdr->hash_seed = random_number(ntdb);
			
 
				+	hdr->hash_test = NTDB_HASH_MAGIC;
			
 
				+	hdr->hash_test = ntdb->hash_fn(&hdr->hash_test,
			
 
				+				       sizeof(hdr->hash_test),
			
 
				+				       hdr->hash_seed,
			
 
				+				       ntdb->hash_data);
			
 
				+	hdr->hash_bits = ntdb->hash_bits;
			
 
				+	hdr->recovery = 0;
			
 
				+	hdr->features_used = hdr->features_offered = NTDB_FEATURE_MASK;
			
 
				+	hdr->seqnum = 0;
			
 
				+	hdr->capabilities = 0;
			
 
				+	memset(hdr->reserved, 0, sizeof(hdr->reserved));
			
 
				+
			
 
				+	/* Hash is all zero after header. */
			
 
				+	set_header(NULL, htable, NTDB_HTABLE_MAGIC, 0, hashsize, hashsize);
			
 
				+	memset(htable + 1, 0, hashsize);
			
 
				+
			
 
				+	/* Free is empty. */
			
 
				+	hdr->free_table = (char *)ftable - (char *)hdr;
			
 
				+	memset(ftable, 0, sizeof(*ftable));
			
 
				+	ecode = set_header(NULL, &ftable->hdr, NTDB_FTABLE_MAGIC, 0,
			
 
				+			   sizeof(*ftable) - sizeof(ftable->hdr),
			
 
				+			   sizeof(*ftable) - sizeof(ftable->hdr));
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/* Rest of database is a free record, containing junk. */
			
 
				+	remaindersize = dbsize - hdrsize;
			
 
				+	remainder->ftable_and_len
			
 
				+		= (remaindersize + sizeof(*remainder)
			
 
				+		   - sizeof(struct ntdb_used_record));
			
 
				+	remainder->next = 0;
			
 
				+	remainder->magic_and_prev
			
 
				+		= (NTDB_FREE_MAGIC << (64-NTDB_OFF_UPPER_STEAL))
			
 
				+		| ((char *)remainder - (char *)hdr);
			
 
				+	memset(remainder + 1, 0x43, remaindersize);
			
 
				+
			
 
				+	/* Put in our single free entry. */
			
 
				+	ftable->buckets[size_to_bucket(remaindersize)] =
			
 
				+		(char *)remainder - (char *)hdr;
			
 
				+
			
 
				+	/* Magic food */
			
 
				+	memset(hdr->magic_food, 0, sizeof(hdr->magic_food));
			
 
				+	strcpy(hdr->magic_food, NTDB_MAGIC_FOOD);
			
 
				+
			
 
				+	/* This creates an endian-converted database, as if read from disk */
			
 
				+	magic_len = sizeof(hdr->magic_food);
			
 
				+	ntdb_convert(ntdb, (char *)hdr + magic_len, hdrsize - magic_len);
			
 
				+
			
 
				+	/* Return copy of header. */
			
 
				+	*rhdr = *hdr;
			
 
				+
			
 
				+	if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+		ntdb->file->map_size = dbsize;
			
 
				+		ntdb->file->map_ptr = hdr;
			
 
				+		return NTDB_SUCCESS;
			
 
				+	}
			
 
				+	if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				    "ntdb_new_database:"
			
 
				+				    " failed to seek: %s", strerror(errno));
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	if (ftruncate(ntdb->file->fd, 0) == -1) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				    "ntdb_new_database:"
			
 
				+				    " failed to truncate: %s", strerror(errno));
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	rlen = write(ntdb->file->fd, hdr, dbsize);
			
 
				+	if (rlen != dbsize) {
			
 
				+		if (rlen >= 0)
			
 
				+			errno = ENOSPC;
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				    "ntdb_new_database: %zi writing header: %s",
			
 
				+				    rlen, strerror(errno));
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	ntdb->free_fn(hdr, ntdb->alloc_data);
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	ntdb->file = ntdb->alloc_fn(NULL, sizeof(*ntdb->file), ntdb->alloc_data);
			
 
				+	if (!ntdb->file)
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+				  "ntdb_open: cannot alloc ntdb_file structure");
			
 
				+	ntdb->file->num_lockrecs = 0;
			
 
				+	ntdb->file->lockrecs = NULL;
			
 
				+	ntdb->file->allrecord_lock.count = 0;
			
 
				+	ntdb->file->refcnt = 1;
			
 
				+	ntdb->file->map_ptr = NULL;
			
 
				+	ntdb->file->direct_count = 0;
			
 
				+	ntdb->file->old_mmaps = NULL;
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
			
 
				+				 const union ntdb_attribute *attr)
			
 
				+{
			
 
				+	switch (attr->base.attr) {
			
 
				+	case NTDB_ATTRIBUTE_LOG:
			
 
				+		ntdb->log_fn = attr->log.fn;
			
 
				+		ntdb->log_data = attr->log.data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_HASH:
			
 
				+	case NTDB_ATTRIBUTE_SEED:
			
 
				+	case NTDB_ATTRIBUTE_OPENHOOK:
			
 
				+	case NTDB_ATTRIBUTE_HASHSIZE:
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+				   NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_set_attribute:"
			
 
				+				   " cannot set %s after opening",
			
 
				+				   attr->base.attr == NTDB_ATTRIBUTE_HASH
			
 
				+				   ? "NTDB_ATTRIBUTE_HASH"
			
 
				+				   : attr->base.attr == NTDB_ATTRIBUTE_SEED
			
 
				+				   ? "NTDB_ATTRIBUTE_SEED"
			
 
				+				   : attr->base.attr == NTDB_ATTRIBUTE_OPENHOOK
			
 
				+				   ? "NTDB_ATTRIBUTE_OPENHOOK"
			
 
				+				   : "NTDB_ATTRIBUTE_HASHSIZE");
			
 
				+	case NTDB_ATTRIBUTE_STATS:
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+				   NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_set_attribute:"
			
 
				+				   " cannot set NTDB_ATTRIBUTE_STATS");
			
 
				+	case NTDB_ATTRIBUTE_FLOCK:
			
 
				+		ntdb->lock_fn = attr->flock.lock;
			
 
				+		ntdb->unlock_fn = attr->flock.unlock;
			
 
				+		ntdb->lock_data = attr->flock.data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_ALLOCATOR:
			
 
				+		ntdb->alloc_fn = attr->alloc.alloc;
			
 
				+		ntdb->expand_fn = attr->alloc.expand;
			
 
				+		ntdb->free_fn = attr->alloc.free;
			
 
				+		ntdb->alloc_data = attr->alloc.priv_data;
			
 
				+		break;
			
 
				+	default:
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+				   NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_set_attribute:"
			
 
				+				   " unknown attribute type %u",
			
 
				+				   attr->base.attr);
			
 
				+	}
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
			
 
				+				 union ntdb_attribute *attr)
			
 
				+{
			
 
				+	switch (attr->base.attr) {
			
 
				+	case NTDB_ATTRIBUTE_LOG:
			
 
				+		if (!ntdb->log_fn)
			
 
				+			return NTDB_ERR_NOEXIST;
			
 
				+		attr->log.fn = ntdb->log_fn;
			
 
				+		attr->log.data = ntdb->log_data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_HASH:
			
 
				+		attr->hash.fn = ntdb->hash_fn;
			
 
				+		attr->hash.data = ntdb->hash_data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_SEED:
			
 
				+		attr->seed.seed = ntdb->hash_seed;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_OPENHOOK:
			
 
				+		if (!ntdb->openhook)
			
 
				+			return NTDB_ERR_NOEXIST;
			
 
				+		attr->openhook.fn = ntdb->openhook;
			
 
				+		attr->openhook.data = ntdb->openhook_data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_STATS: {
			
 
				+		size_t size = attr->stats.size;
			
 
				+		if (size > ntdb->stats.size)
			
 
				+			size = ntdb->stats.size;
			
 
				+		memcpy(&attr->stats, &ntdb->stats, size);
			
 
				+		break;
			
 
				+	}
			
 
				+	case NTDB_ATTRIBUTE_FLOCK:
			
 
				+		attr->flock.lock = ntdb->lock_fn;
			
 
				+		attr->flock.unlock = ntdb->unlock_fn;
			
 
				+		attr->flock.data = ntdb->lock_data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_ALLOCATOR:
			
 
				+		attr->alloc.alloc = ntdb->alloc_fn;
			
 
				+		attr->alloc.expand = ntdb->expand_fn;
			
 
				+		attr->alloc.free = ntdb->free_fn;
			
 
				+		attr->alloc.priv_data = ntdb->alloc_data;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_HASHSIZE:
			
 
				+		attr->hashsize.size = 1 << ntdb->hash_bits;
			
 
				+		break;
			
 
				+	default:
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+				   NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_get_attribute:"
			
 
				+				   " unknown attribute type %u",
			
 
				+				   attr->base.attr);
			
 
				+	}
			
 
				+	attr->base.next = NULL;
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
			
 
				+			 enum ntdb_attribute_type type)
			
 
				+{
			
 
				+	switch (type) {
			
 
				+	case NTDB_ATTRIBUTE_LOG:
			
 
				+		ntdb->log_fn = NULL;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_OPENHOOK:
			
 
				+		ntdb->openhook = NULL;
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_HASH:
			
 
				+	case NTDB_ATTRIBUTE_SEED:
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+			   "ntdb_unset_attribute: cannot unset %s after opening",
			
 
				+			   type == NTDB_ATTRIBUTE_HASH
			
 
				+			   ? "NTDB_ATTRIBUTE_HASH"
			
 
				+			   : "NTDB_ATTRIBUTE_SEED");
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_STATS:
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+			   NTDB_LOG_USE_ERROR,
			
 
				+			   "ntdb_unset_attribute:"
			
 
				+			   "cannot unset NTDB_ATTRIBUTE_STATS");
			
 
				+		break;
			
 
				+	case NTDB_ATTRIBUTE_FLOCK:
			
 
				+		ntdb->lock_fn = ntdb_fcntl_lock;
			
 
				+		ntdb->unlock_fn = ntdb_fcntl_unlock;
			
 
				+		break;
			
 
				+	default:
			
 
				+		ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+			   NTDB_LOG_USE_ERROR,
			
 
				+			   "ntdb_unset_attribute: unknown attribute type %u",
			
 
				+			   type);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* The top three bits of the capability tell us whether it matters. */
			
 
				+enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
			
 
				+				  ntdb_off_t type)
			
 
				+{
			
 
				+	if (type & NTDB_CAP_NOOPEN) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				  "%s: file has unknown capability %llu",
			
 
				+				  caller, type & NTDB_CAP_NOOPEN);
			
 
				+	}
			
 
				+
			
 
				+	if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
			
 
				+		return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
			
 
				+				  "%s: file has unknown capability %llu"
			
 
				+				  " (cannot write to it)",
			
 
				+				  caller, type & NTDB_CAP_NOOPEN);
			
 
				+	}
			
 
				+
			
 
				+	if (type & NTDB_CAP_NOCHECK) {
			
 
				+		ntdb->flags |= NTDB_CANT_CHECK;
			
 
				+	}
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
			
 
				+				      ntdb_off_t capabilities)
			
 
				+{
			
 
				+	ntdb_off_t off, next;
			
 
				+	enum NTDB_ERROR ecode = NTDB_SUCCESS;
			
 
				+	const struct ntdb_capability *cap;
			
 
				+
			
 
				+	/* Check capability list. */
			
 
				+	for (off = capabilities; off && ecode == NTDB_SUCCESS; off = next) {
			
 
				+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
			
 
				+		if (NTDB_PTR_IS_ERR(cap)) {
			
 
				+			return NTDB_PTR_ERR(cap);
			
 
				+		}
			
 
				+
			
 
				+		switch (cap->type & NTDB_CAP_TYPE_MASK) {
			
 
				+		/* We don't understand any capabilities (yet). */
			
 
				+		default:
			
 
				+			ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
			
 
				+		}
			
 
				+		next = cap->next;
			
 
				+		ntdb_access_release(ntdb, cap);
			
 
				+	}
			
 
				+	return ecode;
			
 
				+}
			
 
				+
			
 
				+static void *default_alloc(const void *owner, size_t len, void *priv_data)
			
 
				+{
			
 
				+	return malloc(len);
			
 
				+}
			
 
				+
			
 
				+static void *default_expand(void *ptr, size_t len, void *priv_data)
			
 
				+{
			
 
				+	return realloc(ptr, len);
			
 
				+}
			
 
				+
			
 
				+static void default_free(void *ptr, void *priv_data)
			
 
				+{
			
 
				+	free(ptr);
			
 
				+}
			
 
				+
			
 
				+/* First allocation needs manual search of attributes. */
			
 
				+static struct ntdb_context *alloc_ntdb(const union ntdb_attribute *attr,
			
 
				+				       const char *name)
			
 
				+{
			
 
				+	size_t len = sizeof(struct ntdb_context) + strlen(name) + 1;
			
 
				+
			
 
				+	while (attr) {
			
 
				+		if  (attr->base.attr == NTDB_ATTRIBUTE_ALLOCATOR) {
			
 
				+			return attr->alloc.alloc(NULL, len,
			
 
				+						 attr->alloc.priv_data);
			
 
				+		}
			
 
				+		attr = attr->base.next;
			
 
				+	}
			
 
				+	return default_alloc(NULL, len, NULL);
			
 
				+}
			
 
				+
			
 
				+static unsigned int next_pow2(uint64_t size)
			
 
				+{
			
 
				+	unsigned int bits = 1;
			
 
				+
			
 
				+	while ((1ULL << bits) < size)
			
 
				+		bits++;
			
 
				+	return bits;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
			
 
				+					int open_flags, mode_t mode,
			
 
				+					union ntdb_attribute *attr)
			
 
				+{
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	struct stat st;
			
 
				+	int saved_errno = 0;
			
 
				+	uint64_t hash_test;
			
 
				+	unsigned v;
			
 
				+	ssize_t rlen;
			
 
				+	struct ntdb_header hdr;
			
 
				+	struct ntdb_attribute_seed *seed = NULL;
			
 
				+	ntdb_bool_err berr;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	int openlock;
			
 
				+
			
 
				+	ntdb = alloc_ntdb(attr, name);
			
 
				+	if (!ntdb) {
			
 
				+		/* Can't log this */
			
 
				+		errno = ENOMEM;
			
 
				+		return NULL;
			
 
				+	}
			
 
				+	/* Set name immediately for logging functions. */
			
 
				+	ntdb->name = strcpy((char *)(ntdb + 1), name);
			
 
				+	ntdb->flags = ntdb_flags;
			
 
				+	ntdb->log_fn = NULL;
			
 
				+	ntdb->open_flags = open_flags;
			
 
				+	ntdb->file = NULL;
			
 
				+	ntdb->openhook = NULL;
			
 
				+	ntdb->lock_fn = ntdb_fcntl_lock;
			
 
				+	ntdb->unlock_fn = ntdb_fcntl_unlock;
			
 
				+	ntdb->hash_fn = ntdb_jenkins_hash;
			
 
				+	memset(&ntdb->stats, 0, sizeof(ntdb->stats));
			
 
				+	ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
			
 
				+	ntdb->stats.size = sizeof(ntdb->stats);
			
 
				+	ntdb->alloc_fn = default_alloc;
			
 
				+	ntdb->expand_fn = default_expand;
			
 
				+	ntdb->free_fn = default_free;
			
 
				+	ntdb->hash_bits = NTDB_DEFAULT_HBITS; /* 64k of hash by default. */
			
 
				+
			
 
				+	while (attr) {
			
 
				+		switch (attr->base.attr) {
			
 
				+		case NTDB_ATTRIBUTE_HASH:
			
 
				+			ntdb->hash_fn = attr->hash.fn;
			
 
				+			ntdb->hash_data = attr->hash.data;
			
 
				+			break;
			
 
				+		case NTDB_ATTRIBUTE_SEED:
			
 
				+			seed = &attr->seed;
			
 
				+			break;
			
 
				+		case NTDB_ATTRIBUTE_OPENHOOK:
			
 
				+			ntdb->openhook = attr->openhook.fn;
			
 
				+			ntdb->openhook_data = attr->openhook.data;
			
 
				+			break;
			
 
				+		case NTDB_ATTRIBUTE_HASHSIZE:
			
 
				+			ntdb->hash_bits = next_pow2(attr->hashsize.size);
			
 
				+			if (ntdb->hash_bits > 31) {
			
 
				+				ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+						    NTDB_LOG_USE_ERROR,
			
 
				+						    "ntdb_open: hash_size %u"
			
 
				+						    " too large",
			
 
				+						    attr->hashsize.size);
			
 
				+				goto fail;
			
 
				+			}
			
 
				+			break;
			
 
				+		default:
			
 
				+			/* These are set as normal. */
			
 
				+			ecode = ntdb_set_attribute(ntdb, attr);
			
 
				+			if (ecode != NTDB_SUCCESS)
			
 
				+				goto fail;
			
 
				+		}
			
 
				+		attr = attr->base.next;
			
 
				+	}
			
 
				+
			
 
				+	if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
			
 
				+			  | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
			
 
				+			  | NTDB_RDONLY)) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_open: unknown flags %u", ntdb_flags);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	if (seed) {
			
 
				+		if (!(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
			
 
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+					   NTDB_LOG_USE_ERROR,
			
 
				+					   "ntdb_open:"
			
 
				+					   " cannot set NTDB_ATTRIBUTE_SEED"
			
 
				+					   " without O_CREAT.");
			
 
				+			goto fail;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
			
 
				+				   "ntdb_open: can't open ntdb %s write-only",
			
 
				+				   name);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
			
 
				+		openlock = F_RDLCK;
			
 
				+		ntdb->flags |= NTDB_RDONLY;
			
 
				+	} else {
			
 
				+		if (ntdb_flags & NTDB_RDONLY) {
			
 
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
			
 
				+					   NTDB_LOG_USE_ERROR,
			
 
				+					   "ntdb_open: can't use NTDB_RDONLY"
			
 
				+					   " without O_RDONLY");
			
 
				+			goto fail;
			
 
				+		}
			
 
				+		openlock = F_WRLCK;
			
 
				+	}
			
 
				+
			
 
				+	/* internal databases don't need any of the rest. */
			
 
				+	if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+		ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
			
 
				+		ecode = ntdb_new_file(ntdb);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto fail;
			
 
				+		}
			
 
				+		ntdb->file->fd = -1;
			
 
				+		ecode = ntdb_new_database(ntdb, seed, &hdr);
			
 
				+		if (ecode == NTDB_SUCCESS) {
			
 
				+			ntdb_convert(ntdb, &hdr.hash_seed,
			
 
				+				    sizeof(hdr.hash_seed));
			
 
				+			ntdb->hash_seed = hdr.hash_seed;
			
 
				+			ntdb_context_init(ntdb);
			
 
				+			ntdb_ftable_init(ntdb);
			
 
				+		}
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto fail;
			
 
				+		}
			
 
				+		return ntdb;
			
 
				+	}
			
 
				+
			
 
				+	if (stat(name, &st) != -1)
			
 
				+		ntdb->file = find_file(st.st_dev, st.st_ino);
			
 
				+
			
 
				+	if (!ntdb->file) {
			
 
				+		ecode = ntdb_new_file(ntdb);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto fail;
			
 
				+		}
			
 
				+
			
 
				+		/* Set this now, as ntdb_nest_lock examines it. */
			
 
				+		ntdb->file->map_size = 0;
			
 
				+
			
 
				+		if ((ntdb->file->fd = open(name, open_flags, mode)) == -1) {
			
 
				+			enum ntdb_log_level lvl;
			
 
				+			/* errno set by open(2) */
			
 
				+			saved_errno = errno;
			
 
				+
			
 
				+			/* Probing for files like this is a common pattern. */
			
 
				+			if (!(open_flags & O_CREAT) && errno == ENOENT) {
			
 
				+				lvl = NTDB_LOG_WARNING;
			
 
				+			} else {
			
 
				+				lvl = NTDB_LOG_ERROR;
			
 
				+			}
			
 
				+			ntdb_logerr(ntdb, NTDB_ERR_IO, lvl,
			
 
				+				   "ntdb_open: could not open file %s: %s",
			
 
				+				   name, strerror(errno));
			
 
				+
			
 
				+			goto fail_errno;
			
 
				+		}
			
 
				+
			
 
				+		/* ensure there is only one process initialising at once:
			
 
				+		 * do it immediately to reduce the create/openlock race. */
			
 
				+		ecode = ntdb_lock_open(ntdb, openlock,
			
 
				+				       NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			saved_errno = errno;
			
 
				+			goto fail_errno;
			
 
				+		}
			
 
				+
			
 
				+		/* on exec, don't inherit the fd */
			
 
				+		v = fcntl(ntdb->file->fd, F_GETFD, 0);
			
 
				+		fcntl(ntdb->file->fd, F_SETFD, v | FD_CLOEXEC);
			
 
				+
			
 
				+		if (fstat(ntdb->file->fd, &st) == -1) {
			
 
				+			saved_errno = errno;
			
 
				+			ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_open: could not stat open %s: %s",
			
 
				+				   name, strerror(errno));
			
 
				+			goto fail_errno;
			
 
				+		}
			
 
				+
			
 
				+		ntdb->file->device = st.st_dev;
			
 
				+		ntdb->file->inode = st.st_ino;
			
 
				+
			
 
				+		/* call their open hook if they gave us one. */
			
 
				+		if (ntdb->openhook) {
			
 
				+			ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
			
 
				+			if (ecode != NTDB_SUCCESS) {
			
 
				+				ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
			
 
				+					    "ntdb_open: open hook failed");
			
 
				+				goto fail;
			
 
				+			}
			
 
				+			open_flags |= O_CREAT;
			
 
				+		}
			
 
				+	} else {
			
 
				+		/* ensure there is only one process initialising at once */
			
 
				+		ecode = ntdb_lock_open(ntdb, openlock,
			
 
				+				       NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			saved_errno = errno;
			
 
				+			goto fail_errno;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* If they used O_TRUNC, read will return 0. */
			
 
				+	rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
			
 
				+	if (rlen == 0 && (open_flags & O_CREAT)) {
			
 
				+		ecode = ntdb_new_database(ntdb, seed, &hdr);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto fail;
			
 
				+		}
			
 
				+	} else if (rlen < 0) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_open: error %s reading %s",
			
 
				+				   strerror(errno), name);
			
 
				+		goto fail;
			
 
				+	} else if (rlen < sizeof(hdr)
			
 
				+		   || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_open: %s is not a ntdb file", name);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	if (hdr.version != NTDB_VERSION) {
			
 
				+		if (hdr.version == bswap_64(NTDB_VERSION))
			
 
				+			ntdb->flags |= NTDB_CONVERT;
			
 
				+		else {
			
 
				+			/* wrong version */
			
 
				+			ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+					   "ntdb_open:"
			
 
				+					   " %s is unknown version 0x%llx",
			
 
				+					   name, (long long)hdr.version);
			
 
				+			goto fail;
			
 
				+		}
			
 
				+	} else if (ntdb->flags & NTDB_CONVERT) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_open:"
			
 
				+				   " %s does not need NTDB_CONVERT",
			
 
				+				   name);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	ntdb_context_init(ntdb);
			
 
				+
			
 
				+	ntdb_convert(ntdb, &hdr, sizeof(hdr));
			
 
				+	ntdb->hash_bits = hdr.hash_bits;
			
 
				+	ntdb->hash_seed = hdr.hash_seed;
			
 
				+	hash_test = NTDB_HASH_MAGIC;
			
 
				+	hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
			
 
				+	if (hdr.hash_test != hash_test) {
			
 
				+		/* wrong hash variant */
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_open:"
			
 
				+				   " %s uses a different hash function",
			
 
				+				   name);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	ecode = capabilities_ok(ntdb, hdr.capabilities);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	/* Clear any features we don't understand. */
			
 
				+	if ((open_flags & O_ACCMODE) != O_RDONLY) {
			
 
				+		hdr.features_used &= NTDB_FEATURE_MASK;
			
 
				+		ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
			
 
				+							features_used),
			
 
				+					  &hdr.features_used,
			
 
				+					  sizeof(hdr.features_used));
			
 
				+		if (ecode != NTDB_SUCCESS)
			
 
				+			goto fail;
			
 
				+	}
			
 
				+
			
 
				+	ntdb_unlock_open(ntdb, openlock);
			
 
				+
			
 
				+	/* This makes sure we have current map_size and mmap. */
			
 
				+	ecode = ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
			
 
				+	if (unlikely(ecode != NTDB_SUCCESS))
			
 
				+		goto fail;
			
 
				+
			
 
				+	if (ntdb->file->map_size % NTDB_PGSIZE != 0) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+				    "ntdb_open:"
			
 
				+				    " %s size %llu isn't a multiple of %u",
			
 
				+				    name, (long long)ntdb->file->map_size,
			
 
				+				    NTDB_PGSIZE);
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	/* Now it's fully formed, recover if necessary. */
			
 
				+	berr = ntdb_needs_recovery(ntdb);
			
 
				+	if (unlikely(berr != false)) {
			
 
				+		if (berr < 0) {
			
 
				+			ecode = NTDB_OFF_TO_ERR(berr);
			
 
				+			goto fail;
			
 
				+		}
			
 
				+		ecode = ntdb_lock_and_recover(ntdb);
			
 
				+		if (ecode != NTDB_SUCCESS) {
			
 
				+			goto fail;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	ecode = ntdb_ftable_init(ntdb);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto fail;
			
 
				+	}
			
 
				+
			
 
				+	ntdb->next = tdbs;
			
 
				+	tdbs = ntdb;
			
 
				+	return ntdb;
			
 
				+
			
 
				+ fail:
			
 
				+	/* Map ecode to some logical errno. */
			
 
				+	switch (NTDB_ERR_TO_OFF(ecode)) {
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
			
 
				+		saved_errno = EIO;
			
 
				+		break;
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
			
 
				+		saved_errno = EWOULDBLOCK;
			
 
				+		break;
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
			
 
				+		saved_errno = ENOMEM;
			
 
				+		break;
			
 
				+	case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
			
 
				+		saved_errno = EINVAL;
			
 
				+		break;
			
 
				+	default:
			
 
				+		saved_errno = EINVAL;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+fail_errno:
			
 
				+#ifdef NTDB_TRACE
			
 
				+	close(ntdb->tracefd);
			
 
				+#endif
			
 
				+	if (ntdb->file) {
			
 
				+		ntdb_lock_cleanup(ntdb);
			
 
				+		if (--ntdb->file->refcnt == 0) {
			
 
				+			assert(ntdb->file->num_lockrecs == 0);
			
 
				+			if (ntdb->file->map_ptr) {
			
 
				+				if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+					ntdb->free_fn(ntdb->file->map_ptr,
			
 
				+						      ntdb->alloc_data);
			
 
				+				} else
			
 
				+					ntdb_munmap(ntdb);
			
 
				+			}
			
 
				+			if (ntdb->file->fd != -1 && close(ntdb->file->fd) != 0)
			
 
				+				ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
			
 
				+					   "ntdb_open: failed to close ntdb fd"
			
 
				+					   " on error: %s", strerror(errno));
			
 
				+			ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
			
 
				+			ntdb->free_fn(ntdb->file, ntdb->alloc_data);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	ntdb->free_fn(ntdb, ntdb->alloc_data);
			
 
				+	errno = saved_errno;
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int ret = 0;
			
 
				+	struct ntdb_context **i;
			
 
				+
			
 
				+	ntdb_trace(ntdb, "ntdb_close");
			
 
				+
			
 
				+	if (ntdb->transaction) {
			
 
				+		ntdb_transaction_cancel(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ntdb_lock_cleanup(ntdb);
			
 
				+	if (--ntdb->file->refcnt == 0) {
			
 
				+		if (ntdb->file->map_ptr) {
			
 
				+			if (ntdb->flags & NTDB_INTERNAL) {
			
 
				+				ntdb->free_fn(ntdb->file->map_ptr,
			
 
				+					      ntdb->alloc_data);
			
 
				+			} else {
			
 
				+				ntdb_munmap(ntdb);
			
 
				+			}
			
 
				+		}
			
 
				+		ret = close(ntdb->file->fd);
			
 
				+		ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
			
 
				+		ntdb->free_fn(ntdb->file, ntdb->alloc_data);
			
 
				+	}
			
 
				+
			
 
				+	/* Remove from tdbs list */
			
 
				+	for (i = &tdbs; *i; i = &(*i)->next) {
			
 
				+		if (*i == ntdb) {
			
 
				+			*i = ntdb->next;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+#ifdef NTDB_TRACE
			
 
				+	close(ntdb->tracefd);
			
 
				+#endif
			
 
				+	ntdb->free_fn(ntdb, ntdb->alloc_data);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
			
 
				+{
			
 
				+	struct ntdb_context *i;
			
 
				+
			
 
				+	for (i = tdbs; i; i = i->next) {
			
 
				+		if (fn(i, p) != 0)
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
--- a/ccan/ntdb/private.h
+++ b/ccan/ntdb/private.h
@@ -0,0 +1,677 @@
 
				+#ifndef NTDB_PRIVATE_H
			
 
				+#define NTDB_PRIVATE_H
			
 
				+/*
			
 
				+  Trivial Database 2: private types and prototypes
			
 
				+  Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+  This library is free software; you can redistribute it and/or
			
 
				+  modify it under the terms of the GNU Lesser General Public
			
 
				+  License as published by the Free Software Foundation; either
			
 
				+  version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+  This library is distributed in the hope that it will be useful,
			
 
				+  but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+  Lesser General Public License for more details.
			
 
				+
			
 
				+  You should have received a copy of the GNU Lesser General Public
			
 
				+  License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+
			
 
				+#include "config.h"
			
 
				+#ifndef HAVE_CCAN
			
 
				+#error You need ccan to build ntdb!
			
 
				+#endif
			
 
				+#include "ntdb.h"
			
 
				+#include <ccan/compiler/compiler.h>
			
 
				+#include <ccan/likely/likely.h>
			
 
				+#include <ccan/endian/endian.h>
			
 
				+
			
 
				+#ifdef HAVE_LIBREPLACE
			
 
				+#include "replace.h"
			
 
				+#include "system/filesys.h"
			
 
				+#include "system/time.h"
			
 
				+#include "system/shmem.h"
			
 
				+#include "system/select.h"
			
 
				+#include "system/wait.h"
			
 
				+#else
			
 
				+#include <stdarg.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdbool.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <stddef.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <sys/mman.h>
			
 
				+#include <unistd.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <errno.h>
			
 
				+#include <stdio.h>
			
 
				+#include <utime.h>
			
 
				+#include <unistd.h>
			
 
				+#include <ctype.h>
			
 
				+#include <string.h>
			
 
				+#include <sys/wait.h>
			
 
				+#include <time.h>
			
 
				+#endif
			
 
				+#include <assert.h>
			
 
				+
			
 
				+#ifndef TEST_IT
			
 
				+#define TEST_IT(cond)
			
 
				+#endif
			
 
				+
			
 
				+/* #define NTDB_TRACE 1 */
			
 
				+
			
 
				+#ifndef __STRING
			
 
				+#define __STRING(x)    #x
			
 
				+#endif
			
 
				+
			
 
				+#ifndef __STRINGSTRING
			
 
				+#define __STRINGSTRING(x) __STRING(x)
			
 
				+#endif
			
 
				+
			
 
				+#ifndef __location__
			
 
				+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
			
 
				+#endif
			
 
				+
			
 
				+typedef uint64_t ntdb_len_t;
			
 
				+typedef uint64_t ntdb_off_t;
			
 
				+
			
 
				+#define NTDB_MAGIC_FOOD "NTDB file\n"
			
 
				+#define NTDB_VERSION ((uint64_t)(0x26011967 + 7))
			
 
				+#define NTDB_USED_MAGIC ((uint64_t)0x1999)
			
 
				+#define NTDB_HTABLE_MAGIC ((uint64_t)0x1888)
			
 
				+#define NTDB_CHAIN_MAGIC ((uint64_t)0x1777)
			
 
				+#define NTDB_FTABLE_MAGIC ((uint64_t)0x1666)
			
 
				+#define NTDB_CAP_MAGIC ((uint64_t)0x1555)
			
 
				+#define NTDB_FREE_MAGIC ((uint64_t)0xFE)
			
 
				+#define NTDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
			
 
				+#define NTDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
			
 
				+#define NTDB_RECOVERY_INVALID_MAGIC (0x0ULL)
			
 
				+
			
 
				+/* Capability bits. */
			
 
				+#define NTDB_CAP_TYPE_MASK	0x1FFFFFFFFFFFFFFFULL
			
 
				+#define NTDB_CAP_NOCHECK		0x8000000000000000ULL
			
 
				+#define NTDB_CAP_NOWRITE		0x4000000000000000ULL
			
 
				+#define NTDB_CAP_NOOPEN		0x2000000000000000ULL
			
 
				+
			
 
				+/*
				+ * Packing errors into offsets and v.v.  Any offset at or above
				+ * (ntdb_off_t)(long)NTDB_ERR_LAST is treated as an encoded error code.
				+ * The argument is parenthesised so that compound expressions compare
				+ * as a whole.
				+ */
				+#define NTDB_OFF_IS_ERR(off) unlikely((off) >= (ntdb_off_t)(long)NTDB_ERR_LAST)
				+#define NTDB_OFF_TO_ERR(off) ((enum NTDB_ERROR)(long)(off))
				+#define NTDB_ERR_TO_OFF(ecode) ((ntdb_off_t)(long)(ecode))
			
 
				+
			
 
				+/* Packing errors into pointers and v.v. */
			
 
				+#define NTDB_PTR_IS_ERR(ptr)						\
			
 
				+	unlikely((unsigned long)(ptr) >= (unsigned long)NTDB_ERR_LAST)
			
 
				+#define NTDB_PTR_ERR(p) ((enum NTDB_ERROR)(long)(p))
			
 
				+#define NTDB_ERR_PTR(err) ((void *)(long)(err))
			
 
				+
			
 
				+/* This doesn't really need to be pagesize, but we use it for similar
			
 
				+ * reasons. */
			
 
				+#define NTDB_PGSIZE 16384
			
 
				+
			
 
				+/* Common case of returning true, false or -ve error. */
			
 
				+typedef int ntdb_bool_err;
			
 
				+
			
 
				+/* Prevent others from opening the file. */
			
 
				+#define NTDB_OPEN_LOCK 0
			
 
				+/* Expanding file. */
			
 
				+#define NTDB_EXPANSION_LOCK 2
			
 
				+/* Doing a transaction. */
			
 
				+#define NTDB_TRANSACTION_LOCK 8
			
 
				+/* Hash chain locks. */
			
 
				+#define NTDB_HASH_LOCK_START 64
			
 
				+
			
 
				+/* Extend file by at least 100 times more than needed. */
			
 
				+#define NTDB_EXTENSION_FACTOR 100
			
 
				+
			
 
				+/* We steal this many upper bits, giving a maximum offset of 64 petabytes. */
			
 
				+#define NTDB_OFF_UPPER_STEAL 8
			
 
				+
			
 
				+/* And we use the lower bit, too. */
			
 
				+#define NTDB_OFF_CHAIN_BIT	0
			
 
				+
			
 
				+/* Hash table sits just after the header. */
			
 
				+#define NTDB_HASH_OFFSET (sizeof(struct ntdb_header))
			
 
				+
			
 
				+/* Additional features we understand.  Currently: none. */
			
 
				+#define NTDB_FEATURE_MASK ((uint64_t)0)
			
 
				+
			
 
				+/* The bit number where we store the extra hash bits. */
			
 
				+/* Convenience mask to get actual offset. */
			
 
				+#define NTDB_OFF_MASK							\
			
 
				+	(((1ULL << (64 - NTDB_OFF_UPPER_STEAL)) - 1) - (1<<NTDB_OFF_CHAIN_BIT))
			
 
				+
			
 
				+/* How many buckets in a free list: see size_to_bucket(). */
			
 
				+#define NTDB_FREE_BUCKETS (64 - NTDB_OFF_UPPER_STEAL)
			
 
				+
			
 
				+/* We have to be able to fit a free record here. */
			
 
				+#define NTDB_MIN_DATA_LEN						\
			
 
				+	(sizeof(struct ntdb_free_record) - sizeof(struct ntdb_used_record))
			
 
				+
			
 
				+/* Indicates this entry is not on an flist (can happen during coalescing) */
			
 
				+#define NTDB_FTABLE_NONE ((1ULL << NTDB_OFF_UPPER_STEAL) - 1)
			
 
				+
			
 
				+/* By default, hash is 64k bytes */
			
 
				+#define NTDB_DEFAULT_HBITS 13
			
 
				+
			
 
				+struct ntdb_used_record {
			
 
				+	/* For on-disk compatibility, we avoid bitfields:
			
 
				+	   magic: 16,        (highest)
			
 
				+	   key_len_bits: 5,
			
 
				+	   extra_padding: 32
			
 
				+	*/
			
 
				+        uint64_t magic_and_meta;
			
 
				+	/* The bottom key_len_bits*2 are key length, rest is data length. */
			
 
				+        uint64_t key_and_data_len;
			
 
				+};
			
 
				+
			
 
				+/* Width in bits of the key-length field inside key_and_data_len. */
				+static inline unsigned rec_key_bits(const struct ntdb_used_record *r)
				+{
				+	/* Bits 43..47 of magic_and_meta store half of that width. */
				+	const uint64_t half_width = (r->magic_and_meta >> 43) & 0x1F;
				+
				+	return half_width * 2;
				+}
			
 
				+
			
 
				+/* Key length: the low rec_key_bits(r) bits of key_and_data_len. */
				+static inline uint64_t rec_key_length(const struct ntdb_used_record *r)
				+{
				+	const uint64_t key_mask = (1ULL << rec_key_bits(r)) - 1;
				+
				+	return r->key_and_data_len & key_mask;
				+}
			
 
				+
			
 
				+/* Data length: whatever sits above the key-length bits. */
				+static inline uint64_t rec_data_length(const struct ntdb_used_record *r)
				+{
				+	const unsigned key_bits = rec_key_bits(r);
				+
				+	return r->key_and_data_len >> key_bits;
				+}
			
 
				+
			
 
				+/* Extra padding: the 32-bit field at bits 11..42 of magic_and_meta. */
				+static inline uint64_t rec_extra_padding(const struct ntdb_used_record *r)
				+{
				+	const uint64_t shifted = r->magic_and_meta >> 11;
				+
				+	return shifted & 0xFFFFFFFFULL;
				+}
			
 
				+
			
 
				+/* Record magic: the top 16 bits of magic_and_meta. */
				+static inline uint16_t rec_magic(const struct ntdb_used_record *r)
				+{
				+	const uint64_t top = r->magic_and_meta >> 48;
				+
				+	return (uint16_t)top;
				+}
			
 
				+
			
 
				+struct ntdb_free_record {
			
 
				+        uint64_t magic_and_prev; /* NTDB_OFF_UPPER_STEAL bits magic, then prev */
			
 
				+        uint64_t ftable_and_len; /* Len not counting these two fields. */
			
 
				+	/* This is why the minimum record size is 8 bytes.  */
			
 
				+	uint64_t next;
			
 
				+};
			
 
				+
			
 
				+/* Prev field: magic_and_prev with the top NTDB_OFF_UPPER_STEAL bits dropped. */
				+static inline uint64_t frec_prev(const struct ntdb_free_record *f)
				+{
				+	const uint64_t low_mask = (1ULL << (64 - NTDB_OFF_UPPER_STEAL)) - 1;
				+
				+	return f->magic_and_prev & low_mask;
				+}
			
 
				+
			
 
				+/* Free-record magic: the top NTDB_OFF_UPPER_STEAL bits of magic_and_prev. */
				+static inline uint64_t frec_magic(const struct ntdb_free_record *f)
				+{
				+	const unsigned shift = 64 - NTDB_OFF_UPPER_STEAL;
				+
				+	return f->magic_and_prev >> shift;
				+}
			
 
				+
			
 
				+/* Length field: ftable_and_len with the top NTDB_OFF_UPPER_STEAL bits dropped. */
				+static inline uint64_t frec_len(const struct ntdb_free_record *f)
				+{
				+	const uint64_t low_mask = (1ULL << (64 - NTDB_OFF_UPPER_STEAL)) - 1;
				+
				+	return f->ftable_and_len & low_mask;
				+}
			
 
				+
			
 
				+/* Free-table number: the top NTDB_OFF_UPPER_STEAL bits of ftable_and_len. */
				+static inline unsigned frec_ftable(const struct ntdb_free_record *f)
				+{
				+	const uint64_t table = f->ftable_and_len >> (64 - NTDB_OFF_UPPER_STEAL);
				+
				+	return (unsigned)table;
				+}
			
 
				+
			
 
				+struct ntdb_recovery_record {
			
 
				+	uint64_t magic;
			
 
				+	/* Length of record (add this header to get total length). */
			
 
				+	uint64_t max_len;
			
 
				+	/* Length used. */
			
 
				+	uint64_t len;
			
 
				+	/* Old length of file before transaction. */
			
 
				+	uint64_t eof;
			
 
				+};
			
 
				+
			
 
				+/* this is stored at the front of every database */
			
 
				+struct ntdb_header {
			
 
				+	char magic_food[64]; /* for /etc/magic */
			
 
				+	/* FIXME: Make me 32 bit? */
			
 
				+	uint64_t version; /* version of the code */
			
 
				+	uint64_t hash_bits; /* bits for toplevel hash table. */
			
 
				+	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
			
 
				+	uint64_t hash_seed; /* "random" seed written at creation time. */
			
 
				+	ntdb_off_t free_table; /* (First) free table. */
			
 
				+	ntdb_off_t recovery; /* Transaction recovery area. */
			
 
				+
			
 
				+	uint64_t features_used; /* Features all writers understand */
			
 
				+	uint64_t features_offered; /* Features offered */
			
 
				+
			
 
				+	uint64_t seqnum; /* Sequence number for NTDB_SEQNUM */
			
 
				+
			
 
				+	ntdb_off_t capabilities; /* Optional linked list of capabilities. */
			
 
				+	ntdb_off_t reserved[22];
			
 
				+
			
 
				+	/*
			
 
				+	 * Hash table is next:
			
 
				+	 *
			
 
				+	 * struct ntdb_used_record htable_hdr;
			
 
				+	 * ntdb_off_t htable[1 << hash_bits];
			
 
				+	 */
			
 
				+};
			
 
				+
			
 
				+struct ntdb_freetable {
			
 
				+	struct ntdb_used_record hdr;
			
 
				+	ntdb_off_t next;
			
 
				+	ntdb_off_t buckets[NTDB_FREE_BUCKETS];
			
 
				+};
			
 
				+
			
 
				+struct ntdb_capability {
			
 
				+	struct ntdb_used_record hdr;
			
 
				+	ntdb_off_t type;
			
 
				+	ntdb_off_t next;
			
 
				+	/* ... */
			
 
				+};
			
 
				+
			
 
				+/* Information about a particular (locked) hash entry. */
			
 
				+struct hash_info {
			
 
				+	/* Full hash value of entry. */
			
 
				+	uint32_t h;
			
 
				+	/* Start of hash table / chain. */
			
 
				+	ntdb_off_t table;
			
 
				+	/* Number of entries in this table/chain. */
			
 
				+	ntdb_off_t table_size;
			
 
				+	/* Bucket we (or an empty space) were found in. */
			
 
				+	ntdb_off_t bucket;
			
 
				+	/* Old value that was in that entry (if not found) */
			
 
				+	ntdb_off_t old_val;
			
 
				+};
			
 
				+
			
 
				+enum ntdb_lock_flags {
			
 
				+	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
			
 
				+	NTDB_LOCK_NOWAIT = 0,
			
 
				+	NTDB_LOCK_WAIT = 1,
			
 
				+	/* If set, don't log an error on failure. */
			
 
				+	NTDB_LOCK_PROBE = 2,
			
 
				+	/* If set, don't check for recovery (used by recovery code). */
			
 
				+	NTDB_LOCK_NOCHECK = 4,
			
 
				+};
			
 
				+
			
 
				+struct ntdb_lock {
			
 
				+	struct ntdb_context *owner;
			
 
				+	off_t off;
			
 
				+	uint32_t count;
			
 
				+	uint32_t ltype;
			
 
				+};
			
 
				+
			
 
				+/* This is only needed for ntdb_access_commit, but used everywhere to
			
 
				+ * simplify. */
			
 
				+struct ntdb_access_hdr {
			
 
				+	struct ntdb_access_hdr *next;
			
 
				+	ntdb_off_t off;
			
 
				+	ntdb_len_t len;
			
 
				+	bool convert;
			
 
				+};
			
 
				+
			
 
				+/* mmaps we are keeping around because they are still direct accessed */
			
 
				+struct ntdb_old_mmap {
			
 
				+	struct ntdb_old_mmap *next;
			
 
				+
			
 
				+	void *map_ptr;
			
 
				+	ntdb_len_t map_size;
			
 
				+};
			
 
				+
			
 
				+struct ntdb_file {
			
 
				+	/* How many are sharing us? */
			
 
				+	unsigned int refcnt;
			
 
				+
			
 
				+	/* Mmap (if any), or malloc (for NTDB_INTERNAL). */
			
 
				+	void *map_ptr;
			
 
				+
			
 
				+	/* How much space has been mapped (<= current file size) */
			
 
				+	ntdb_len_t map_size;
			
 
				+
			
 
				+	/* The file descriptor (-1 for NTDB_INTERNAL). */
			
 
				+	int fd;
			
 
				+
			
 
				+	/* How many are accessing directly? */
			
 
				+	unsigned int direct_count;
			
 
				+
			
 
				+	/* Old maps, still direct accessed. */
			
 
				+	struct ntdb_old_mmap *old_mmaps;
			
 
				+
			
 
				+	/* Lock information */
			
 
				+	pid_t locker;
			
 
				+	struct ntdb_lock allrecord_lock;
			
 
				+	size_t num_lockrecs;
			
 
				+	struct ntdb_lock *lockrecs;
			
 
				+
			
 
				+	/* Identity of this file. */
			
 
				+	dev_t device;
			
 
				+	ino_t inode;
			
 
				+};
			
 
				+
			
 
				+struct ntdb_methods {
			
 
				+	enum NTDB_ERROR (*tread)(struct ntdb_context *, ntdb_off_t, void *,
			
 
				+				 ntdb_len_t);
			
 
				+	enum NTDB_ERROR (*twrite)(struct ntdb_context *, ntdb_off_t, const void *,
			
 
				+				  ntdb_len_t);
			
 
				+	enum NTDB_ERROR (*oob)(struct ntdb_context *, ntdb_off_t, ntdb_len_t, bool);
			
 
				+	enum NTDB_ERROR (*expand_file)(struct ntdb_context *, ntdb_len_t);
			
 
				+	void *(*direct)(struct ntdb_context *, ntdb_off_t, size_t, bool);
			
 
				+	ntdb_off_t (*read_off)(struct ntdb_context *ntdb, ntdb_off_t off);
			
 
				+	enum NTDB_ERROR (*write_off)(struct ntdb_context *ntdb, ntdb_off_t off,
			
 
				+				     ntdb_off_t val);
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+  internal prototypes
			
 
				+*/
			
 
				+/*
				+ * Get bits from a value: returns the 'num' bits of 'val' that begin at
				+ * bit 'start' (bit 0 being the least significant).  'num' may be
				+ * anything from 0 to 32 inclusive.
				+ */
				+static inline uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
				+{
				+	assert(num <= 32);
				+	/*
				+	 * Build the mask in 64 bits: num == 32 is permitted above, and
				+	 * shifting a 32-bit 1U by 32 would be undefined behaviour.
				+	 */
				+	return (val >> start) & ((UINT64_C(1) << num) - 1);
				+}
			
 
				+
			
 
				+
			
 
				+/* hash.c: */
			
 
				+uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
			
 
				+			   void *unused);
			
 
				+
			
 
				+enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
			
 
				+			      struct hash_info *h,
			
 
				+			      NTDB_DATA *kbuf, size_t *dlen);
			
 
				+
			
 
				+enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
			
 
				+			     struct hash_info *h,
			
 
				+			     NTDB_DATA *kbuf, size_t *dlen);
			
 
				+
			
 
				+/* Hash random memory. */
			
 
				+uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len);
			
 
				+
			
 
				+/* Find and lock a hash entry (or where it would be). */
			
 
				+ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
			
 
				+			 NTDB_DATA key,
			
 
				+			 int ltype,
			
 
				+			 struct hash_info *h,
			
 
				+			 struct ntdb_used_record *rec,
			
 
				+			 const char **rkey);
			
 
				+
			
 
				+enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
			
 
				+				const struct hash_info *h,
			
 
				+				ntdb_off_t new_off);
			
 
				+
			
 
				+enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
			
 
				+			    const struct hash_info *h,
			
 
				+			    ntdb_off_t new_off);
			
 
				+
			
 
				+enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
			
 
				+				 const struct hash_info *h);
			
 
				+
			
 
				+/* For ntdb_check */
			
 
				+bool is_subhash(ntdb_off_t val);
			
 
				+enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
			
 
				+				   ntdb_off_t type);
			
 
				+
			
 
				+/* free.c: */
			
 
				+enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* check.c needs these to iterate through free lists. */
			
 
				+ntdb_off_t first_ftable(struct ntdb_context *ntdb);
			
 
				+ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable);
			
 
				+
			
 
				+/* This returns space or -ve error number. */
			
 
				+ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
			
 
				+		 unsigned magic, bool growing);
			
 
				+
			
 
				+/* Put this record in a free list. */
			
 
				+enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
			
 
				+				ntdb_off_t off, ntdb_len_t len_with_header,
			
 
				+				enum ntdb_lock_flags waitflag,
			
 
				+				bool coalesce_ok);
			
 
				+
			
 
				+/* Set up header for a used/ftable/htable/chain/capability record. */
			
 
				+enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
			
 
				+			   struct ntdb_used_record *rec,
			
 
				+			   unsigned magic, uint64_t keylen, uint64_t datalen,
			
 
				+			   uint64_t actuallen);
			
 
				+
			
 
				+/* Used by ntdb_check to verify. */
			
 
				+unsigned int size_to_bucket(ntdb_len_t data_len);
			
 
				+ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket);
			
 
				+
			
 
				+/* Used by ntdb_summary */
			
 
				+ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off);
			
 
				+
			
 
				+/* Adjust expansion, used by create_recovery_area */
			
 
				+ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size);
			
 
				+
			
 
				+/* io.c: */
			
 
				+/* Initialize ntdb->methods. */
			
 
				+void ntdb_io_init(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Convert endian of the buffer if required. */
			
 
				+void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size);
			
 
				+
			
 
				+/* Unmap and try to map the ntdb. */
			
 
				+enum NTDB_ERROR ntdb_munmap(struct ntdb_context *ntdb);
			
 
				+enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Either alloc a copy, or give direct access.  Release frees or noop. */
			
 
				+const void *ntdb_access_read(struct ntdb_context *ntdb,
			
 
				+			     ntdb_off_t off, ntdb_len_t len, bool convert);
			
 
				+void *ntdb_access_write(struct ntdb_context *ntdb,
			
 
				+			ntdb_off_t off, ntdb_len_t len, bool convert);
			
 
				+
			
 
				+/* Release result of ntdb_access_read/write. */
			
 
				+void ntdb_access_release(struct ntdb_context *ntdb, const void *p);
			
 
				+/* Commit result of ntdb_access_write. */
			
 
				+enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p);
			
 
				+
			
 
				+/* Clear an ondisk area. */
			
 
				+enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len);
			
 
				+
			
 
				+/* Return a non-zero offset between >= start < end in this array (or end). */
			
 
				+ntdb_off_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
			
 
				+				 ntdb_off_t base,
			
 
				+				 uint64_t start,
			
 
				+				 uint64_t end);
			
 
				+
			
 
				+/* Return a zero offset in this array, or num. */
			
 
				+ntdb_off_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
			
 
				+			      uint64_t num);
			
 
				+
			
 
				+/* Allocate and make a copy of some offset. */
			
 
				+void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len);
			
 
				+
			
 
				+/* Writes a converted copy of a record. */
			
 
				+enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
			
 
				+				   const void *rec, size_t len);
			
 
				+
			
 
				+/* Reads record and converts it */
			
 
				+enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
			
 
				+				  void *rec, size_t len);
			
 
				+
			
 
				+/* Bump the seqnum (caller checks for ntdb->flags & NTDB_SEQNUM) */
			
 
				+void ntdb_inc_seqnum(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* lock.c: */
			
 
				+/* Print message because another ntdb owns a lock we want. */
			
 
				+enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call);
			
 
				+
			
 
				+/* If we fork, we no longer really own locks. */
			
 
				+bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log);
			
 
				+
			
 
				+/* Lock/unlock a hash bucket. */
			
 
				+enum NTDB_ERROR ntdb_lock_hash(struct ntdb_context *ntdb,
			
 
				+			       unsigned int hbucket,
			
 
				+			       int ltype);
			
 
				+enum NTDB_ERROR ntdb_unlock_hash(struct ntdb_context *ntdb,
			
 
				+				 unsigned int hash, int ltype);
			
 
				+
			
 
				+/* For closing the file. */
			
 
				+void ntdb_lock_cleanup(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Lock/unlock a particular free bucket. */
			
 
				+enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
			
 
				+				      enum ntdb_lock_flags waitflag);
			
 
				+void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off);
			
 
				+
			
 
				+/* Serialize transaction start. */
			
 
				+enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype);
			
 
				+void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype);
			
 
				+
			
 
				+/* Do we have any hash locks (ie. via ntdb_chainlock) ? */
			
 
				+bool ntdb_has_hash_locks(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Lock entire database. */
			
 
				+enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
			
 
				+				    enum ntdb_lock_flags flags, bool upgradable);
			
 
				+void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype);
			
 
				+enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start);
			
 
				+
			
 
				+/* Serialize db open. */
			
 
				+enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
			
 
				+			       int ltype, enum ntdb_lock_flags flags);
			
 
				+void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype);
			
 
				+bool ntdb_has_open_lock(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Serialize db expand. */
			
 
				+enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype);
			
 
				+void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype);
			
 
				+bool ntdb_has_expansion_lock(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* If it needs recovery, grab all the locks and do it. */
			
 
				+enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb);
			
 
				+
			
 
				+/* Default lock and unlock functions. */
			
 
				+int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
			
 
				+int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
			
 
				+
			
 
				+/* transaction.c: */
			
 
				+enum NTDB_ERROR ntdb_transaction_recover(struct ntdb_context *ntdb);
			
 
				+ntdb_bool_err ntdb_needs_recovery(struct ntdb_context *ntdb);
			
 
				+
			
 
				+struct ntdb_context {
			
 
				+	/* Single list of all TDBs, to detect multiple opens. */
			
 
				+	struct ntdb_context *next;
			
 
				+
			
 
				+	/* Filename of the database. */
			
 
				+	const char *name;
			
 
				+
			
 
				+	/* Logging function */
			
 
				+	void (*log_fn)(struct ntdb_context *ntdb,
			
 
				+		       enum ntdb_log_level level,
			
 
				+		       enum NTDB_ERROR ecode,
			
 
				+		       const char *message,
			
 
				+		       void *data);
			
 
				+	void *log_data;
			
 
				+
			
 
				+	/* Open flags passed to ntdb_open. */
			
 
				+	int open_flags;
			
 
				+
			
 
				+	/* low level (fcntl) lock functions. */
			
 
				+	int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
			
 
				+	int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
			
 
				+	void *lock_data;
			
 
				+
			
 
				+	/* the ntdb flags passed to ntdb_open. */
			
 
				+	uint32_t flags;
			
 
				+
			
 
				+	/* Our statistics. */
			
 
				+	struct ntdb_attribute_stats stats;
			
 
				+
			
 
				+	/* The actual file information */
			
 
				+	struct ntdb_file *file;
			
 
				+
			
 
				+	/* Hash function. */
			
 
				+	uint32_t (*hash_fn)(const void *key, size_t len, uint32_t seed, void *);
			
 
				+	void *hash_data;
			
 
				+	uint32_t hash_seed;
			
 
				+	/* Bits in toplevel hash table. */
			
 
				+	unsigned int hash_bits;
			
 
				+
			
 
				+	/* Allocate and free functions. */
			
 
				+	void *(*alloc_fn)(const void *owner, size_t len, void *priv_data);
			
 
				+	void *(*expand_fn)(void *old, size_t newlen, void *priv_data);
			
 
				+	void (*free_fn)(void *old, void *priv_data);
			
 
				+	void *alloc_data;
			
 
				+
			
 
				+	/* Our open hook, if any. */
			
 
				+	enum NTDB_ERROR (*openhook)(int fd, void *data);
			
 
				+	void *openhook_data;
			
 
				+
			
 
				+	/* Set if we are in a transaction. */
			
 
				+	struct ntdb_transaction *transaction;
			
 
				+
			
 
				+	/* What free table are we using? */
			
 
				+	ntdb_off_t ftable_off;
			
 
				+	unsigned int ftable;
			
 
				+
			
 
				+	/* IO methods: changes for transactions. */
			
 
				+	const struct ntdb_methods *io;
			
 
				+
			
 
				+	/* Direct access information */
			
 
				+	struct ntdb_access_hdr *access;
			
 
				+};
			
 
				+
			
 
				+/* ntdb.c: */
			
 
				+enum NTDB_ERROR COLD PRINTF_FMT(4, 5)
			
 
				+	ntdb_logerr(struct ntdb_context *ntdb,
			
 
				+		    enum NTDB_ERROR ecode,
			
 
				+		    enum ntdb_log_level level,
			
 
				+		    const char *fmt, ...);
			
 
				+
			
 
				+/*
				+ * Bounds check for the range [off, off + len).
				+ *
				+ * Fast path: when the range does not wrap around, ends within the
				+ * currently mapped size and the caller is not probing, succeed without
				+ * leaving this inline function.  Every other case is handed to the
				+ * io->oob method.
				+ */
				+static inline enum NTDB_ERROR ntdb_oob(struct ntdb_context *ntdb,
				+				       ntdb_off_t off, ntdb_len_t len,
				+				       bool probe)
				+{
				+	const ntdb_off_t end = off + len;
				+	const bool in_map = likely(end >= off)
				+		&& likely(end <= ntdb->file->map_size);
				+
				+	if (in_map && likely(!probe)) {
				+		return NTDB_SUCCESS;
				+	}
				+
				+	return ntdb->io->oob(ntdb, off, len, probe);
				+}
			
 
				+
			
 
				+/* Read the offset stored at 'off' through the current io methods. */
				+static inline ntdb_off_t ntdb_read_off(struct ntdb_context *ntdb,
				+				       ntdb_off_t off)
				+{
				+	const struct ntdb_methods *methods = ntdb->io;
				+
				+	return methods->read_off(ntdb, off);
				+}
			
 
				+
			
 
				+/* Store the offset 'val' at 'off' through the current io methods. */
				+static inline enum NTDB_ERROR ntdb_write_off(struct ntdb_context *ntdb,
				+					     ntdb_off_t off,
				+					     ntdb_off_t val)
				+{
				+	const struct ntdb_methods *methods = ntdb->io;
				+
				+	return methods->write_off(ntdb, off, val);
				+}
			
 
				+
			
 
				+#ifdef NTDB_TRACE
			
 
				+void ntdb_trace(struct ntdb_context *ntdb, const char *op);
			
 
				+void ntdb_trace_seqnum(struct ntdb_context *ntdb, uint32_t seqnum, const char *op);
			
 
				+void ntdb_trace_open(struct ntdb_context *ntdb, const char *op,
			
 
				+		     unsigned hash_size, unsigned ntdb_flags, unsigned open_flags);
			
 
				+void ntdb_trace_ret(struct ntdb_context *ntdb, const char *op, int ret);
			
 
				+void ntdb_trace_retrec(struct ntdb_context *ntdb, const char *op, NTDB_DATA ret);
			
 
				+void ntdb_trace_1rec(struct ntdb_context *ntdb, const char *op,
			
 
				+		     NTDB_DATA rec);
			
 
				+void ntdb_trace_1rec_ret(struct ntdb_context *ntdb, const char *op,
			
 
				+			 NTDB_DATA rec, int ret);
			
 
				+void ntdb_trace_1rec_retrec(struct ntdb_context *ntdb, const char *op,
			
 
				+			    NTDB_DATA rec, NTDB_DATA ret);
			
 
				+void ntdb_trace_2rec_flag_ret(struct ntdb_context *ntdb, const char *op,
			
 
				+			      NTDB_DATA rec1, NTDB_DATA rec2, unsigned flag,
			
 
				+			      int ret);
			
 
				+void ntdb_trace_2rec_retrec(struct ntdb_context *ntdb, const char *op,
			
 
				+			    NTDB_DATA rec1, NTDB_DATA rec2, NTDB_DATA ret);
			
 
				+#else
			
 
				+#define ntdb_trace(ntdb, op)
			
 
				+#define ntdb_trace_seqnum(ntdb, seqnum, op)
			
 
				+#define ntdb_trace_open(ntdb, op, hash_size, ntdb_flags, open_flags)
			
 
				+#define ntdb_trace_ret(ntdb, op, ret)
			
 
				+#define ntdb_trace_retrec(ntdb, op, ret)
			
 
				+#define ntdb_trace_1rec(ntdb, op, rec)
			
 
				+#define ntdb_trace_1rec_ret(ntdb, op, rec, ret)
			
 
				+#define ntdb_trace_1rec_retrec(ntdb, op, rec, ret)
			
 
				+#define ntdb_trace_2rec_flag_ret(ntdb, op, rec1, rec2, flag, ret)
			
 
				+#define ntdb_trace_2rec_retrec(ntdb, op, rec1, rec2, ret)
			
 
				+#endif /* !NTDB_TRACE */
			
 
				+
			
 
				+#endif
			
--- a/ccan/ntdb/pyntdb.c
+++ b/ccan/ntdb/pyntdb.c
@@ -0,0 +1,643 @@
 
				+/*
			
 
				+   Unix SMB/CIFS implementation.
			
 
				+
			
 
				+   Python interface to ntdb.  Simply modified from tdb version.
			
 
				+
			
 
				+   Copyright (C) 2004-2006 Tim Potter <tpot@samba.org>
			
 
				+   Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
			
 
				+   Copyright (C) 2011 Rusty Russell <rusty@rustcorp.com.au>
			
 
				+
			
 
				+     ** NOTE! The following LGPL license applies to the ntdb
			
 
				+     ** library. This does NOT imply that all of Samba is released
			
 
				+     ** under the LGPL
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+
			
 
				+#include <Python.h>
			
 
				+#include "replace.h"
			
 
				+#include "system/filesys.h"
			
 
				+
			
 
				+/* Include ntdb headers */
			
 
				+#include <ntdb.h>
			
 
				+
			
 
				+typedef struct {
			
 
				+	PyObject_HEAD
			
 
				+	struct ntdb_context *ctx;
			
 
				+	bool closed;
			
 
				+} PyNtdbObject;
			
 
				+
			
 
				+static PyTypeObject PyNtdb;
			
 
				+
			
 
				+static void PyErr_SetTDBError(enum NTDB_ERROR e)
			
 
				+{
			
 
				+	PyErr_SetObject(PyExc_RuntimeError,
			
 
				+		Py_BuildValue("(i,s)", e, ntdb_errorstr(e)));
			
 
				+}
			
 
				+
			
 
				+static NTDB_DATA PyString_AsNtdb_Data(PyObject *data)
			
 
				+{
			
 
				+	NTDB_DATA ret;
			
 
				+	ret.dptr = (unsigned char *)PyString_AsString(data);
			
 
				+	ret.dsize = PyString_Size(data);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static PyObject *PyString_FromNtdb_Data(NTDB_DATA data)
			
 
				+{
			
 
				+	PyObject *ret = PyString_FromStringAndSize((const char *)data.dptr,
			
 
				+						   data.dsize);
			
 
				+	free(data.dptr);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				/*
				 * If `ret` is not NTDB_SUCCESS, raise the matching Python exception
				 * and return NULL from the enclosing function.
				 *
				 * Wrapped in do/while(0) so the macro is a single statement and is
				 * safe inside an unbraced if/else.
				 */
				#define PyErr_NTDB_ERROR_IS_ERR_RAISE(ret) \
					do { \
						if ((ret) != NTDB_SUCCESS) { \
							PyErr_SetTDBError(ret); \
							return NULL; \
						} \
					} while (0)

				/*
				 * If the database wrapper has been closed, raise
				 * RuntimeError((NTDB_ERR_EINVAL, "database is closed")) and return
				 * NULL from the enclosing function.
				 *
				 * PyErr_SetObject() does not steal the value, so the tuple is released
				 * here; if it could not be built, Py_BuildValue()'s own exception is
				 * left in place.
				 */
				#define PyNtdb_CHECK_CLOSED(pyobj) \
					do { \
						if ((pyobj)->closed) { \
							PyObject *closed_err_ = Py_BuildValue("(i,s)", \
								NTDB_ERR_EINVAL, "database is closed"); \
							if (closed_err_ != NULL) { \
								PyErr_SetObject(PyExc_RuntimeError, \
										closed_err_); \
								Py_DECREF(closed_err_); \
							} \
							return NULL; \
						} \
					} while (0)
			
 
				+
			
 
				+static void stderr_log(struct ntdb_context *ntdb,
			
 
				+		       enum ntdb_log_level level,
			
 
				+		       enum NTDB_ERROR ecode,
			
 
				+		       const char *message,
			
 
				+		       void *data)
			
 
				+{
			
 
				+	fprintf(stderr, "%s:%s:%s\n",
			
 
				+		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
			
 
				+}
			
 
				+
			
 
				+static PyObject *py_ntdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
			
 
				+{
			
 
				+	char *name = NULL;
			
 
				+	int ntdb_flags = NTDB_DEFAULT, flags = O_RDWR, mode = 0600;
			
 
				+	struct ntdb_context *ctx;
			
 
				+	PyNtdbObject *ret;
			
 
				+	union ntdb_attribute logattr;
			
 
				+	const char *kwnames[] = { "name", "ntdb_flags", "flags", "mode", NULL };
			
 
				+
			
 
				+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siii", cast_const2(char **, kwnames), &name, &ntdb_flags, &flags, &mode))
			
 
				+		return NULL;
			
 
				+
			
 
				+	if (name == NULL) {
			
 
				+		ntdb_flags |= NTDB_INTERNAL;
			
 
				+		name = "<internal>";
			
 
				+	}
			
 
				+
			
 
				+	logattr.log.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+	logattr.log.base.next = NULL;
			
 
				+	logattr.log.fn = stderr_log;
			
 
				+	ctx = ntdb_open(name, ntdb_flags, flags, mode, &logattr);
			
 
				+	if (ctx == NULL) {
			
 
				+		PyErr_SetFromErrno(PyExc_IOError);
			
 
				+		return NULL;
			
 
				+	}
			
 
				+
			
 
				+	ret = PyObject_New(PyNtdbObject, &PyNtdb);
			
 
				+	if (!ret) {
			
 
				+		ntdb_close(ctx);
			
 
				+		return NULL;
			
 
				+	}
			
 
				+
			
 
				+	ret->ctx = ctx;
			
 
				+	ret->closed = false;
			
 
				+	return (PyObject *)ret;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_transaction_cancel(PyNtdbObject *self)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ntdb_transaction_cancel(self->ctx);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_transaction_commit(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_transaction_commit(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_transaction_prepare_commit(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_transaction_prepare_commit(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_transaction_start(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_transaction_start(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_lockall(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_lockall(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_unlockall(PyNtdbObject *self)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ntdb_unlockall(self->ctx);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_lockall_read(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_lockall_read(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_unlockall_read(PyNtdbObject *self)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ntdb_unlockall_read(self->ctx);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_close(PyNtdbObject *self)
			
 
				+{
			
 
				+	int ret;
			
 
				+	if (self->closed)
			
 
				+		Py_RETURN_NONE;
			
 
				+	ret = ntdb_close(self->ctx);
			
 
				+	self->closed = true;
			
 
				+	if (ret != 0) {
			
 
				+		PyErr_SetTDBError(NTDB_ERR_IO);
			
 
				+		return NULL;
			
 
				+	}
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_get(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key, data;
			
 
				+	PyObject *py_key;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "O", &py_key))
			
 
				+		return NULL;
			
 
				+
			
 
				+	key = PyString_AsNtdb_Data(py_key);
			
 
				+	ret = ntdb_fetch(self->ctx, key, &data);
			
 
				+	if (ret == NTDB_ERR_NOEXIST)
			
 
				+		Py_RETURN_NONE;
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	return PyString_FromNtdb_Data(data);
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_append(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key, data;
			
 
				+	PyObject *py_key, *py_data;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data))
			
 
				+		return NULL;
			
 
				+
			
 
				+	key = PyString_AsNtdb_Data(py_key);
			
 
				+	data = PyString_AsNtdb_Data(py_data);
			
 
				+
			
 
				+	ret = ntdb_append(self->ctx, key, data);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_firstkey(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	NTDB_DATA key;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	ret = ntdb_firstkey(self->ctx, &key);
			
 
				+	if (ret == NTDB_ERR_NOEXIST)
			
 
				+		Py_RETURN_NONE;
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+
			
 
				+	return PyString_FromNtdb_Data(key);
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_nextkey(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key;
			
 
				+	PyObject *py_key;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "O", &py_key))
			
 
				+		return NULL;
			
 
				+
			
 
				+	/* Malloc here, since ntdb_nextkey frees. */
			
 
				+	key.dsize = PyString_Size(py_key);
			
 
				+	key.dptr = malloc(key.dsize);
			
 
				+	memcpy(key.dptr, PyString_AsString(py_key), key.dsize);
			
 
				+
			
 
				+	ret = ntdb_nextkey(self->ctx, &key);
			
 
				+	if (ret == NTDB_ERR_NOEXIST)
			
 
				+		Py_RETURN_NONE;
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+
			
 
				+	return PyString_FromNtdb_Data(key);
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_delete(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key;
			
 
				+	PyObject *py_key;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "O", &py_key))
			
 
				+		return NULL;
			
 
				+
			
 
				+	key = PyString_AsNtdb_Data(py_key);
			
 
				+	ret = ntdb_delete(self->ctx, key);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_has_key(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key;
			
 
				+	PyObject *py_key;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "O", &py_key))
			
 
				+		return NULL;
			
 
				+
			
 
				+	key = PyString_AsNtdb_Data(py_key);
			
 
				+	if (ntdb_exists(self->ctx, key))
			
 
				+		return Py_True;
			
 
				+	return Py_False;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_store(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	NTDB_DATA key, value;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	int flag = NTDB_REPLACE;
			
 
				+	PyObject *py_key, *py_value;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag))
			
 
				+		return NULL;
			
 
				+
			
 
				+	key = PyString_AsNtdb_Data(py_key);
			
 
				+	value = PyString_AsNtdb_Data(py_value);
			
 
				+
			
 
				+	ret = ntdb_store(self->ctx, key, value, flag);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_add_flag(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	unsigned flag;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "I", &flag))
			
 
				+		return NULL;
			
 
				+
			
 
				+	ntdb_add_flag(self->ctx, flag);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_remove_flag(PyNtdbObject *self, PyObject *args)
			
 
				+{
			
 
				+	unsigned flag;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyArg_ParseTuple(args, "I", &flag))
			
 
				+		return NULL;
			
 
				+
			
 
				+	ntdb_remove_flag(self->ctx, flag);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+typedef struct {
			
 
				+	PyObject_HEAD
			
 
				+	NTDB_DATA current;
			
 
				+	bool end;
			
 
				+	PyNtdbObject *iteratee;
			
 
				+} PyNtdbIteratorObject;
			
 
				+
			
 
				+static PyObject *ntdb_iter_next(PyNtdbIteratorObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR e;
			
 
				+	PyObject *ret;
			
 
				+	if (self->end)
			
 
				+		return NULL;
			
 
				+	ret = PyString_FromStringAndSize((const char *)self->current.dptr,
			
 
				+					 self->current.dsize);
			
 
				+	e = ntdb_nextkey(self->iteratee->ctx, &self->current);
			
 
				+	if (e == NTDB_ERR_NOEXIST)
			
 
				+		self->end = true;
			
 
				+	else
			
 
				+		PyErr_NTDB_ERROR_IS_ERR_RAISE(e);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static void ntdb_iter_dealloc(PyNtdbIteratorObject *self)
			
 
				+{
			
 
				+	Py_DECREF(self->iteratee);
			
 
				+	PyObject_Del(self);
			
 
				+}
			
 
				+
			
 
				+PyTypeObject PyNtdbIterator = {
			
 
				+	.tp_name = "Iterator",
			
 
				+	.tp_basicsize = sizeof(PyNtdbIteratorObject),
			
 
				+	.tp_iternext = (iternextfunc)ntdb_iter_next,
			
 
				+	.tp_dealloc = (destructor)ntdb_iter_dealloc,
			
 
				+	.tp_flags = Py_TPFLAGS_DEFAULT,
			
 
				+	.tp_iter = PyObject_SelfIter,
			
 
				+};
			
 
				+
			
 
				+static PyObject *ntdb_object_iter(PyNtdbObject *self)
			
 
				+{
			
 
				+	PyNtdbIteratorObject *ret;
			
 
				+	enum NTDB_ERROR e;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	ret = PyObject_New(PyNtdbIteratorObject, &PyNtdbIterator);
			
 
				+	if (!ret)
			
 
				+		return NULL;
			
 
				+	e = ntdb_firstkey(self->ctx, &ret->current);
			
 
				+	if (e == NTDB_ERR_NOEXIST) {
			
 
				+		ret->end = true;
			
 
				+	} else {
			
 
				+		PyErr_NTDB_ERROR_IS_ERR_RAISE(e);
			
 
				+		ret->end = false;
			
 
				+	}
			
 
				+	ret->iteratee = self;
			
 
				+	Py_INCREF(self);
			
 
				+	return (PyObject *)ret;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_clear(PyNtdbObject *self)
			
 
				+{
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ret = ntdb_wipe_all(self->ctx);
			
 
				+	PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_enable_seqnum(PyNtdbObject *self)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	ntdb_add_flag(self->ctx, NTDB_SEQNUM);
			
 
				+	Py_RETURN_NONE;
			
 
				+}
			
 
				+
			
 
				+static PyMethodDef ntdb_object_methods[] = {
			
 
				+	{ "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS,
			
 
				+		"S.transaction_cancel() -> None\n"
			
 
				+		"Cancel the currently active transaction." },
			
 
				+	{ "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
			
 
				+		"S.transaction_commit() -> None\n"
			
 
				+		"Commit the currently active transaction." },
			
 
				+	{ "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
			
 
				+		"S.transaction_prepare_commit() -> None\n"
			
 
				+		"Prepare to commit the currently active transaction" },
			
 
				+	{ "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
			
 
				+		"S.transaction_start() -> None\n"
			
 
				+		"Start a new transaction." },
			
 
				+	{ "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
			
 
				+	{ "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
			
 
				+	{ "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
			
 
				+	{ "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
			
 
				+	{ "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
			
 
				+	{ "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
			
 
				+		"Fetch a value." },
			
 
				+	{ "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
			
 
				+		"Append data to an existing key." },
			
 
				+	{ "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
			
 
				+		"Return the first key in this database." },
			
 
				+	{ "nextkey", (PyCFunction)obj_nextkey, METH_NOARGS, "S.nextkey(key) -> data\n"
			
 
				+		"Return the next key in this database." },
			
 
				+	{ "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
			
 
				+		"Delete an entry." },
			
 
				+	{ "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> None\n"
			
 
				+		"Check whether key exists in this database." },
			
 
				+	{ "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None"
			
 
				+		"Store data." },
			
 
				+	{ "add_flag", (PyCFunction)obj_add_flag, METH_VARARGS, "S.add_flag(flag) -> None" },
			
 
				+	{ "remove_flag", (PyCFunction)obj_remove_flag, METH_VARARGS, "S.remove_flag(flag) -> None" },
			
 
				+	{ "iterkeys", (PyCFunction)ntdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
			
 
				+	{ "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
			
 
				+		"Wipe the entire database." },
			
 
				+	{ "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
			
 
				+		"S.enable_seqnum() -> None" },
			
 
				+	{ NULL }
			
 
				+};
			
 
				+
			
 
				+static PyObject *obj_get_flags(PyNtdbObject *self, void *closure)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	return PyInt_FromLong(ntdb_get_flags(self->ctx));
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_get_filename(PyNtdbObject *self, void *closure)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	return PyString_FromString(ntdb_name(self->ctx));
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_get_seqnum(PyNtdbObject *self, void *closure)
			
 
				+{
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+	return PyInt_FromLong(ntdb_get_seqnum(self->ctx));
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static PyGetSetDef ntdb_object_getsetters[] = {
			
 
				+	{ cast_const(char *, "flags"), (getter)obj_get_flags, NULL, NULL },
			
 
				+	{ cast_const(char *, "filename"), (getter)obj_get_filename, NULL,
			
 
				+	  cast_const(char *, "The filename of this NTDB file.")},
			
 
				+	{ cast_const(char *, "seqnum"), (getter)obj_get_seqnum, NULL, NULL },
			
 
				+	{ NULL }
			
 
				+};
			
 
				+
			
 
				+static PyObject *ntdb_object_repr(PyNtdbObject *self)
			
 
				+{
			
 
				+	if (ntdb_get_flags(self->ctx) & NTDB_INTERNAL) {
			
 
				+		return PyString_FromString("Ntdb(<internal>)");
			
 
				+	} else {
			
 
				+		return PyString_FromFormat("Ntdb('%s')", ntdb_name(self->ctx));
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void ntdb_object_dealloc(PyNtdbObject *self)
			
 
				+{
			
 
				+	if (!self->closed)
			
 
				+		ntdb_close(self->ctx);
			
 
				+	self->ob_type->tp_free(self);
			
 
				+}
			
 
				+
			
 
				+static PyObject *obj_getitem(PyNtdbObject *self, PyObject *key)
			
 
				+{
			
 
				+	NTDB_DATA tkey, val;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+
			
 
				+	PyNtdb_CHECK_CLOSED(self);
			
 
				+
			
 
				+	if (!PyString_Check(key)) {
			
 
				+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
			
 
				+		return NULL;
			
 
				+	}
			
 
				+
			
 
				+	tkey.dptr = (unsigned char *)PyString_AsString(key);
			
 
				+	tkey.dsize = PyString_Size(key);
			
 
				+
			
 
				+	ret = ntdb_fetch(self->ctx, tkey, &val);
			
 
				+	if (ret == NTDB_ERR_NOEXIST) {
			
 
				+		PyErr_SetString(PyExc_KeyError, "No such NTDB entry");
			
 
				+		return NULL;
			
 
				+	} else {
			
 
				+		PyErr_NTDB_ERROR_IS_ERR_RAISE(ret);
			
 
				+		return PyString_FromNtdb_Data(val);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int obj_setitem(PyNtdbObject *self, PyObject *key, PyObject *value)
			
 
				+{
			
 
				+	NTDB_DATA tkey, tval;
			
 
				+	enum NTDB_ERROR ret;
			
 
				+	if (self->closed) {
			
 
				+		PyErr_SetObject(PyExc_RuntimeError,
			
 
				+			Py_BuildValue("(i,s)", NTDB_ERR_EINVAL, "database is closed"));
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	if (!PyString_Check(key)) {
			
 
				+		PyErr_SetString(PyExc_TypeError, "Expected string as key");
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	tkey = PyString_AsNtdb_Data(key);
			
 
				+
			
 
				+	if (value == NULL) {
			
 
				+		ret = ntdb_delete(self->ctx, tkey);
			
 
				+	} else {
			
 
				+		if (!PyString_Check(value)) {
			
 
				+			PyErr_SetString(PyExc_TypeError, "Expected string as value");
			
 
				+			return -1;
			
 
				+		}
			
 
				+
			
 
				+		tval = PyString_AsNtdb_Data(value);
			
 
				+
			
 
				+		ret = ntdb_store(self->ctx, tkey, tval, NTDB_REPLACE);
			
 
				+	}
			
 
				+
			
 
				+	if (ret != NTDB_SUCCESS) {
			
 
				+		PyErr_SetTDBError(ret);
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static PyMappingMethods ntdb_object_mapping = {
			
 
				+	.mp_subscript = (binaryfunc)obj_getitem,
			
 
				+	.mp_ass_subscript = (objobjargproc)obj_setitem,
			
 
				+};
			
 
				+
			
 
				+static PyTypeObject PyNtdb = {
			
 
				+	.tp_name = "ntdb.Ntdb",
			
 
				+	.tp_basicsize = sizeof(PyNtdbObject),
			
 
				+	.tp_methods = ntdb_object_methods,
			
 
				+	.tp_getset = ntdb_object_getsetters,
			
 
				+	.tp_new = py_ntdb_open,
			
 
				+	.tp_doc = "A NTDB file",
			
 
				+	.tp_repr = (reprfunc)ntdb_object_repr,
			
 
				+	.tp_dealloc = (destructor)ntdb_object_dealloc,
			
 
				+	.tp_as_mapping = &ntdb_object_mapping,
			
 
				+	.tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_ITER,
			
 
				+	.tp_iter = (getiterfunc)ntdb_object_iter,
			
 
				+};
			
 
				+
			
 
				+static PyMethodDef ntdb_methods[] = {
			
 
				+	{ "open", (PyCFunction)py_ntdb_open, METH_VARARGS|METH_KEYWORDS, "open(name, hash_size=0, ntdb_flags=NTDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
			
 
				+		"Open a NTDB file." },
			
 
				+	{ NULL }
			
 
				+};
			
 
				+
			
 
				+void initntdb(void);
			
 
				+void initntdb(void)
			
 
				+{
			
 
				+	PyObject *m;
			
 
				+
			
 
				+	if (PyType_Ready(&PyNtdb) < 0)
			
 
				+		return;
			
 
				+
			
 
				+	if (PyType_Ready(&PyNtdbIterator) < 0)
			
 
				+		return;
			
 
				+
			
 
				+	m = Py_InitModule3("ntdb", ntdb_methods, "NTDB is a simple key-value database similar to GDBM that supports multiple writers.");
			
 
				+	if (m == NULL)
			
 
				+		return;
			
 
				+
			
 
				+	PyModule_AddObject(m, "REPLACE", PyInt_FromLong(NTDB_REPLACE));
			
 
				+	PyModule_AddObject(m, "INSERT", PyInt_FromLong(NTDB_INSERT));
			
 
				+	PyModule_AddObject(m, "MODIFY", PyInt_FromLong(NTDB_MODIFY));
			
 
				+
			
 
				+	PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(NTDB_DEFAULT));
			
 
				+	PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(NTDB_INTERNAL));
			
 
				+	PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(NTDB_NOLOCK));
			
 
				+	PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(NTDB_NOMMAP));
			
 
				+	PyModule_AddObject(m, "CONVERT", PyInt_FromLong(NTDB_CONVERT));
			
 
				+	PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(NTDB_NOSYNC));
			
 
				+	PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(NTDB_SEQNUM));
			
 
				+	PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(NTDB_ALLOW_NESTING));
			
 
				+
			
 
				+	PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
			
 
				+
			
 
				+	PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
			
 
				+
			
 
				+	Py_INCREF(&PyNtdb);
			
 
				+	PyModule_AddObject(m, "Ntdb", (PyObject *)&PyNtdb);
			
 
				+
			
 
				+	Py_INCREF(&PyNtdbIterator);
			
 
				+}
			
--- a/ccan/ntdb/summary.c
+++ b/ccan/ntdb/summary.c
@@ -0,0 +1,321 @@
 
				+ /*
			
 
				+   Trivial Database 2: human-readable summary code
			
 
				+   Copyright (C) Rusty Russell 2010
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include "private.h"
			
 
				+#include <ccan/tally/tally.h>
			
 
				+
			
 
				/*
				 * printf-style templates for the human-readable summary.  Each "%s"
				 * that follows a "Smallest/average/largest" line is a slot for an
				 * optional string argument.
				 */
				#define SUMMARY_FORMAT							\
					"Size of file/data: %zu/%zu\n"					\
					"Number of records: %zu\n"					\
					"Smallest/average/largest keys: %zu/%zu/%zu\n%s"		\
					"Smallest/average/largest data: %zu/%zu/%zu\n%s"		\
					"Smallest/average/largest padding: %zu/%zu/%zu\n%s"		\
					"Number of free records: %zu\n"					\
					"Smallest/average/largest free records: %zu/%zu/%zu\n%s"	\
					"Number of uncoalesced records: %zu\n"				\
					"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s"	\
					"Toplevel hash used: %u of %u\n"				\
					"Number of hashes: %zu\n"					\
					"Smallest/average/largest hash chains: %zu/%zu/%zu\n%s"	\
					"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"

				/* Per-bucket line for a single free bucket. */
				#define BUCKET_SUMMARY_FORMAT_A						\
					"Free bucket %zu: total entries %zu.\n"				\
					"Smallest/average/largest length: %zu/%zu/%zu\n%s"
				/* Per-bucket line for a range of free buckets. */
				#define BUCKET_SUMMARY_FORMAT_B						\
					"Free bucket %zu-%zu: total entries %zu.\n"			\
					"Smallest/average/largest length: %zu/%zu/%zu\n%s"
				/* One line per capability record: number plus a suffix string. */
				#define CAPABILITY_FORMAT						\
					"Capability %llu%s\n"

				/* Histogram dimensions passed to the tally helpers. */
				#define HISTO_WIDTH 70
				#define HISTO_HEIGHT 20
			
 
				+
			
 
				+static ntdb_off_t count_hash(struct ntdb_context *ntdb,
			
 
				+			     ntdb_off_t hash_off,
			
 
				+			     ntdb_off_t num)
			
 
				+{
			
 
				+	const ntdb_off_t *h;
			
 
				+	ntdb_off_t i, count = 0;
			
 
				+
			
 
				+	h = ntdb_access_read(ntdb, hash_off, sizeof(*h) * num, true);
			
 
				+	if (NTDB_PTR_IS_ERR(h)) {
			
 
				+		return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(h));
			
 
				+	}
			
 
				+	for (i = 0; i < num; i++)
			
 
				+		count += (h[i] != 0);
			
 
				+
			
 
				+	ntdb_access_release(ntdb, h);
			
 
				+	return count;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR summarize(struct ntdb_context *ntdb,
			
 
				+				struct tally *ftables,
			
 
				+				struct tally *fr,
			
 
				+				struct tally *keys,
			
 
				+				struct tally *data,
			
 
				+				struct tally *extra,
			
 
				+				struct tally *uncoal,
			
 
				+				struct tally *hashes,
			
 
				+				size_t *num_caps)
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	ntdb_len_t len;
			
 
				+	ntdb_len_t unc = 0;
			
 
				+
			
 
				+	for (off = sizeof(struct ntdb_header);
			
 
				+	     off < ntdb->file->map_size;
			
 
				+	     off += len) {
			
 
				+		const union {
			
 
				+			struct ntdb_used_record u;
			
 
				+			struct ntdb_free_record f;
			
 
				+			struct ntdb_recovery_record r;
			
 
				+		} *p;
			
 
				+		/* We might not be able to get the whole thing. */
			
 
				+		p = ntdb_access_read(ntdb, off, sizeof(p->f), true);
			
 
				+		if (NTDB_PTR_IS_ERR(p)) {
			
 
				+			return NTDB_PTR_ERR(p);
			
 
				+		}
			
 
				+		if (frec_magic(&p->f) != NTDB_FREE_MAGIC) {
			
 
				+			if (unc > 1) {
			
 
				+				tally_add(uncoal, unc);
			
 
				+				unc = 0;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if (p->r.magic == NTDB_RECOVERY_INVALID_MAGIC
			
 
				+		    || p->r.magic == NTDB_RECOVERY_MAGIC) {
			
 
				+			len = sizeof(p->r) + p->r.max_len;
			
 
				+		} else if (frec_magic(&p->f) == NTDB_FREE_MAGIC) {
			
 
				+			len = frec_len(&p->f);
			
 
				+			tally_add(fr, len);
			
 
				+			len += sizeof(p->u);
			
 
				+			unc++;
			
 
				+		} else if (rec_magic(&p->u) == NTDB_USED_MAGIC) {
			
 
				+			len = sizeof(p->u)
			
 
				+				+ rec_key_length(&p->u)
			
 
				+				+ rec_data_length(&p->u)
			
 
				+				+ rec_extra_padding(&p->u);
			
 
				+
			
 
				+			tally_add(keys, rec_key_length(&p->u));
			
 
				+			tally_add(data, rec_data_length(&p->u));
			
 
				+			tally_add(extra, rec_extra_padding(&p->u));
			
 
				+		} else if (rec_magic(&p->u) == NTDB_HTABLE_MAGIC) {
			
 
				+			ntdb_off_t count = count_hash(ntdb,
			
 
				+						      off + sizeof(p->u),
			
 
				+						      1 << ntdb->hash_bits);
			
 
				+			if (NTDB_OFF_IS_ERR(count)) {
			
 
				+				return NTDB_OFF_TO_ERR(count);
			
 
				+			}
			
 
				+			tally_add(hashes, count);
			
 
				+			tally_add(extra, rec_extra_padding(&p->u));
			
 
				+			len = sizeof(p->u)
			
 
				+				+ rec_data_length(&p->u)
			
 
				+				+ rec_extra_padding(&p->u);
			
 
				+		} else if (rec_magic(&p->u) == NTDB_FTABLE_MAGIC) {
			
 
				+			len = sizeof(p->u)
			
 
				+				+ rec_data_length(&p->u)
			
 
				+				+ rec_extra_padding(&p->u);
			
 
				+			tally_add(ftables, rec_data_length(&p->u));
			
 
				+			tally_add(extra, rec_extra_padding(&p->u));
			
 
				+		} else if (rec_magic(&p->u) == NTDB_CHAIN_MAGIC) {
			
 
				+			len = sizeof(p->u)
			
 
				+				+ rec_data_length(&p->u)
			
 
				+				+ rec_extra_padding(&p->u);
			
 
				+			tally_add(hashes,
			
 
				+				  rec_data_length(&p->u)/sizeof(ntdb_off_t));
			
 
				+			tally_add(extra, rec_extra_padding(&p->u));
			
 
				+		} else if (rec_magic(&p->u) == NTDB_CAP_MAGIC) {
			
 
				+			len = sizeof(p->u)
			
 
				+				+ rec_data_length(&p->u)
			
 
				+				+ rec_extra_padding(&p->u);
			
 
				+			(*num_caps)++;
			
 
				+		} else {
			
 
				+			len = dead_space(ntdb, off);
			
 
				+			if (NTDB_OFF_IS_ERR(len)) {
			
 
				+				return NTDB_OFF_TO_ERR(len);
			
 
				+			}
			
 
				+		}
			
 
				+		ntdb_access_release(ntdb, p);
			
 
				+	}
			
 
				+	if (unc)
			
 
				+		tally_add(uncoal, unc);
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static void add_capabilities(struct ntdb_context *ntdb, char *summary)
			
 
				+{
			
 
				+	ntdb_off_t off, next;
			
 
				+	const struct ntdb_capability *cap;
			
 
				+	size_t count = 0;
			
 
				+
			
 
				+	/* Append to summary. */
			
 
				+	summary += strlen(summary);
			
 
				+
			
 
				+	off = ntdb_read_off(ntdb, offsetof(struct ntdb_header, capabilities));
			
 
				+	if (NTDB_OFF_IS_ERR(off))
			
 
				+		return;
			
 
				+
			
 
				+	/* Walk capability list. */
			
 
				+	for (; off; off = next) {
			
 
				+		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
			
 
				+		if (NTDB_PTR_IS_ERR(cap)) {
			
 
				+			break;
			
 
				+		}
			
 
				+		count++;
			
 
				+		sprintf(summary, CAPABILITY_FORMAT,
			
 
				+			cap->type & NTDB_CAP_TYPE_MASK,
			
 
				+			/* Noopen?  How did we get here? */
			
 
				+			(cap->type & NTDB_CAP_NOOPEN) ? " (unopenable)"
			
 
				+			: ((cap->type & NTDB_CAP_NOWRITE)
			
 
				+			   && (cap->type & NTDB_CAP_NOCHECK)) ? " (uncheckable,read-only)"
			
 
				+			: (cap->type & NTDB_CAP_NOWRITE) ? " (read-only)"
			
 
				+			: (cap->type & NTDB_CAP_NOCHECK) ? " (uncheckable)"
			
 
				+			: "");
			
 
				+		summary += strlen(summary);
			
 
				+		next = cap->next;
			
 
				+		ntdb_access_release(ntdb, cap);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+_PUBLIC_ enum NTDB_ERROR ntdb_summary(struct ntdb_context *ntdb,
			
 
				+			   enum ntdb_summary_flags flags,
			
 
				+			   char **summary)
			
 
				+{
			
 
				+	ntdb_len_t len;
			
 
				+	size_t num_caps = 0;
			
 
				+	struct tally *ftables, *freet, *keys, *data, *extra, *uncoal, *hashes;
			
 
				+	char *freeg, *keysg, *datag, *extrag, *uncoalg, *hashesg;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	freeg = keysg = datag = extrag = uncoalg = hashesg = NULL;
			
 
				+
			
 
				+	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		ntdb_allrecord_unlock(ntdb, F_RDLCK);
			
 
				+		return ecode;
			
 
				+	}
			
 
				+
			
 
				+	/* Start stats off empty. */
			
 
				+	ftables = tally_new(HISTO_HEIGHT);
			
 
				+	freet = tally_new(HISTO_HEIGHT);
			
 
				+	keys = tally_new(HISTO_HEIGHT);
			
 
				+	data = tally_new(HISTO_HEIGHT);
			
 
				+	extra = tally_new(HISTO_HEIGHT);
			
 
				+	uncoal = tally_new(HISTO_HEIGHT);
			
 
				+	hashes = tally_new(HISTO_HEIGHT);
			
 
				+	if (!ftables || !freet || !keys || !data || !extra
			
 
				+	    || !uncoal || !hashes) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_summary: failed to allocate"
			
 
				+				   " tally structures");
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	ecode = summarize(ntdb, ftables, freet, keys, data, extra,
			
 
				+			  uncoal, hashes, &num_caps);
			
 
				+	if (ecode != NTDB_SUCCESS) {
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	if (flags & NTDB_SUMMARY_HISTOGRAMS) {
			
 
				+		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
			
 
				+	}
			
 
				+
			
 
				+	/* 20 is max length of a %llu. */
			
 
				+	len = strlen(SUMMARY_FORMAT) + 33*20 + 1
			
 
				+		+ (freeg ? strlen(freeg) : 0)
			
 
				+		+ (keysg ? strlen(keysg) : 0)
			
 
				+		+ (datag ? strlen(datag) : 0)
			
 
				+		+ (extrag ? strlen(extrag) : 0)
			
 
				+		+ (uncoalg ? strlen(uncoalg) : 0)
			
 
				+		+ (hashesg ? strlen(hashesg) : 0)
			
 
				+		+ num_caps * (strlen(CAPABILITY_FORMAT) + 20
			
 
				+			      + strlen(" (uncheckable,read-only)"));
			
 
				+
			
 
				+	*summary = ntdb->alloc_fn(ntdb, len, ntdb->alloc_data);
			
 
				+	if (!*summary) {
			
 
				+		ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
			
 
				+				   "ntdb_summary: failed to allocate string");
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	sprintf(*summary, SUMMARY_FORMAT,
			
 
				+		(size_t)ntdb->file->map_size,
			
 
				+		tally_total(keys, NULL) + tally_total(data, NULL),
			
 
				+		tally_num(keys),
			
 
				+		tally_min(keys), tally_mean(keys), tally_max(keys),
			
 
				+		keysg ? keysg : "",
			
 
				+		tally_min(data), tally_mean(data), tally_max(data),
			
 
				+		datag ? datag : "",
			
 
				+		tally_min(extra), tally_mean(extra), tally_max(extra),
			
 
				+		extrag ? extrag : "",
			
 
				+		tally_num(freet),
			
 
				+		tally_min(freet), tally_mean(freet), tally_max(freet),
			
 
				+		freeg ? freeg : "",
			
 
				+		tally_total(uncoal, NULL),
			
 
				+		tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
			
 
				+		uncoalg ? uncoalg : "",
			
 
				+		(unsigned)count_hash(ntdb, sizeof(struct ntdb_header),
			
 
				+				     1 << ntdb->hash_bits),
			
 
				+		1 << ntdb->hash_bits,
			
 
				+		tally_num(hashes),
			
 
				+		tally_min(hashes), tally_mean(hashes), tally_max(hashes),
			
 
				+		hashesg ? hashesg : "",
			
 
				+		tally_total(keys, NULL) * 100.0 / ntdb->file->map_size,
			
 
				+		tally_total(data, NULL) * 100.0 / ntdb->file->map_size,
			
 
				+		tally_total(extra, NULL) * 100.0 / ntdb->file->map_size,
			
 
				+		tally_total(freet, NULL) * 100.0 / ntdb->file->map_size,
			
 
				+		(tally_num(keys) + tally_num(freet) + tally_num(hashes))
			
 
				+		* sizeof(struct ntdb_used_record) * 100.0 / ntdb->file->map_size,
			
 
				+		tally_num(ftables) * sizeof(struct ntdb_freetable)
			
 
				+		* 100.0 / ntdb->file->map_size,
			
 
				+		(tally_total(hashes, NULL) * sizeof(ntdb_off_t)
			
 
				+		 + (sizeof(ntdb_off_t) << ntdb->hash_bits))
			
 
				+		* 100.0 / ntdb->file->map_size);
			
 
				+
			
 
				+	add_capabilities(ntdb, *summary);
			
 
				+
			
 
				+unlock:
			
 
				+	ntdb->free_fn(freeg, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(keysg, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(datag, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(extrag, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(uncoalg, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(hashesg, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(freet, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(keys, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(data, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(extra, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(uncoal, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(ftables, ntdb->alloc_data);
			
 
				+	ntdb->free_fn(hashes, ntdb->alloc_data);
			
 
				+
			
 
				+	ntdb_allrecord_unlock(ntdb, F_RDLCK);
			
 
				+	ntdb_unlock_expand(ntdb, F_RDLCK);
			
 
				+	return ecode;
			
 
				+}
			
--- a/ccan/ntdb/test/api-12-store.c
+++ b/ccan/ntdb/test/api-12-store.c
@@ -0,0 +1,55 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/hash/hash.h>
			
 
				+
			
 
				+#include "logging.h"
			
 
				+
			
 
				/*
				 * Hash callback that ignores the database's own seed and always uses the
				 * 64-bit seed @p points to: the seed we originally saw a failure with.
				 */
				static uint32_t fixedhash(const void *key, size_t len, uint32_t seed, void *p)
				{
					const unsigned char *bytes = key;
					const uint64_t *fixed_seed = p;

					return hash64_stable(bytes, len, *fixed_seed);
				}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	uint64_t seed = 16014841315512641303ULL;
			
 
				+	union ntdb_attribute fixed_hattr
			
 
				+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+			      .fn = fixedhash,
			
 
				+			      .data = &seed } };
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
			
 
				+
			
 
				+	fixed_hattr.base.next = &tap_log_attr;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-12-store.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* We seemed to lose some keys.
			
 
				+		 * Insert and check they're in there! */
			
 
				+		for (j = 0; j < 500; j++) {
			
 
				+			NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_deq(d, data));
			
 
				+			free(d.dptr);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-13-delete.c
+++ b/ccan/ntdb/test/api-13-delete.c
@@ -0,0 +1,201 @@
 
				+#include "private.h" // For NTDB_TOPLEVEL_HASH_BITS
			
 
				+#include <ccan/hash/hash.h>
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				/*
				 * Rigged hash: the leading unsigned int of the key, halved, so that
				 * adjacent-numbered records always clash.  @len, @seed and @priv are
				 * ignored.
				 */
				static uint32_t clash(const void *key, size_t len, uint32_t seed, void *priv)
				{
					const unsigned int *number = key;

					return *number / 2;
				}
			
 
				+
			
 
				/*
				 * Hash with the seed stored behind @p (the one we saw a failure on)
				 * rather than the @seed the library passes in.
				 */
				static uint32_t fixedhash(const void *key, size_t len, uint32_t seed, void *p)
				{
					uint64_t stable_seed = *(uint64_t *)p;
					const unsigned char *raw = (const unsigned char *)key;

					return hash64_stable(raw, len, stable_seed);
				}
			
 
				+
			
 
				+static bool store_records(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int i;
			
 
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
			
 
				+	NTDB_DATA d, data = { (unsigned char *)&i, sizeof(i) };
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+			return false;
			
 
				+		ntdb_fetch(ntdb, key, &d);
			
 
				+		if (!ntdb_deq(d, data))
			
 
				+			return false;
			
 
				+		free(d.dptr);
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+static void test_val(struct ntdb_context *ntdb, uint64_t val)
			
 
				+{
			
 
				+	uint64_t v;
			
 
				+	NTDB_DATA key = { (unsigned char *)&v, sizeof(v) };
			
 
				+	NTDB_DATA d, data = { (unsigned char *)&v, sizeof(v) };
			
 
				+
			
 
				+	/* Insert an entry, then delete it. */
			
 
				+	v = val;
			
 
				+	/* Delete should fail. */
			
 
				+	ok1(ntdb_delete(ntdb, key) == NTDB_ERR_NOEXIST);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Insert should succeed. */
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Delete should succeed. */
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Re-add it, then add collision. */
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+	v = val + 1;
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Can find both? */
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+	v = val;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+
			
 
				+	/* Delete second one. */
			
 
				+	v = val + 1;
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Re-add */
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Now, try deleting first one. */
			
 
				+	v = val;
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Can still find second? */
			
 
				+	v = val + 1;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+
			
 
				+	/* Now, this will be ideally placed. */
			
 
				+	v = val + 2;
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* This will collide with both. */
			
 
				+	v = val;
			
 
				+	ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+
			
 
				+	/* We can still find them all, right? */
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+	v = val + 1;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+	v = val + 2;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+
			
 
				+	/* And if we delete val + 1, that val + 2 should not move! */
			
 
				+	v = val + 1;
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	v = val;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+	v = val + 2;
			
 
				+	ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+	ok1(d.dsize == data.dsize);
			
 
				+	free(d.dptr);
			
 
				+
			
 
				+	/* Delete those two, so we are empty. */
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+	v = val;
			
 
				+	ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	uint64_t seed = 16014841315512641303ULL;
			
 
				+	union ntdb_attribute clash_hattr
			
 
				+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+			      .fn = clash } };
			
 
				+	union ntdb_attribute fixed_hattr
			
 
				+		= { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+			      .fn = fixedhash,
			
 
				+			      .data = &seed } };
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	/* These two values gave trouble before. */
			
 
				+	int vals[] = { 755, 837 };
			
 
				+
			
 
				+	clash_hattr.base.next = &tap_log_attr;
			
 
				+	fixed_hattr.base.next = &tap_log_attr;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0])
			
 
				+		   * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-13-delete.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Check start of hash table. */
			
 
				+		test_val(ntdb, 0);
			
 
				+
			
 
				+		/* Check end of hash table. */
			
 
				+		test_val(ntdb, -1ULL);
			
 
				+
			
 
				+		/* Check mixed bitpattern. */
			
 
				+		test_val(ntdb, 0x123456789ABCDEF0ULL);
			
 
				+
			
 
				+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
			
 
				+				   && ntdb->file->num_lockrecs == 0));
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		/* Deleting these entries in the db gave problems. */
			
 
				+		ntdb = ntdb_open("run-13-delete.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		ok1(store_records(ntdb));
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
			
 
				+			NTDB_DATA key;
			
 
				+
			
 
				+			key.dptr = (unsigned char *)&vals[j];
			
 
				+			key.dsize = sizeof(vals[j]);
			
 
				+			ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-14-exists.c
+++ b/ccan/ntdb/test/api-14-exists.c
@@ -0,0 +1,52 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool test_records(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int i;
			
 
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (ntdb_exists(ntdb, key))
			
 
				+			return false;
			
 
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+			return false;
			
 
				+		if (!ntdb_exists(ntdb, key))
			
 
				+			return false;
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (!ntdb_exists(ntdb, key))
			
 
				+			return false;
			
 
				+		if (ntdb_delete(ntdb, key) != 0)
			
 
				+			return false;
			
 
				+		if (ntdb_exists(ntdb, key))
			
 
				+			return false;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-14-exists.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (ok1(ntdb))
			
 
				+			ok1(test_records(ntdb));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-16-wipe_all.c
+++ b/ccan/ntdb/test/api-16-wipe_all.c
@@ -0,0 +1,44 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool add_records(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int i;
			
 
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+			return false;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-16-wipe_all.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (ok1(ntdb)) {
			
 
				+			NTDB_DATA key;
			
 
				+			ok1(add_records(ntdb));
			
 
				+			ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_firstkey(ntdb, &key) == NTDB_ERR_NOEXIST);
			
 
				+			ntdb_close(ntdb);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-20-alloc-attr.c
+++ b/ccan/ntdb/test/api-20-alloc-attr.c
@@ -0,0 +1,106 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/hash/hash.h>
			
 
				+#include <assert.h>
			
 
				+
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static const struct ntdb_context *curr_ntdb;
			
 
				+static const struct ntdb_file *curr_file;
			
 
				+
			
 
				+static int owner_null_count,
			
 
				+	owner_weird_count, alloc_count, free_count, expand_count;
			
 
				+
			
 
				+static void *test_alloc(const void *owner, size_t len, void *priv_data)
			
 
				+{
			
 
				+	void *ret;
			
 
				+
			
 
				+	if (!owner) {
			
 
				+		owner_null_count++;
			
 
				+	} else if (owner != curr_ntdb && owner != curr_file) {
			
 
				+		owner_weird_count++;
			
 
				+	}
			
 
				+
			
 
				+	alloc_count++;
			
 
				+	ret = malloc(len);
			
 
				+
			
 
				+	/* The first time, this is the current ntdb, next is
			
 
				+	 * for the file struct. */
			
 
				+	if (!owner) {
			
 
				+		if (!curr_ntdb) {
			
 
				+			curr_ntdb = ret;
			
 
				+		} else if (!curr_file) {
			
 
				+			curr_file = ret;
			
 
				+		}
			
 
				+	}
			
 
				+	assert(priv_data == &owner_weird_count);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static void *test_expand(void *old, size_t newlen, void *priv_data)
			
 
				+{
			
 
				+	expand_count++;
			
 
				+
			
 
				+	assert(priv_data == &owner_weird_count);
			
 
				+	return realloc(old, newlen);
			
 
				+}
			
 
				+
			
 
				+static void test_free(void *old, void *priv_data)
			
 
				+{
			
 
				+	assert(priv_data == &owner_weird_count);
			
 
				+	if (old) {
			
 
				+		free_count++;
			
 
				+	}
			
 
				+	free(old);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	union ntdb_attribute alloc_attr;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
			
 
				+
			
 
				+	alloc_attr.base.next = &tap_log_attr;
			
 
				+	alloc_attr.base.attr = NTDB_ATTRIBUTE_ALLOCATOR;
			
 
				+
			
 
				+	alloc_attr.alloc.alloc = test_alloc;
			
 
				+	alloc_attr.alloc.expand = test_expand;
			
 
				+	alloc_attr.alloc.free = test_free;
			
 
				+	alloc_attr.alloc.priv_data = &owner_weird_count;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 700 * 3 + 4) + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		curr_ntdb = NULL;
			
 
				+		curr_file = NULL;
			
 
				+		ntdb = ntdb_open("run-12-store.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &alloc_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		for (j = 0; j < 700; j++) {
			
 
				+			NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_deq(d, data));
			
 
				+			test_free(d.dptr, &owner_weird_count);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		ok1(owner_null_count == 2+i*2);
			
 
				+		ok1(owner_weird_count == 0);
			
 
				+		ok1(alloc_count == free_count);
			
 
				+		ok1(expand_count != 0);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-21-parse_record.c
+++ b/ccan/ntdb/test/api-21-parse_record.c
@@ -0,0 +1,66 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static enum NTDB_ERROR parse(NTDB_DATA key, NTDB_DATA data, NTDB_DATA *expected)
			
 
				+{
			
 
				+	if (!ntdb_deq(data, *expected))
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR parse_err(NTDB_DATA key, NTDB_DATA data, void *unused)
			
 
				+{
			
 
				+	return 100;
			
 
				+}
			
 
				+
			
 
				+static bool test_records(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int i;
			
 
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+			return false;
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < 1000; i++) {
			
 
				+		if (ntdb_parse_record(ntdb, key, parse, &data) != NTDB_SUCCESS)
			
 
				+			return false;
			
 
				+	}
			
 
				+
			
 
				+	if (ntdb_parse_record(ntdb, key, parse, &data) != NTDB_ERR_NOEXIST)
			
 
				+		return false;
			
 
				+
			
 
				+	/* Test error return from parse function. */
			
 
				+	i = 0;
			
 
				+	if (ntdb_parse_record(ntdb, key, parse_err, NULL) != 100)
			
 
				+		return false;
			
 
				+
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-21-parse_record.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (ok1(ntdb))
			
 
				+			ok1(test_records(ntdb));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-55-transaction.c
+++ b/ccan/ntdb/test/api-55-transaction.c
@@ -0,0 +1,71 @@
 
				+#include "private.h" // struct ntdb_context
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	unsigned char *buffer;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data;
			
 
				+
			
 
				+	buffer = malloc(1000);
			
 
				+	for (i = 0; i < 1000; i++)
			
 
				+		buffer[i] = i;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-55-transaction.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		data.dptr = buffer;
			
 
				+		data.dsize = 1000;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+		ok1(data.dsize == 1000);
			
 
				+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+		free(data.dptr);
			
 
				+
			
 
				+		/* Cancelling a transaction means no store */
			
 
				+		ntdb_transaction_cancel(ntdb);
			
 
				+		ok1(ntdb->file->allrecord_lock.count == 0
			
 
				+		    && ntdb->file->num_lockrecs == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_ERR_NOEXIST);
			
 
				+
			
 
				+		/* Commit the transaction. */
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		data.dptr = buffer;
			
 
				+		data.dsize = 1000;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+		ok1(data.dsize == 1000);
			
 
				+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+		free(data.dptr);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == 0);
			
 
				+		ok1(ntdb->file->allrecord_lock.count == 0
			
 
				+		    && ntdb->file->num_lockrecs == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+		ok1(data.dsize == 1000);
			
 
				+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+		free(data.dptr);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	free(buffer);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-60-noop-transaction.c
+++ b/ccan/ntdb/test/api-60-noop-transaction.c
@@ -0,0 +1,55 @@
 
				+#include "private.h" // struct ntdb_context
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4), d;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 12 + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-60-transaction.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		/* Do an identical replace. */
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == 0);
			
 
				+
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_deq(data, d));
			
 
				+		free(d.dptr);
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		/* Reopen, fetch. */
			
 
				+		ntdb = ntdb_open("api-60-transaction.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_deq(data, d));
			
 
				+		free(d.dptr);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-80-tdb_fd.c
+++ b/ccan/ntdb/test/api-80-tdb_fd.c
@@ -0,0 +1,30 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-80-ntdb_fd.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			continue;
			
 
				+
			
 
				+		if (flags[i] & NTDB_INTERNAL)
			
 
				+			ok1(ntdb_fd(ntdb) == -1);
			
 
				+		else
			
 
				+			ok1(ntdb_fd(ntdb) > 2);
			
 
				+		ntdb_close(ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-81-seqnum.c
+++ b/ccan/ntdb/test/api-81-seqnum.c
@@ -0,0 +1,68 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, seq;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-81-seqnum.ntdb",
			
 
				+				 flags[i]|NTDB_SEQNUM|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			continue;
			
 
				+
			
 
				+		seq = 0;
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == seq);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+		/* Fetch doesn't change seqnum */
			
 
				+		if (ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS))
			
 
				+			free(d.dptr);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == seq);
			
 
				+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+		/* Append to a just-deleted (missing) key works */
			
 
				+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+
			
 
				+		ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+
			
 
				+		if (!(flags[i] & NTDB_INTERNAL)) {
			
 
				+			ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+			ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+			ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == ++seq);
			
 
				+			ok1(ntdb_transaction_commit(ntdb) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == seq);
			
 
				+
			
 
				+			ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == seq + 1);
			
 
				+			ntdb_transaction_cancel(ntdb);
			
 
				+			ok1(ntdb_get_seqnum(ntdb) == seq);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-82-lockattr.c
+++ b/ccan/ntdb/test/api-82-lockattr.c
@@ -0,0 +1,237 @@
 
				+#include "private.h" // for ntdb_fcntl_unlock
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <errno.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
			
 
				+		  void *_err)
			
 
				+{
			
 
				+	int *lock_err = _err;
			
 
				+	struct flock fl;
			
 
				+	int ret;
			
 
				+
			
 
				+	if (*lock_err) {
			
 
				+		errno = *lock_err;
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	do {
			
 
				+		fl.l_type = rw;
			
 
				+		fl.l_whence = SEEK_SET;
			
 
				+		fl.l_start = off;
			
 
				+		fl.l_len = len;
			
 
				+
			
 
				+		if (waitflag)
			
 
				+			ret = fcntl(fd, F_SETLKW, &fl);
			
 
				+		else
			
 
				+			ret = fcntl(fd, F_SETLK, &fl);
			
 
				+	} while (ret != 0 && errno == EINTR);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int trav_err;
			
 
				+static int trav(struct ntdb_context *ntdb, NTDB_DATA k, NTDB_DATA d, int *terr)
			
 
				+{
			
 
				+	*terr = trav_err;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	union ntdb_attribute lock_attr;
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+	int lock_err;
			
 
				+
			
 
				+	lock_attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
			
 
				+	lock_attr.base.next = &tap_log_attr;
			
 
				+	lock_attr.flock.lock = mylock;
			
 
				+	lock_attr.flock.unlock = ntdb_fcntl_unlock;
			
 
				+	lock_attr.flock.data = &lock_err;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 81);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		NTDB_DATA d;
			
 
				+
			
 
				+		/* Nonblocking open; expect no error message. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
			
 
				+		ok(errno == lock_err, "Errno is %u", errno);
			
 
				+		ok1(!ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		lock_err = EINTR;
			
 
				+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
			
 
				+		ok(errno == lock_err, "Errno is %u", errno);
			
 
				+		ok1(!ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		/* Forced fail open. */
			
 
				+		lock_err = ENOMEM;
			
 
				+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
			
 
				+		ok1(errno == lock_err);
			
 
				+		ok1(!ntdb);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		lock_err = 0;
			
 
				+		ntdb = ntdb_open("run-82-lockattr.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			continue;
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		/* Nonblocking store. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking fetch. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(!ntdb_exists(ntdb, key));
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(!ntdb_exists(ntdb, key));
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(!ntdb_exists(ntdb, key));
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking delete. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking locks. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_chainlock_read(ntdb, key) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_LOCK);
			
 
				+		/* This actually does divide and conquer, so more than one message may be logged. */
			
 
				+		ok1(tap_log_messages > 0);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages > 0);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking traverse; go nonblock partway through. */
			
 
				+		lock_err = 0;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+		/* Need two entries to ensure two lock attempts! */
			
 
				+		ok1(ntdb_store(ntdb, ntdb_mkdata("key2", 4), data,
			
 
				+			       NTDB_REPLACE) == 0);
			
 
				+		trav_err = EAGAIN;
			
 
				+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		trav_err = EINTR;
			
 
				+		lock_err = 0;
			
 
				+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		trav_err = ENOMEM;
			
 
				+		lock_err = 0;
			
 
				+		ok1(ntdb_traverse(ntdb, trav, &lock_err) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking transactions. */
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = EINTR;
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		lock_err = ENOMEM;
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Nonblocking transaction prepare. */
			
 
				+		lock_err = 0;
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+
			
 
				+		lock_err = EAGAIN;
			
 
				+		ok1(ntdb_transaction_prepare_commit(ntdb) == NTDB_ERR_LOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		lock_err = 0;
			
 
				+		ok1(ntdb_transaction_prepare_commit(ntdb) == 0);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == 0);
			
 
				+
			
 
				+		/* And the transaction was committed, right? */
			
 
				+		ok1(!ntdb_exists(ntdb, key));
			
 
				+		ntdb_close(ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-83-openhook.c
+++ b/ccan/ntdb/test/api-83-openhook.c
@@ -0,0 +1,103 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define KEY_STR "key"
			
 
				+
			
 
				+static enum NTDB_ERROR clear_if_first(int fd, void *arg)
			
 
				+{
			
 
				+/* We always hold a lock at offset 4, so we can tell if anyone is holding it.
			
 
				+ * (This is compatible with tdb's TDB_CLEAR_IF_FIRST flag).  */
			
 
				+	struct flock fl;
			
 
				+
			
 
				+	if (arg != clear_if_first)
			
 
				+		return NTDB_ERR_CORRUPT;
			
 
				+
			
 
				+	fl.l_type = F_WRLCK;
			
 
				+	fl.l_whence = SEEK_SET;
			
 
				+	fl.l_start = 4;
			
 
				+	fl.l_len = 1;
			
 
				+
			
 
				+	if (fcntl(fd, F_SETLK, &fl) == 0) {
			
 
				+		/* We must be first ones to open it! */
			
 
				+		diag("truncating file!");
			
 
				+		if (ftruncate(fd, 0) != 0) {
			
 
				+			return NTDB_ERR_IO;
			
 
				+		}
			
 
				+	}
			
 
				+	fl.l_type = F_RDLCK;
			
 
				+	if (fcntl(fd, F_SETLKW, &fl) != 0) {
			
 
				+		return NTDB_ERR_IO;
			
 
				+	}
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb, *ntdb2;
			
 
				+	struct agent *agent;
			
 
				+	union ntdb_attribute cif;
			
 
				+	NTDB_DATA key = ntdb_mkdata(KEY_STR, strlen(KEY_STR));
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	cif.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
			
 
				+	cif.openhook.base.next = &tap_log_attr;
			
 
				+	cif.openhook.fn = clear_if_first;
			
 
				+	cif.openhook.data = clear_if_first;
			
 
				+
			
 
				+	agent = prepare_external_agent();
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 16);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		/* Create it */
			
 
				+		ntdb = ntdb_open("run-83-openhook.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
			
 
				+		ok1(ntdb);
			
 
				+		ok1(ntdb_store(ntdb, key, key, NTDB_REPLACE) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		/* Now, open with CIF, should clear it. */
			
 
				+		ntdb = ntdb_open("run-83-openhook.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR, 0, &cif);
			
 
				+		ok1(ntdb);
			
 
				+		ok1(!ntdb_exists(ntdb, key));
			
 
				+		ok1(ntdb_store(ntdb, key, key, NTDB_REPLACE) == 0);
			
 
				+
			
 
				+		/* Agent should not clear it, since it's still open. */
			
 
				+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
			
 
				+					     "run-83-openhook.ntdb") == SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, FETCH, KEY_STR "=" KEY_STR)
			
 
				+		    == SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
			
 
				+
			
 
				+		/* Still exists for us too. */
			
 
				+		ok1(ntdb_exists(ntdb, key));
			
 
				+
			
 
				+		/* Nested open should not erase db. */
			
 
				+		ntdb2 = ntdb_open("run-83-openhook.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				  O_RDWR, 0, &cif);
			
 
				+		ok1(ntdb_exists(ntdb2, key));
			
 
				+		ok1(ntdb_exists(ntdb, key));
			
 
				+		ntdb_close(ntdb2);
			
 
				+
			
 
				+		ok1(ntdb_exists(ntdb, key));
			
 
				+
			
 
				+		/* Close it, now agent should clear it. */
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
			
 
				+					     "run-83-openhook.ntdb") == SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, FETCH, KEY_STR "=" KEY_STR)
			
 
				+		    == FAILED);
			
 
				+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
			
 
				+
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+	}
			
 
				+
			
 
				+	free_external_agent(agent);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-91-get-stats.c
+++ b/ccan/ntdb/test/api-91-get-stats.c
@@ -0,0 +1,57 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		union ntdb_attribute *attr;
			
 
				+		NTDB_DATA key = ntdb_mkdata("key", 3), data;
			
 
				+
			
 
				+		ntdb = ntdb_open("run-91-get-stats.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		/* Force an expansion */
			
 
				+		data.dsize = 65536;
			
 
				+		data.dptr = calloc(data.dsize, 1);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+		free(data.dptr);
			
 
				+
			
 
				+		/* Use malloc so valgrind will catch overruns. */
			
 
				+		attr = malloc(sizeof *attr);
			
 
				+		attr->stats.base.attr = NTDB_ATTRIBUTE_STATS;
			
 
				+		attr->stats.size = sizeof(*attr);
			
 
				+
			
 
				+		ok1(ntdb_get_attribute(ntdb, attr) == 0);
			
 
				+		ok1(attr->stats.size == sizeof(*attr));
			
 
				+		ok1(attr->stats.allocs > 0);
			
 
				+		ok1(attr->stats.expands > 0);
			
 
				+		ok1(attr->stats.locks > 0);
			
 
				+		free(attr);
			
 
				+
			
 
				+		/* Try short one. */
			
 
				+		attr = malloc(offsetof(struct ntdb_attribute_stats, allocs)
			
 
				+			      + sizeof(attr->stats.allocs));
			
 
				+		attr->stats.base.attr = NTDB_ATTRIBUTE_STATS;
			
 
				+		attr->stats.size = offsetof(struct ntdb_attribute_stats, allocs)
			
 
				+			+ sizeof(attr->stats.allocs);
			
 
				+		ok1(ntdb_get_attribute(ntdb, attr) == 0);
			
 
				+		ok1(attr->stats.size == sizeof(*attr));
			
 
				+		ok1(attr->stats.allocs > 0);
			
 
				+		free(attr);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-92-get-set-readonly.c
+++ b/ccan/ntdb/test/api-92-get-set-readonly.c
@@ -0,0 +1,105 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 48);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		/* RW -> RO */
			
 
				+		ntdb = ntdb_open("run-92-get-set-readonly.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		ok1(!(ntdb_get_flags(ntdb) & NTDB_RDONLY));
			
 
				+
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
			
 
				+
			
 
				+		ntdb_add_flag(ntdb, NTDB_RDONLY);
			
 
				+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
			
 
				+
			
 
				+		/* Can't store, append, delete. */
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		ok1(ntdb_append(ntdb, key, data) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 2);
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 3);
			
 
				+
			
 
				+		/* Can't start a transaction, or any write lock. */
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 4);
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 5);
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 6);
			
 
				+		ok1(ntdb_wipe_all(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 7);
			
 
				+
			
 
				+		/* Back to RW. */
			
 
				+		ntdb_remove_flag(ntdb, NTDB_RDONLY);
			
 
				+		ok1(!(ntdb_get_flags(ntdb) & NTDB_RDONLY));
			
 
				+
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_append(ntdb, key, data) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
			
 
				+
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == NTDB_SUCCESS);
			
 
				+
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_SUCCESS);
			
 
				+		ntdb_chainunlock(ntdb, key);
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
			
 
				+		ntdb_unlockall(ntdb);
			
 
				+		ok1(ntdb_wipe_all(ntdb) == NTDB_SUCCESS);
			
 
				+		ok1(tap_log_messages == 7);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		/* RO -> RW */
			
 
				+		ntdb = ntdb_open("run-92-get-set-readonly.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDONLY, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
			
 
				+
			
 
				+		/* Can't store, append, delete. */
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 8);
			
 
				+		ok1(ntdb_append(ntdb, key, data) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 9);
			
 
				+		ok1(ntdb_delete(ntdb, key) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 10);
			
 
				+
			
 
				+		/* Can't start a transaction, or any write lock. */
			
 
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 11);
			
 
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 12);
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 13);
			
 
				+		ok1(ntdb_wipe_all(ntdb) == NTDB_ERR_RDONLY);
			
 
				+		ok1(tap_log_messages == 14);
			
 
				+
			
 
				+		/* Can't remove NTDB_RDONLY since we opened with O_RDONLY */
			
 
				+		ntdb_remove_flag(ntdb, NTDB_RDONLY);
			
 
				+		ok1(tap_log_messages == 15);
			
 
				+		ok1(ntdb_get_flags(ntdb) & NTDB_RDONLY);
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		ok1(tap_log_messages == 15);
			
 
				+		tap_log_messages = 0;
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-93-repack.c
+++ b/ccan/ntdb/test/api-93-repack.c
@@ -0,0 +1,79 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define NUM_TESTS 1000
			
 
				+
			
 
				+static bool store_all(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
			
 
				+	NTDB_DATA dbuf = { (unsigned char *)&i, sizeof(i) };
			
 
				+
			
 
				+	for (i = 0; i < NUM_TESTS; i++) {
			
 
				+		if (ntdb_store(ntdb, key, dbuf, NTDB_INSERT) != NTDB_SUCCESS)
			
 
				+			return false;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+static int mark_entry(struct ntdb_context *ntdb,
			
 
				+		      NTDB_DATA key, NTDB_DATA data, bool found[])
			
 
				+{
			
 
				+	unsigned int num;
			
 
				+
			
 
				+	if (key.dsize != sizeof(num))
			
 
				+		return -1;
			
 
				+	memcpy(&num, key.dptr, key.dsize);
			
 
				+	if (num >= NUM_TESTS)
			
 
				+		return -1;
			
 
				+	if (found[num])
			
 
				+		return -1;
			
 
				+	found[num] = true;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static bool is_all_set(bool found[], unsigned int num)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; i < num; i++)
			
 
				+		if (!found[i])
			
 
				+			return false;
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	bool found[NUM_TESTS];
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT
			
 
				+	};
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 6 + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-93-repack.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			break;
			
 
				+
			
 
				+		ok1(store_all(ntdb));
			
 
				+
			
 
				+		ok1(ntdb_repack(ntdb) == NTDB_SUCCESS);
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_traverse(ntdb, mark_entry, found) == NUM_TESTS);
			
 
				+		ok1(is_all_set(found, NUM_TESTS));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-94-expand-during-parse.c
+++ b/ccan/ntdb/test/api-94-expand-during-parse.c
@@ -0,0 +1,86 @@
 
				+/* We use direct access to hand to the parse function: what if db expands? */
			
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+#include "../private.h" /* To establish size, esp. for NTDB_INTERNAL dbs */
			
 
				+
			
 
				+static struct ntdb_context *ntdb;
			
 
				+
			
 
				+static off_t ntdb_size(void)
			
 
				+{
			
 
				+	return ntdb->file->map_size;
			
 
				+}
			
 
				+
			
 
				+struct parse_info {
			
 
				+	unsigned int depth;
			
 
				+	NTDB_DATA expected;
			
 
				+};
			
 
				+
			
 
				+static enum NTDB_ERROR parse(NTDB_DATA key, NTDB_DATA data,
			
 
				+			     struct parse_info *pinfo)
			
 
				+{
			
 
				+	off_t flen;
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	if (!ntdb_deq(data, pinfo->expected))
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+
			
 
				+	flen = ntdb_size();
			
 
				+
			
 
				+	for (i = 0; ntdb_size() == flen; i++) {
			
 
				+		NTDB_DATA add = ntdb_mkdata(&i, sizeof(i));
			
 
				+
			
 
				+		/* This is technically illegal inside parse(), which is why we
			
 
				+		 * grabbed the allrecord lock. */
			
 
				+		ntdb_store(ntdb, add, add, NTDB_INSERT);
			
 
				+	}
			
 
				+
			
 
				+	/* Access the record again. */
			
 
				+	if (!ntdb_deq(data, pinfo->expected))
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+
			
 
				+	/* Recurse!  Woot! */
			
 
				+	if (pinfo->depth != 0) {
			
 
				+		enum NTDB_ERROR ecode;
			
 
				+
			
 
				+		pinfo->depth--;
			
 
				+		ecode = ntdb_parse_record(ntdb, key, parse, pinfo);
			
 
				+		if (ecode) {
			
 
				+			return ecode;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* Access the record one more time. */
			
 
				+	if (!ntdb_deq(data, pinfo->expected))
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	struct parse_info pinfo;
			
 
				+	NTDB_DATA key = ntdb_mkdata("hello", 5), data = ntdb_mkdata("world", 5);
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-94-expand-during-parse.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
			
 
				+		pinfo.expected = data;
			
 
				+		pinfo.depth = 3;
			
 
				+		ok1(ntdb_parse_record(ntdb, key, parse, &pinfo) == NTDB_SUCCESS);
			
 
				+		ntdb_unlockall(ntdb);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-95-read-only-during-parse.c
+++ b/ccan/ntdb/test/api-95-read-only-during-parse.c
@@ -0,0 +1,92 @@
 
				+/* Make sure write operations fail during ntdb_parse_record(). */
			
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static struct ntdb_context *ntdb;
			
 
				+
			
 
				+/* We could get either of these. */
			
 
				+static bool xfail(enum NTDB_ERROR ecode)
			
 
				+{
			
 
				+	return ecode == NTDB_ERR_RDONLY || ecode == NTDB_ERR_LOCK;
			
 
				+}
			
 
				+
			
 
				+static enum NTDB_ERROR parse(NTDB_DATA key, NTDB_DATA data,
			
 
				+			     NTDB_DATA *expected)
			
 
				+{
			
 
				+	NTDB_DATA add = ntdb_mkdata("another", strlen("another"));
			
 
				+
			
 
				+	if (!ntdb_deq(data, *expected)) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	/* These should all fail.*/
			
 
				+	if (!xfail(ntdb_store(ntdb, add, add, NTDB_INSERT))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_append(ntdb, key, add))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_delete(ntdb, key))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_transaction_start(ntdb))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_chainlock(ntdb, key))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_lockall(ntdb))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_wipe_all(ntdb))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	if (!xfail(ntdb_repack(ntdb))) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+	tap_log_messages--;
			
 
				+
			
 
				+	/* Access the record one more time. */
			
 
				+	if (!ntdb_deq(data, *expected)) {
			
 
				+		return NTDB_ERR_EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP, NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("hello", 5), data = ntdb_mkdata("world", 5);
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("api-95-read-only-during-parse.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == NTDB_SUCCESS);
			
 
				+		ok1(ntdb_parse_record(ntdb, key, parse, &data) == NTDB_SUCCESS);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-add-remove-flags.c
+++ b/ccan/ntdb/test/api-add-remove-flags.c
@@ -0,0 +1,87 @@
 
				+#include "private.h" // for ntdb_context
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+/*
				+ * Exercise ntdb_add_flag() and ntdb_remove_flag() with NTDB_NOLOCK,
				+ * NTDB_NOMMAP and NTDB_NOSYNC under each open-flag combination.
				+ *
				+ * For flag sets containing NTDB_INTERNAL each call is expected to log exactly
				+ * one message and the flag itself is not inspected.  Otherwise no message is
				+ * expected and the flag (and, for NTDB_NOMMAP, the map pointer) is checked.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i;
				+	struct ntdb_context *ntdb;
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
				+			NTDB_NOMMAP|NTDB_CONVERT };
				+
				+	/*
				+	 * 9 checks for each of the 2 NTDB_INTERNAL flag sets, 17 for each of
				+	 * the other 4, plus the final log-message check.
				+	 */
				+	plan_tests(87);
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		ntdb = ntdb_open("run-add-remove-flags.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
				+		ok1(ntdb);
				+		if (!ntdb)
				+			continue;
				+
				+		/* The accessor must agree with the context's own field. */
				+		ok1(ntdb_get_flags(ntdb) == ntdb->flags);
				+		/* Reset the counter before each call so it counts that call only. */
				+		tap_log_messages = 0;
				+		ntdb_add_flag(ntdb, NTDB_NOLOCK);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(ntdb_get_flags(ntdb) & NTDB_NOLOCK);
				+		}
				+
				+		tap_log_messages = 0;
				+		ntdb_add_flag(ntdb, NTDB_NOMMAP);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(ntdb_get_flags(ntdb) & NTDB_NOMMAP);
				+			/* With NTDB_NOMMAP set the file must not be mapped. */
				+			ok1(ntdb->file->map_ptr == NULL);
				+		}
				+
				+		tap_log_messages = 0;
				+		ntdb_add_flag(ntdb, NTDB_NOSYNC);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(ntdb_get_flags(ntdb) & NTDB_NOSYNC);
				+		}
				+
				+		ok1(ntdb_get_flags(ntdb) == ntdb->flags);
				+
				+		/* Now take the same three flags away again. */
				+		tap_log_messages = 0;
				+		ntdb_remove_flag(ntdb, NTDB_NOLOCK);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOLOCK));
				+		}
				+
				+		tap_log_messages = 0;
				+		ntdb_remove_flag(ntdb, NTDB_NOMMAP);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOMMAP));
				+			/* Clearing NTDB_NOMMAP must bring the mapping back. */
				+			ok1(ntdb->file->map_ptr != NULL);
				+		}
				+
				+		tap_log_messages = 0;
				+		ntdb_remove_flag(ntdb, NTDB_NOSYNC);
				+		if (flags[i] & NTDB_INTERNAL)
				+			ok1(tap_log_messages == 1);
				+		else {
				+			ok1(tap_log_messages == 0);
				+			ok1(!(ntdb_get_flags(ntdb) & NTDB_NOSYNC));
				+		}
				+
				+		ntdb_close(ntdb);
				+	}
				+
				+	ok1(tap_log_messages == 0);
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-check-callback.c
+++ b/ccan/ntdb/test/api-check-callback.c
@@ -0,0 +1,86 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define NUM_RECORDS 1000
			
 
				+
			
 
				+/*
				+ * Store NUM_RECORDS records whose key and data are both the record number.
				+ * Returns false as soon as one ntdb_store() call reports an error.
				+ */
				+static bool store_records(struct ntdb_context *ntdb)
				+{
				+	int n = 0;
				+	/* Key and data both alias n, so each store sees its current value. */
				+	NTDB_DATA key = { (unsigned char *)&n, sizeof(n) };
				+	NTDB_DATA data = { (unsigned char *)&n, sizeof(n) };
				+
				+	while (n < NUM_RECORDS) {
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
				+			return false;
				+		n++;
				+	}
				+	return true;
				+}
			
 
				+
			
 
				+/*
				+ * Per-record callback for ntdb_check().
				+ *
				+ * Accepts a record only if the key is int-sized, the data is byte-identical
				+ * to the key, the stored value lies in [0, NUM_RECORDS) and that value has
				+ * not been seen before.  Marks array[value] on success; every failure is
				+ * reported through diag() and returns NTDB_ERR_CORRUPT.
				+ */
				+static enum NTDB_ERROR check(NTDB_DATA key,
				+			    NTDB_DATA data,
				+			    bool *array)
				+{
				+	int idx;
				+	bool same;
				+
				+	if (key.dsize != sizeof(idx)) {
				+		diag("Wrong key size: %zu\n", key.dsize);
				+		return NTDB_ERR_CORRUPT;
				+	}
				+
				+	/* Only compare contents once the sizes are known to match. */
				+	same = (key.dsize == data.dsize)
				+		&& memcmp(key.dptr, data.dptr, sizeof(idx)) == 0;
				+	if (!same) {
				+		diag("Key and data differ\n");
				+		return NTDB_ERR_CORRUPT;
				+	}
				+
				+	memcpy(&idx, key.dptr, sizeof(idx));
				+	if (!(idx >= 0 && idx < NUM_RECORDS)) {
				+		diag("check value %i\n", idx);
				+		return NTDB_ERR_CORRUPT;
				+	}
				+
				+	if (array[idx]) {
				+		diag("Value %i already seen\n", idx);
				+		return NTDB_ERR_CORRUPT;
				+	}
				+
				+	array[idx] = true;
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Store NUM_RECORDS records under each open-flag combination, run
				+ * ntdb_check() with the check() callback, and verify that the callback
				+ * marked every record exactly once.
				+ *
				+ * An unconditional "return 0;" used to sit before plan_tests(), which made
				+ * the whole test unreachable; it has been removed so the checks run.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i, j;
				+	struct ntdb_context *ntdb;
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
				+			NTDB_NOMMAP|NTDB_CONVERT };
				+
				+	/* Four checks per flag set, plus the final log-message check. */
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		bool array[NUM_RECORDS];
				+
				+		ntdb = ntdb_open("run-check-callback.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
				+		ok1(ntdb);
				+		if (!ntdb)
				+			continue;
				+
				+		ok1(store_records(ntdb));
				+		/* Start with no record marked as seen. */
				+		for (j = 0; j < NUM_RECORDS; j++)
				+			array[j] = false;
				+		ok1(ntdb_check(ntdb, check, array) == NTDB_SUCCESS);
				+		/* j stops early at the first record the callback never saw. */
				+		for (j = 0; j < NUM_RECORDS; j++)
				+			if (!array[j])
				+				break;
				+		ok1(j == NUM_RECORDS);
				+		ntdb_close(ntdb);
				+	}
				+
				+	ok1(tap_log_messages == 0);
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-firstkey-nextkey.c
+++ b/ccan/ntdb/test/api-firstkey-nextkey.c
@@ -0,0 +1,157 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define NUM_RECORDS 1000
			
 
				+
			
 
				+/*
				+ * Store NUM_RECORDS records whose key and data are both the record number.
				+ * Returns false as soon as one ntdb_store() call reports an error.
				+ */
				+static bool store_records(struct ntdb_context *ntdb)
				+{
				+	int n = 0;
				+	/* Key and data both alias n, so each store sees its current value. */
				+	NTDB_DATA key = { (unsigned char *)&n, sizeof(n) };
				+	NTDB_DATA data = { (unsigned char *)&n, sizeof(n) };
				+
				+	while (n < NUM_RECORDS) {
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
				+			return false;
				+		n++;
				+	}
				+	return true;
				+}
			
 
				+
			
 
				+/* State filled in by the trav() traversal callback. */
				+struct trav_data {
				+	/* Value read from each visited record, in visiting order. */
				+	unsigned int records[NUM_RECORDS];
				+	/* Number of entries of records[] filled so far. */
				+	unsigned int calls;
				+};
			
 
				+
			
 
				+/*
				+ * Traversal callback: append the int stored in the record's data to
				+ * td->records[] and bump td->calls.
				+ *
				+ * @p points to the struct trav_data to fill.
				+ * Returns 0 to continue.  Returns non-zero, which asks ntdb_traverse() to
				+ * stop, if the data is not exactly int-sized or records[] is already full;
				+ * copying anyway would overrun `val` or the array.
				+ */
				+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *p)
				+{
				+	struct trav_data *td = p;
				+	int val;
				+
				+	if (dbuf.dsize != sizeof(val) || td->calls >= NUM_RECORDS)
				+		return 1;
				+
				+	memcpy(&val, dbuf.dptr, sizeof(val));
				+	td->records[td->calls++] = val;
				+	return 0;
				+}
			
 
				+
			
 
				+/* Since ntdb_nextkey frees dptr, we need to clone it. */
			
 
				+/*
				+ * Return a copy of @key whose dptr is freshly malloc()ed, so the caller owns
				+ * a buffer it may hand to a function that frees it.
				+ *
				+ * Aborts if the allocation fails; a zero-length key is copied without
				+ * touching memory, since malloc(0) may legitimately return NULL.
				+ */
				+static NTDB_DATA dup_key(NTDB_DATA key)
				+{
				+	void *p = malloc(key.dsize);
				+
				+	if (key.dsize != 0) {
				+		/* Out of memory: the test cannot continue meaningfully. */
				+		if (!p)
				+			abort();
				+		memcpy(p, key.dptr, key.dsize);
				+	}
				+	key.dptr = p;
				+	return key;
				+}
			
 
				+
			
 
				+/*
				+ * Exercise ntdb_firstkey()/ntdb_nextkey() under each open-flag combination:
				+ * on an empty database, with one and two records, against the order reported
				+ * by ntdb_traverse(), starting from arbitrary keys, and while deleting.
				+ *
				+ * A fixed hash seed is chained in front of the logging attribute.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i, j;
				+	int num;
				+	struct trav_data td;
				+	NTDB_DATA k;
				+	struct ntdb_context *ntdb;
				+	union ntdb_attribute seed_attr;
				+	enum NTDB_ERROR ecode;
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
				+			NTDB_NOMMAP|NTDB_CONVERT };
				+
				+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
				+	seed_attr.base.next = &tap_log_attr;
				+	seed_attr.seed.seed = 6334326220117065685ULL;
				+
				+	/*
				+	 * Per flag set: 22 fixed checks, 3 per record in the traverse-order
				+	 * loop, 3 per step in the reverse loop (NUM_RECORDS-1 steps) and 3 per
				+	 * record in the delete loop; plus the final log-message check.
				+	 */
				+	plan_tests(sizeof(flags) / sizeof(flags[0])
				+		   * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1);
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		ntdb = ntdb_open("api-firstkey-nextkey.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600,
				+				 &seed_attr);
				+		ok1(ntdb);
				+		if (!ntdb)
				+			continue;
				+
				+		/* Empty database: there is no first key. */
				+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_ERR_NOEXIST);
				+
				+		/* One entry... */
				+		k.dptr = (unsigned char *)&num;
				+		k.dsize = sizeof(num);
				+		num = 0;
				+		ok1(ntdb_store(ntdb, k, k, NTDB_INSERT) == 0);
				+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_SUCCESS);
				+		ok1(k.dsize == sizeof(num));
				+		ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
				+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_ERR_NOEXIST);
				+
				+		/* Two entries. */
				+		k.dptr = (unsigned char *)&num;
				+		k.dsize = sizeof(num);
				+		num = 1;
				+		ok1(ntdb_store(ntdb, k, k, NTDB_INSERT) == 0);
				+		ok1(ntdb_firstkey(ntdb, &k) == NTDB_SUCCESS);
				+		ok1(k.dsize == sizeof(num));
				+		memcpy(&num, k.dptr, sizeof(num));
				+		/* Either record may come first; the second must be the other one. */
				+		ok1(num == 0 || num == 1);
				+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_SUCCESS);
				+		ok1(k.dsize == sizeof(j));
				+		memcpy(&j, k.dptr, sizeof(j));
				+		ok1(j == 0 || j == 1);
				+		ok1(j != num);
				+		ok1(ntdb_nextkey(ntdb, &k) == NTDB_ERR_NOEXIST);
				+
				+		/* Clean up. */
				+		k.dptr = (unsigned char *)&num;
				+		k.dsize = sizeof(num);
				+		num = 0;
				+		ok1(ntdb_delete(ntdb, k) == 0);
				+		num = 1;
				+		ok1(ntdb_delete(ntdb, k) == 0);
				+
				+		/* Now lots of records. */
				+		ok1(store_records(ntdb));
				+		td.calls = 0;
				+
				+		/* Record the traversal order in td.records[] for comparison. */
				+		num = ntdb_traverse(ntdb, trav, &td);
				+		ok1(num == NUM_RECORDS);
				+		ok1(td.calls == NUM_RECORDS);
				+
				+		/* Simple loop should match ntdb_traverse */
				+		for (j = 0, ecode = ntdb_firstkey(ntdb, &k); j < td.calls; j++) {
				+			int val;
				+
				+			ok1(ecode == NTDB_SUCCESS);
				+			ok1(k.dsize == sizeof(val));
				+			memcpy(&val, k.dptr, k.dsize);
				+			ok1(td.records[j] == val);
				+			ecode = ntdb_nextkey(ntdb, &k);
				+		}
				+
				+		/* But arbitrary orderings should work too. */
				+		for (j = td.calls-1; j > 0; j--) {
				+			k.dptr = (unsigned char *)&td.records[j-1];
				+			k.dsize = sizeof(td.records[j-1]);
				+			/* Hand ntdb_nextkey a heap copy rather than a pointer into td. */
				+			k = dup_key(k);
				+			ok1(ntdb_nextkey(ntdb, &k) == NTDB_SUCCESS);
				+			ok1(k.dsize == sizeof(td.records[j]));
				+			ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
				+			free(k.dptr);
				+		}
				+
				+		/* Even delete should work. */
				+		for (j = 0, ecode = ntdb_firstkey(ntdb, &k);
				+		     ecode != NTDB_ERR_NOEXIST;
				+		     j++) {
				+			ok1(ecode == NTDB_SUCCESS);
				+			ok1(k.dsize == 4);
				+			ok1(ntdb_delete(ntdb, k) == 0);
				+			ecode = ntdb_nextkey(ntdb, &k);
				+		}
				+
				+		diag("delete using first/nextkey gave %u of %u records",
				+		     j, NUM_RECORDS);
				+		ok1(j == NUM_RECORDS);
				+		ntdb_close(ntdb);
				+	}
				+
				+	ok1(tap_log_messages == 0);
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-fork-test.c
+++ b/ccan/ntdb/test/api-fork-test.c
@@ -0,0 +1,194 @@
 
				+/* Test forking while holding lock.
			
 
				+ *
			
 
				+ * There are only five ways to do this currently:
			
 
				+ * (1) grab a ntdb_chainlock, then fork.
			
 
				+ * (2) grab a ntdb_lockall, then fork.
			
 
				+ * (3) grab a ntdb_lockall_read, then fork.
			
 
				+ * (4) start a transaction, then fork.
			
 
				+ * (5) fork from inside a ntdb_parse_record() callback.
			
 
				+ *
			
 
				+ * Note that we don't hold a lock across ntdb_traverse callbacks, so
			
 
				+ * that doesn't matter.
			
 
				+ */
			
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool am_child = false;
			
 
				+
			
 
				+/*
				+ * ntdb_parse_record() callback that forks while the record is being parsed.
				+ *
				+ * The child sets am_child, expects both a store and a fetch to fail with
				+ * NTDB_ERR_LOCK and exactly two log messages, and otherwise exits non-zero.
				+ * The parent waits for the child and checks that it exited with status 0.
				+ * Both sides return NTDB_SUCCESS from the callback.
				+ */
				+static enum NTDB_ERROR fork_in_parse(NTDB_DATA key, NTDB_DATA data,
				+				    struct ntdb_context *ntdb)
				+{
				+	int status;
				+
				+	if (fork() != 0) {
				+		/* Not the child: reap it and check how it exited. */
				+		wait(&status);
				+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
				+		return NTDB_SUCCESS;
				+	}
				+
				+	am_child = true;
				+
				+	/* We expect both of these to fail. */
				+	if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK
				+	    || ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
				+		exit(1);
				+
				+	/* One log message for each failed operation above. */
				+	if (tap_log_messages != 2)
				+		exit(2);
				+
				+	return NTDB_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * For each open-flag combination, fork while holding a chainlock, a
				+ * lockall, a lockall_read and an open transaction, and finally from inside
				+ * a parse callback.
				+ *
				+ * In every case the child expects store and fetch to fail with
				+ * NTDB_ERR_LOCK (two log messages), then expects the matching unlock/cancel
				+ * and ntdb_close() to add no further messages.  The child's exit status
				+ * encodes which step went wrong; the parent checks for status 0.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i;
				+	struct ntdb_context *ntdb;
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
				+
				+	/* 14 checks per flag set, one of them made inside fork_in_parse(). */
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		int status;
				+
				+		tap_log_messages = 0;
				+
				+		ntdb = ntdb_open("run-fork-test.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
				+		if (!ok1(ntdb))
				+			continue;
				+
				+		/* Put a record in here. */
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_SUCCESS);
				+
				+		/* (1) Fork while holding a chainlock. */
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_SUCCESS);
				+		if (fork() == 0) {
				+			/* We expect this to fail. */
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (tap_log_messages != 2)
				+				return 2;
				+
				+			/* Child can do this without any complaints. */
				+			ntdb_chainunlock(ntdb, key);
				+			if (tap_log_messages != 2)
				+				return 3;
				+			ntdb_close(ntdb);
				+			if (tap_log_messages != 2)
				+				return 4;
				+			return 0;
				+		}
				+		wait(&status);
				+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
				+		ntdb_chainunlock(ntdb, key);
				+
				+		/* (2) Fork while holding the all-record write lock. */
				+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
				+		if (fork() == 0) {
				+			/* We expect this to fail. */
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (tap_log_messages != 2)
				+				return 2;
				+
				+			/* Child can do this without any complaints. */
				+			ntdb_unlockall(ntdb);
				+			if (tap_log_messages != 2)
				+				return 3;
				+			ntdb_close(ntdb);
				+			if (tap_log_messages != 2)
				+				return 4;
				+			return 0;
				+		}
				+		wait(&status);
				+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
				+		ntdb_unlockall(ntdb);
				+
				+		/* (3) Fork while holding the all-record read lock. */
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
				+		if (fork() == 0) {
				+			/* We expect this to fail. */
				+			/* This would always fail anyway... */
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (tap_log_messages != 2)
				+				return 2;
				+
				+			/* Child can do this without any complaints. */
				+			ntdb_unlockall_read(ntdb);
				+			if (tap_log_messages != 2)
				+				return 3;
				+			ntdb_close(ntdb);
				+			if (tap_log_messages != 2)
				+				return 4;
				+			return 0;
				+		}
				+		wait(&status);
				+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
				+		ntdb_unlockall_read(ntdb);
				+
				+		/* (4) Fork with a transaction open. */
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_SUCCESS);
				+		/* If the transaction is empty, a no-op "commit" succeeds. */
				+		ok1(ntdb_delete(ntdb, key) == NTDB_SUCCESS);
				+		if (fork() == 0) {
				+			int last_log_messages;
				+
				+			/* We expect this to fail. */
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (ntdb_fetch(ntdb, key, &data) != NTDB_ERR_LOCK)
				+				return 1;
				+
				+			if (tap_log_messages != 2)
				+				return 2;
				+
				+			if (ntdb_transaction_prepare_commit(ntdb)
				+			    != NTDB_ERR_LOCK)
				+				return 3;
				+			/* The failed prepare-commit must have logged something. */
				+			if (tap_log_messages == 2)
				+				return 4;
				+
				+			last_log_messages = tap_log_messages;
				+			/* Child can do this without any complaints. */
				+			ntdb_transaction_cancel(ntdb);
				+			if (tap_log_messages != last_log_messages)
				+				return 4;
				+			ntdb_close(ntdb);
				+			if (tap_log_messages != last_log_messages)
				+				return 4;
				+			return 0;
				+		}
				+		wait(&status);
				+		ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
				+		ntdb_transaction_cancel(ntdb);
				+
				+		/* (5) Fork from inside the parse callback. */
				+		ok1(ntdb_parse_record(ntdb, key, fork_in_parse, ntdb)
				+		    == NTDB_SUCCESS);
				+		ntdb_close(ntdb);
				+		/* The child forked in fork_in_parse() returns to here as well. */
				+		if (am_child) {
				+			/* Child can return from parse without complaints. */
				+			if (tap_log_messages != 2)
				+				exit(3);
				+			exit(0);
				+		}
				+		ok1(tap_log_messages == 0);
				+	}
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-locktimeout.c
+++ b/ccan/ntdb/test/api-locktimeout.c
@@ -0,0 +1,189 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <limits.h>
			
 
				+#include "logging.h"
			
 
				+#include "external-agent.h"
			
 
				+
			
 
				+#undef alarm
			
 
				+#define alarm fast_alarm
			
 
				+
			
 
				+/* Speed things up by doing things in milliseconds. */
			
 
				+/*
				+ * Stand-in for alarm() that takes its argument in milliseconds.
				+ *
				+ * Arms a one-shot ITIMER_REAL timer for @milli_seconds; 0 disarms it.
				+ * Always returns 0 (unlike alarm(), the previous timer is not reported).
				+ */
				+static unsigned int fast_alarm(unsigned int milli_seconds)
				+{
				+	struct itimerval it;
				+
				+	it.it_interval.tv_sec = it.it_interval.tv_usec = 0;
				+	it.it_value.tv_sec = milli_seconds / 1000;
				+	/*
				+	 * Only the sub-second remainder goes into tv_usec: setitimer()
				+	 * rejects values of 1000000 or more, so whole seconds must not be
				+	 * counted twice.
				+	 */
				+	it.it_value.tv_usec = (milli_seconds % 1000) * 1000;
				+	setitimer(ITIMER_REAL, &it, NULL);
				+	return 0;
				+}
			
 
				+
			
 
				+#define CatchSignal(sig, handler) signal((sig), (handler))
			
 
				+
			
 
				+/* Signal handler that deliberately takes no action. */
				+static void do_nothing(int signum)
				+{
				+	(void)signum;
				+}
			
 
				+
			
 
				+/* This example code is taken from SAMBA, so try not to change it. */
			
 
				+static struct flock flock_struct;
			
 
				+
			
 
				+/* Return a value which is none of v1, v2 or v3. */
			
 
				+static inline short int invalid_value(short int v1, short int v2, short int v3)
				+{
				+	/* Start from a value derived from the three inputs... */
				+	short int try = (v1+v2+v3)^((v1+v2+v3) << 16);
				+	/* ...and step past it for as long as it collides with one of them. */
				+	while (try == v1 || try == v2 || try == v3)
				+		try++;
				+	return try;
				+}
			
 
				+
			
 
				+/* We invalidate in as many ways as we can, so the OS rejects it */
			
 
				+/*
				+ * SIGALRM handler installed by timeout_lock(): overwrite every field of the
				+ * shared flock_struct with a value outside its valid set.  timeout_lock()
				+ * detects that the handler ran by seeing l_len changed.
				+ */
				+static void invalidate_flock_struct(int signum)
				+{
				+	flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
				+	flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
				+	flock_struct.l_start = -1;
				+	/* A large negative. */
				+	flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1);
				+}
			
 
				+
			
 
				+/*
				+ * Locking callback that gives up after a timeout.
				+ *
				+ * @fd, @rw, @off, @len: lock type and byte range passed on to fcntl().
				+ * @waitflag: use the blocking F_SETLKW if true, F_SETLK otherwise.
				+ * @_timeout: points to an unsigned int timeout.  It is passed to alarm(),
				+ *            which this file redefines to fast_alarm(), so it is in
				+ *            milliseconds.
				+ *
				+ * Returns the last fcntl() result.  errno is restored to its value on entry
				+ * unless the lock failed, in which case it is the fcntl() error, or EINTR if
				+ * the timeout handler fired.
				+ */
				+static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
				+			void *_timeout)
				+{
				+	int ret, saved_errno = errno;
				+	unsigned int timeout = *(unsigned int *)_timeout;
				+
				+	flock_struct.l_type = rw;
				+	flock_struct.l_whence = SEEK_SET;
				+	flock_struct.l_start = off;
				+	flock_struct.l_len = len;
				+
				+	/* When the timer fires, the handler scribbles over flock_struct. */
				+	CatchSignal(SIGALRM, invalidate_flock_struct);
				+	alarm(timeout);
				+
				+	for (;;) {
				+		if (waitflag)
				+			ret = fcntl(fd, F_SETLKW, &flock_struct);
				+		else
				+			ret = fcntl(fd, F_SETLK, &flock_struct);
				+
				+		if (ret == 0)
				+			break;
				+
				+		/* Not signalled?  Something else went wrong. */
				+		if (flock_struct.l_len == len) {
				+			/* l_len untouched: the timeout handler has not run yet. */
				+			if (errno == EAGAIN || errno == EINTR)
				+				continue;
				+			saved_errno = errno;
				+			break;
				+		} else {
				+			/* The handler ran: report the timeout as EINTR. */
				+			saved_errno = EINTR;
				+			break;
				+		}
				+	}
				+
				+	/* Cancel any timer still pending before returning. */
				+	alarm(0);
				+	errno = saved_errno;
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * Take a chain lock on @key, giving up after @timeout.
				+ *
				+ * @timeout: passed to timeout_lock() through the flock attribute; 0 means
				+ *           use the database's existing locking functions unchanged.
				+ * @rw_type: F_RDLCK takes a read chainlock, anything else a write chainlock.
				+ *
				+ * Returns the enum NTDB_ERROR code of the first step that fails, or of the
				+ * chainlock call itself.
				+ */
				+static int ntdb_chainlock_with_timeout_internal(struct ntdb_context *ntdb,
				+					       NTDB_DATA key,
				+					       unsigned int timeout,
				+					       int rw_type)
				+{
				+	union ntdb_attribute locking;
				+	enum NTDB_ERROR ecode;
				+
				+	if (timeout) {
				+		/* Fetch the current flock attribute so only two fields change. */
				+		locking.base.attr = NTDB_ATTRIBUTE_FLOCK;
				+		ecode = ntdb_get_attribute(ntdb, &locking);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+
				+		/* Replace locking function with our own. */
				+		locking.flock.data = &timeout;
				+		locking.flock.lock = timeout_lock;
				+
				+		ecode = ntdb_set_attribute(ntdb, &locking);
				+		if (ecode != NTDB_SUCCESS)
				+			return ecode;
				+	}
				+	if (rw_type == F_RDLCK)
				+		ecode = ntdb_chainlock_read(ntdb, key);
				+	else
				+		ecode = ntdb_chainlock(ntdb, key);
				+
				+	/* &timeout points at a parameter, so drop the attribute before returning. */
				+	if (timeout) {
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
				+	}
				+	return ecode;
				+}
			
 
				+
			
 
				+/*
				+ * For each open-flag combination: check that timed read and write chainlocks
				+ * succeed when uncontended, then have an external agent open the same file
				+ * and start a transaction, and check that a timed write chainlock fails with
				+ * NTDB_ERR_LOCK, including when the agent is also asked to send a signal.
				+ *
				+ * Timeouts are given as 20, which is milliseconds because alarm() is
				+ * redefined to fast_alarm() in this file.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i;
				+	struct ntdb_context *ntdb;
				+	NTDB_DATA key = ntdb_mkdata("hello", 5);
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
				+	struct agent *agent;
				+
				+	/* 15 checks per flag set. */
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 15);
				+
				+	agent = prepare_external_agent();
				+
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		enum NTDB_ERROR ecode;
				+		ntdb = ntdb_open("run-locktimeout.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
				+		if (!ok1(ntdb))
				+			break;
				+
				+		/* Simple cases: should succeed. */
				+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
				+							    F_RDLCK);
				+		ok1(ecode == NTDB_SUCCESS);
				+		ok1(tap_log_messages == 0);
				+
				+		ntdb_chainunlock_read(ntdb, key);
				+		ok1(tap_log_messages == 0);
				+
				+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
				+							    F_WRLCK);
				+		ok1(ecode == NTDB_SUCCESS);
				+		ok1(tap_log_messages == 0);
				+
				+		ntdb_chainunlock(ntdb, key);
				+		ok1(tap_log_messages == 0);
				+
				+		/* OK, get agent to start transaction, then we should time out. */
				+		ok1(external_agent_operation(agent, OPEN, "run-locktimeout.ntdb")
				+		    == SUCCESS);
				+		ok1(external_agent_operation(agent, TRANSACTION_START, "")
				+		    == SUCCESS);
				+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
				+							    F_WRLCK);
				+		ok1(ecode == NTDB_ERR_LOCK);
				+		/* The timeout is expected to be silent: no message is logged. */
				+		ok1(tap_log_messages == 0);
				+
				+		/* Even if we get a different signal, should be fine. */
				+		CatchSignal(SIGUSR1, do_nothing);
				+		external_agent_operation(agent, SEND_SIGNAL, "");
				+		ecode = ntdb_chainlock_with_timeout_internal(ntdb, key, 20,
				+							    F_WRLCK);
				+		ok1(ecode == NTDB_ERR_LOCK);
				+		ok1(tap_log_messages == 0);
				+
				+		ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
				+		    == SUCCESS);
				+		ok1(external_agent_operation(agent, CLOSE, "")
				+		    == SUCCESS);
				+		ntdb_close(ntdb);
				+	}
				+	free_external_agent(agent);
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-missing-entries.c
+++ b/ccan/ntdb/test/api-missing-entries.c
@@ -0,0 +1,42 @@
 
				+/* Another test revealed that we lost an entry.  This reproduces it. */
			
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include <ccan/hash/hash.h>
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define NUM_RECORDS 1189
			
 
				+
			
 
				+/* We use the same seed which we saw this failure on. */
			
 
				+/*
				+ * Hash callback that ignores the supplied @seed and @p and always hashes
				+ * the key bytes with one fixed seed.
				+ */
				+static uint32_t failhash(const void *key, size_t len, uint32_t seed, void *p)
				+{
				+	const uint64_t fixed_seed = 699537674708983027ULL;
				+	const unsigned char *bytes = (const unsigned char *)key;
				+
				+	return hash64_stable(bytes, len, fixed_seed);
				+}
			
 
				+
			
 
				+/*
				+ * Store NUM_RECORDS records in an NTDB_INTERNAL database that uses the
				+ * failhash() hash function, then run ntdb_check() over the result.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	int i;
				+	struct ntdb_context *ntdb;
				+	/* Key and data both alias i, so each store uses its current value. */
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
				+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
				+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
				+						.fn = failhash } };
				+
				+	/* Chain the logging attribute behind the hash attribute. */
				+	hattr.base.next = &tap_log_attr;
				+	/* One open check, one per store, the ntdb_check and the log check. */
				+	plan_tests(1 + NUM_RECORDS + 2);
				+
				+	ntdb = ntdb_open("run-missing-entries.ntdb", NTDB_INTERNAL,
				+			 O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
				+	if (ok1(ntdb)) {
				+		for (i = 0; i < NUM_RECORDS; i++) {
				+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
				+		}
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
				+		ntdb_close(ntdb);
				+	}
				+
				+	ok1(tap_log_messages == 0);
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-open-multiple-times.c
+++ b/ccan/ntdb/test/api-open-multiple-times.c
@@ -0,0 +1,86 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include "logging.h"
			
 
				+#include "../private.h"
			
 
				+
			
 
				+/*
				+ * Open the same database file twice in one process under each flag
				+ * combination and check that the two handles see each other's changes,
				+ * that one survives the other being closed, and that while one handle has a
				+ * transaction open every operation on the other fails with NTDB_ERR_LOCK
				+ * and logs one message.
				+ */
				+int main(int argc, char *argv[])
				+{
				+	unsigned int i;
				+	struct ntdb_context *ntdb, *ntdb2;
				+	/* Key and data both alias the loop counter i. */
				+	NTDB_DATA key = { (unsigned char *)&i, sizeof(i) };
				+	NTDB_DATA data = { (unsigned char *)&i, sizeof(i) };
				+	NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
				+
				+	/* 30 checks per flag set. */
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 30);
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
				+		ntdb = ntdb_open("run-open-multiple-times.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
				+		ok1(ntdb);
				+		if (!ntdb)
				+			continue;
				+
				+		/* Second handle on the same file, opened without O_TRUNC. */
				+		/* NOTE(review): ntdb2 is used below without a NULL check. */
				+		ntdb2 = ntdb_open("run-open-multiple-times.ntdb",
				+				  flags[i]|MAYBE_NOSYNC,
				+				  O_RDWR|O_CREAT, 0600, &tap_log_attr);
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
				+		ok1(ntdb_check(ntdb2, NULL, NULL) == 0);
				+		ok1((flags[i] & NTDB_NOMMAP) || ntdb2->file->map_ptr);
				+
				+		/* Store in one, fetch in the other. */
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
				+		ok1(ntdb_fetch(ntdb2, key, &d) == NTDB_SUCCESS);
				+		ok1(ntdb_deq(d, data));
				+		free(d.dptr);
				+
				+		/* Vice versa, with delete. */
				+		ok1(ntdb_delete(ntdb2, key) == 0);
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_NOEXIST);
				+
				+		/* OK, now close first one, check second still good. */
				+		ok1(ntdb_close(ntdb) == 0);
				+
				+		ok1((flags[i] & NTDB_NOMMAP) || ntdb2->file->map_ptr);
				+		ok1(ntdb_store(ntdb2, key, data, NTDB_REPLACE) == 0);
				+		ok1(ntdb_fetch(ntdb2, key, &d) == NTDB_SUCCESS);
				+		ok1(ntdb_deq(d, data));
				+		free(d.dptr);
				+
				+		/* Reopen */
				+		ntdb = ntdb_open("run-open-multiple-times.ntdb",
				+				 flags[i]|MAYBE_NOSYNC,
				+				 O_RDWR|O_CREAT, 0600, &tap_log_attr);
				+		ok1(ntdb);
				+
				+		ok1(ntdb_transaction_start(ntdb2) == 0);
				+
				+		/* Anything in the other one should fail. */
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_LOCK);
				+		ok1(tap_log_messages == 1);
				+		ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == NTDB_ERR_LOCK);
				+		ok1(tap_log_messages == 2);
				+		ok1(ntdb_transaction_start(ntdb) == NTDB_ERR_LOCK);
				+		ok1(tap_log_messages == 3);
				+		ok1(ntdb_chainlock(ntdb, key) == NTDB_ERR_LOCK);
				+		ok1(tap_log_messages == 4);
				+
				+		/* Transaction should work as normal. */
				+		ok1(ntdb_store(ntdb2, key, data, NTDB_REPLACE) == NTDB_SUCCESS);
				+
				+		/* Now... try closing with locks held. */
				+		ok1(ntdb_close(ntdb2) == 0);
				+
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
				+		ok1(ntdb_deq(d, data));
				+		free(d.dptr);
				+		ok1(ntdb_close(ntdb) == 0);
				+		/* Only the four expected lock failures may have been logged. */
				+		ok1(tap_log_messages == 4);
				+		tap_log_messages = 0;
				+	}
				+
				+	return exit_status();
				+}
			
--- a/ccan/ntdb/test/api-record-expand.c
+++ b/ccan/ntdb/test/api-record-expand.c
@@ -0,0 +1,49 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define MAX_SIZE 10000
			
 
				+#define SIZE_STEP 131
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data;
			
 
				+
			
 
				+	data.dptr = malloc(MAX_SIZE);
			
 
				+	memset(data.dptr, 0x24, MAX_SIZE);
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0])
			
 
				+		   * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-record-expand.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		data.dsize = 0;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		for (data.dsize = 0;
			
 
				+		     data.dsize < MAX_SIZE;
			
 
				+		     data.dsize += SIZE_STEP) {
			
 
				+			memset(data.dptr, data.dsize, data.dsize);
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	free(data.dptr);
			
 
				+
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-simple-delete.c
+++ b/ccan/ntdb/test/api-simple-delete.c
@@ -0,0 +1,38 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-simple-delete.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (ntdb) {
			
 
				+			/* Delete should fail. */
			
 
				+			ok1(ntdb_delete(ntdb, key) == NTDB_ERR_NOEXIST);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			/* Insert should succeed. */
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			/* Delete should now work. */
			
 
				+			ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			ntdb_close(ntdb);
			
 
				+		}
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/api-summary.c
+++ b/ccan/ntdb/test/api-summary.c
@@ -0,0 +1,55 @@
 
				+#include "config.h"
			
 
				+#include "ntdb.h"
			
 
				+#include "private.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
			
 
				+	char *summary;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-summary.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Put some stuff in there. */
			
 
				+		for (j = 0; j < 500; j++) {
			
 
				+			/* Make sure padding varies so we get some graphs! */
			
 
				+			data.dsize = j % (sizeof(j) + 1);
			
 
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+				fail("Storing in ntdb");
			
 
				+		}
			
 
				+
			
 
				+		for (j = 0;
			
 
				+		     j <= NTDB_SUMMARY_HISTOGRAMS;
			
 
				+		     j += NTDB_SUMMARY_HISTOGRAMS) {
			
 
				+			ok1(ntdb_summary(ntdb, j, &summary) == NTDB_SUCCESS);
			
 
				+			ok1(strstr(summary, "Number of records: 500\n"));
			
 
				+			ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
			
 
				+			ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
			
 
				+			if (j == NTDB_SUMMARY_HISTOGRAMS) {
			
 
				+				ok1(strstr(summary, "|")
			
 
				+				    && strstr(summary, "*"));
			
 
				+			} else {
			
 
				+				ok1(!strstr(summary, "|")
			
 
				+				    && !strstr(summary, "*"));
			
 
				+			}
			
 
				+			free(summary);
			
 
				+		}
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/external-agent.c
+++ b/ccan/ntdb/test/external-agent.c
@@ -0,0 +1,261 @@
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+#include "lock-tracking.h"
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/wait.h>
			
 
				+#include <unistd.h>
			
 
				+#include <ccan/err/err.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <limits.h>
			
 
				+#include <string.h>
			
 
				+#include <errno.h>
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdio.h>
			
 
				+#include <stdarg.h>
			
 
				+
			
 
				+static struct ntdb_context *ntdb;
			
 
				+
			
 
				+void (*external_agent_free)(void *) = free;
			
 
				+
			
 
				+static enum NTDB_ERROR clear_if_first(int fd, void *arg)
			
 
				+{
			
 
				+/* We always hold a lock at offset 4, so we can tell if anyone is holding it.
			
 
				+ * (This is compatible with tdb's TDB_CLEAR_IF_FIRST flag).  */
			
 
				+	struct flock fl;
			
 
				+
			
 
				+	fl.l_type = F_WRLCK;
			
 
				+	fl.l_whence = SEEK_SET;
			
 
				+	fl.l_start = 4;
			
 
				+	fl.l_len = 1;
			
 
				+
			
 
				+	if (fcntl(fd, F_SETLK, &fl) == 0) {
			
 
				+		/* We must be first ones to open it! */
			
 
				+		diag("agent truncating file!");
			
 
				+		if (ftruncate(fd, 0) != 0) {
			
 
				+			return NTDB_ERR_IO;
			
 
				+		}
			
 
				+	}
			
 
				+	fl.l_type = F_RDLCK;
			
 
				+	if (fcntl(fd, F_SETLKW, &fl) != 0) {
			
 
				+		return NTDB_ERR_IO;
			
 
				+	}
			
 
				+	return NTDB_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static enum agent_return do_operation(enum operation op, const char *name)
			
 
				+{
			
 
				+	NTDB_DATA k, d;
			
 
				+	enum agent_return ret;
			
 
				+	NTDB_DATA data;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	union ntdb_attribute cif;
			
 
				+	const char *eq;
			
 
				+
			
 
				+	if (op != OPEN && op != OPEN_WITH_HOOK && !ntdb) {
			
 
				+		diag("external: No ntdb open!");
			
 
				+		return OTHER_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	diag("external: %s", operation_name(op));
			
 
				+
			
 
				+	eq = strchr(name, '=');
			
 
				+	if (eq) {
			
 
				+		k = ntdb_mkdata(name, eq - name);
			
 
				+		d = ntdb_mkdata(eq + 1, strlen(eq+1));
			
 
				+	} else {
			
 
				+		k = ntdb_mkdata(name, strlen(name));
			
 
				+		d.dsize = 0;
			
 
				+		d.dptr = NULL;
			
 
				+	}
			
 
				+
			
 
				+	locking_would_block = 0;
			
 
				+	switch (op) {
			
 
				+	case OPEN:
			
 
				+		if (ntdb) {
			
 
				+			diag("Already have ntdb %s open", ntdb_name(ntdb));
			
 
				+			return OTHER_FAILURE;
			
 
				+		}
			
 
				+		ntdb = ntdb_open(name, MAYBE_NOSYNC, O_RDWR, 0, &tap_log_attr);
			
 
				+		if (!ntdb) {
			
 
				+			if (!locking_would_block)
			
 
				+				diag("Opening ntdb gave %s", strerror(errno));
			
 
				+			forget_locking();
			
 
				+			ret = OTHER_FAILURE;
			
 
				+		} else
			
 
				+			ret = SUCCESS;
			
 
				+		break;
			
 
				+	case OPEN_WITH_HOOK:
			
 
				+		if (ntdb) {
			
 
				+			diag("Already have ntdb %s open", ntdb_name(ntdb));
			
 
				+			return OTHER_FAILURE;
			
 
				+		}
			
 
				+		cif.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
			
 
				+		cif.openhook.base.next = &tap_log_attr;
			
 
				+		cif.openhook.fn = clear_if_first;
			
 
				+		ntdb = ntdb_open(name, MAYBE_NOSYNC, O_RDWR, 0, &cif);
			
 
				+		if (!ntdb) {
			
 
				+			if (!locking_would_block)
			
 
				+				diag("Opening ntdb gave %s", strerror(errno));
			
 
				+			forget_locking();
			
 
				+			ret = OTHER_FAILURE;
			
 
				+		} else
			
 
				+			ret = SUCCESS;
			
 
				+		break;
			
 
				+	case FETCH:
			
 
				+		ecode = ntdb_fetch(ntdb, k, &data);
			
 
				+		if (ecode == NTDB_ERR_NOEXIST) {
			
 
				+			ret = FAILED;
			
 
				+		} else if (ecode < 0) {
			
 
				+			ret = OTHER_FAILURE;
			
 
				+		} else if (!ntdb_deq(data, d)) {
			
 
				+			ret = OTHER_FAILURE;
			
 
				+			external_agent_free(data.dptr);
			
 
				+		} else {
			
 
				+			ret = SUCCESS;
			
 
				+			external_agent_free(data.dptr);
			
 
				+		}
			
 
				+		break;
			
 
				+	case STORE:
			
 
				+		ret = ntdb_store(ntdb, k, d, 0) == 0 ? SUCCESS : OTHER_FAILURE;
			
 
				+		break;
			
 
				+	case TRANSACTION_START:
			
 
				+		ret = ntdb_transaction_start(ntdb) == 0 ? SUCCESS : OTHER_FAILURE;
			
 
				+		break;
			
 
				+	case TRANSACTION_COMMIT:
			
 
				+		ret = ntdb_transaction_commit(ntdb)==0 ? SUCCESS : OTHER_FAILURE;
			
 
				+		break;
			
 
				+	case NEEDS_RECOVERY:
			
 
				+		ret = external_agent_needs_rec(ntdb);
			
 
				+		break;
			
 
				+	case CHECK:
			
 
				+		ret = ntdb_check(ntdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
			
 
				+		break;
			
 
				+	case CLOSE:
			
 
				+		ret = ntdb_close(ntdb) == 0 ? SUCCESS : OTHER_FAILURE;
			
 
				+		ntdb = NULL;
			
 
				+		break;
			
 
				+	case SEND_SIGNAL:
			
 
				+		/* We do this async */
			
 
				+		ret = SUCCESS;
			
 
				+		break;
			
 
				+	default:
			
 
				+		ret = OTHER_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	if (locking_would_block)
			
 
				+		ret = WOULD_HAVE_BLOCKED;
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+struct agent {
			
 
				+	int cmdfd, responsefd;
			
 
				+};
			
 
				+
			
 
				+/* Do this before doing any ntdb stuff.  Return handle, or NULL. */
			
 
				+struct agent *prepare_external_agent(void)
			
 
				+{
			
 
				+	int pid, ret;
			
 
				+	int command[2], response[2];
			
 
				+	char name[1+PATH_MAX];
			
 
				+
			
 
				+	if (pipe(command) != 0 || pipe(response) != 0)
			
 
				+		return NULL;
			
 
				+
			
 
				+	pid = fork();
			
 
				+	if (pid < 0)
			
 
				+		return NULL;
			
 
				+
			
 
				+	if (pid != 0) {
			
 
				+		struct agent *agent = malloc(sizeof(*agent));
			
 
				+
			
 
				+		close(command[0]);
			
 
				+		close(response[1]);
			
 
				+		agent->cmdfd = command[1];
			
 
				+		agent->responsefd = response[0];
			
 
				+		return agent;
			
 
				+	}
			
 
				+
			
 
				+	close(command[1]);
			
 
				+	close(response[0]);
			
 
				+
			
 
				+	/* We want to fail, not block. */
			
 
				+	nonblocking_locks = true;
			
 
				+	log_prefix = "external: ";
			
 
				+	while ((ret = read(command[0], name, sizeof(name))) > 0) {
			
 
				+		enum agent_return result;
			
 
				+
			
 
				+		result = do_operation(name[0], name+1);
			
 
				+		if (write(response[1], &result, sizeof(result))
			
 
				+		    != sizeof(result))
			
 
				+			err(1, "Writing response");
			
 
				+		if (name[0] == SEND_SIGNAL) {
			
 
				+			struct timeval ten_ms;
			
 
				+			ten_ms.tv_sec = 0;
			
 
				+			ten_ms.tv_usec = 10000;
			
 
				+			select(0, NULL, NULL, NULL, &ten_ms);
			
 
				+			kill(getppid(), SIGUSR1);
			
 
				+		}
			
 
				+	}
			
 
				+	exit(0);
			
 
				+}
			
 
				+
			
 
				+/* Ask the external agent to try to do an operation. */
			
 
				+enum agent_return external_agent_operation(struct agent *agent,
			
 
				+					   enum operation op,
			
 
				+					   const char *name)
			
 
				+{
			
 
				+	enum agent_return res;
			
 
				+	unsigned int len;
			
 
				+	char *string;
			
 
				+
			
 
				+	if (!name)
			
 
				+		name = "";
			
 
				+	len = 1 + strlen(name) + 1;
			
 
				+	string = malloc(len);
			
 
				+
			
 
				+	string[0] = op;
			
 
				+	strcpy(string+1, name);
			
 
				+
			
 
				+	if (write(agent->cmdfd, string, len) != len
			
 
				+	    || read(agent->responsefd, &res, sizeof(res)) != sizeof(res))
			
 
				+		res = AGENT_DIED;
			
 
				+
			
 
				+	free(string);
			
 
				+	return res;
			
 
				+}
			
 
				+
			
 
				+const char *agent_return_name(enum agent_return ret)
			
 
				+{
			
 
				+	return ret == SUCCESS ? "SUCCESS"
			
 
				+		: ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED"
			
 
				+		: ret == AGENT_DIED ? "AGENT_DIED"
			
 
				+		: ret == FAILED ? "FAILED"
			
 
				+		: ret == OTHER_FAILURE ? "OTHER_FAILURE"
			
 
				+		: "**INVALID**";
			
 
				+}
			
 
				+
			
 
				+const char *operation_name(enum operation op)
			
 
				+{
			
 
				+	switch (op) {
			
 
				+	case OPEN: return "OPEN";
			
 
				+	case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK";
			
 
				+	case FETCH: return "FETCH";
			
 
				+	case STORE: return "STORE";
			
 
				+	case CHECK: return "CHECK";
			
 
				+	case TRANSACTION_START: return "TRANSACTION_START";
			
 
				+	case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT";
			
 
				+	case NEEDS_RECOVERY: return "NEEDS_RECOVERY";
			
 
				+	case SEND_SIGNAL: return "SEND_SIGNAL";
			
 
				+	case CLOSE: return "CLOSE";
			
 
				+	}
			
 
				+	return "**INVALID**";
			
 
				+}
			
 
				+
			
 
				+void free_external_agent(struct agent *agent)
			
 
				+{
			
 
				+	close(agent->cmdfd);
			
 
				+	close(agent->responsefd);
			
 
				+	free(agent);
			
 
				+}
			
--- a/ccan/ntdb/test/external-agent.h
+++ b/ccan/ntdb/test/external-agent.h
@@ -0,0 +1,51 @@
 
				+#ifndef NTDB_TEST_EXTERNAL_AGENT_H
			
 
				+#define NTDB_TEST_EXTERNAL_AGENT_H
			
 
				+
			
 
				+/* For locking tests, we need a different process to try things at
			
 
				+ * various times. */
			
 
				+enum operation {
			
 
				+	OPEN,
			
 
				+	OPEN_WITH_HOOK,
			
 
				+	FETCH,
			
 
				+	STORE,
			
 
				+	TRANSACTION_START,
			
 
				+	TRANSACTION_COMMIT,
			
 
				+	NEEDS_RECOVERY,
			
 
				+	CHECK,
			
 
				+	SEND_SIGNAL,
			
 
				+	CLOSE,
			
 
				+};
			
 
				+
			
 
				+/* Do this before doing any ntdb stuff.  Return handle, or NULL. */
			
 
				+struct agent *prepare_external_agent(void);
			
 
				+
			
 
				+enum agent_return {
			
 
				+	SUCCESS,
			
 
				+	WOULD_HAVE_BLOCKED,
			
 
				+	AGENT_DIED,
			
 
				+	FAILED, /* For fetch, or NEEDS_RECOVERY */
			
 
				+	OTHER_FAILURE,
			
 
				+};
			
 
				+
			
 
				+/* Ask the external agent to try to do an operation.
			
 
				+ * name == ntdb name for OPEN/OPEN_WITH_HOOK,
			
 
				+ * <key>=<data> for FETCH/STORE.
			
 
				+ */
			
 
				+enum agent_return external_agent_operation(struct agent *handle,
			
 
				+					   enum operation op,
			
 
				+					   const char *name);
			
 
				+
			
 
				+/* Hook into free() on ntdb_data in external agent. */
			
 
				+extern void (*external_agent_free)(void *);
			
 
				+
			
 
				+/* Mapping enum -> string. */
			
 
				+const char *agent_return_name(enum agent_return ret);
			
 
				+const char *operation_name(enum operation op);
			
 
				+
			
 
				+void free_external_agent(struct agent *agent);
			
 
				+
			
 
				+/* Internal use: */
			
 
				+struct ntdb_context;
			
 
				+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb);
			
 
				+
			
 
				+#endif /* NTDB_TEST_EXTERNAL_AGENT_H */
			
--- a/ccan/ntdb/test/failtest_helper.c
+++ b/ccan/ntdb/test/failtest_helper.c
@@ -0,0 +1,99 @@
 
				+#include "failtest_helper.h"
			
 
				+#include "logging.h"
			
 
				+#include <string.h>
			
 
				+#include "tap-interface.h"
			
 
				+
			
 
				+bool failtest_suppress = false;
			
 
				+
			
 
				+/* FIXME: From ccan/str */
			
 
				+static inline bool strends(const char *str, const char *postfix)
			
 
				+{
			
 
				+	if (strlen(str) < strlen(postfix))
			
 
				+		return false;
			
 
				+
			
 
				+	return !strcmp(str + strlen(str) - strlen(postfix), postfix);
			
 
				+}
			
 
				+
			
 
				+bool failmatch(const struct failtest_call *call,
			
 
				+	       const char *file, int line, enum failtest_call_type type)
			
 
				+{
			
 
				+	return call->type == type
			
 
				+		&& call->line == line
			
 
				+		&& ((strcmp(call->file, file) == 0)
			
 
				+		    || (strends(call->file, file)
			
 
				+			&& (call->file[strlen(call->file) - strlen(file) - 1]
			
 
				+			    == '/')));
			
 
				+}
			
 
				+
			
 
				+static bool is_nonblocking_lock(const struct failtest_call *call)
			
 
				+{
			
 
				+	return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK;
			
 
				+}
			
 
				+
			
 
				+static bool is_unlock(const struct failtest_call *call)
			
 
				+{
			
 
				+	return call->type == FAILTEST_FCNTL
			
 
				+		&& call->u.fcntl.arg.fl.l_type == F_UNLCK;
			
 
				+}
			
 
				+
			
 
				+bool exit_check_log(struct tlist_calls *history)
			
 
				+{
			
 
				+	const struct failtest_call *i;
			
 
				+	unsigned int malloc_count = 0;
			
 
				+
			
 
				+	tlist_for_each(history, i, list) {
			
 
				+		if (!i->fail)
			
 
				+			continue;
			
 
				+		/* Failing the /dev/urandom open doesn't count: we fall back. */
			
 
				+		if (failmatch(i, URANDOM_OPEN))
			
 
				+			continue;
			
 
				+
			
 
				+		/* Similarly with read fail. */
			
 
				+		if (failmatch(i, URANDOM_READ))
			
 
				+			continue;
			
 
				+
			
 
				+		/* Initial allocation of ntdb doesn't log. */
			
 
				+		if (i->type == FAILTEST_MALLOC) {
			
 
				+			if (malloc_count++ == 0) {
			
 
				+				continue;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		/* We don't block "failures" on non-blocking locks. */
			
 
				+		if (is_nonblocking_lock(i))
			
 
				+			continue;
			
 
				+
			
 
				+		if (!tap_log_messages)
			
 
				+			diag("We didn't log for %s:%u", i->file, i->line);
			
 
				+		return tap_log_messages != 0;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+/* Some places we soldier on despite errors: only fail them once. */
			
 
				+enum failtest_result
			
 
				+block_repeat_failures(struct tlist_calls *history)
			
 
				+{
			
 
				+	const struct failtest_call *last;
			
 
				+
			
 
				+	last = tlist_tail(history, list);
			
 
				+
			
 
				+	if (failtest_suppress)
			
 
				+		return FAIL_DONT_FAIL;
			
 
				+
			
 
				+	if (failmatch(last, URANDOM_OPEN)
			
 
				+	    || failmatch(last, URANDOM_READ)) {
			
 
				+		return FAIL_PROBE;
			
 
				+	}
			
 
				+
			
 
				+	/* We handle mmap failing, by falling back to read/write, so
			
 
				+	 * don't try all possible paths. */
			
 
				+	if (last->type == FAILTEST_MMAP)
			
 
				+		return FAIL_PROBE;
			
 
				+
			
 
				+	/* Unlock or non-blocking lock is fail-once. */
			
 
				+	if (is_unlock(last) || is_nonblocking_lock(last))
			
 
				+		return FAIL_PROBE;
			
 
				+
			
 
				+	return FAIL_OK;
			
 
				+}
			
--- a/ccan/ntdb/test/failtest_helper.h
+++ b/ccan/ntdb/test/failtest_helper.h
@@ -0,0 +1,18 @@
 
				+#ifndef NTDB_TEST_FAILTEST_HELPER_H
			
 
				+#define NTDB_TEST_FAILTEST_HELPER_H
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include <stdbool.h>
			
 
				+
			
 
				+/* FIXME: Check these! */
			
 
				+#define URANDOM_OPEN		"open.c", 62, FAILTEST_OPEN
			
 
				+#define URANDOM_READ		"open.c", 42, FAILTEST_READ
			
 
				+
			
 
				+bool exit_check_log(struct tlist_calls *history);
			
 
				+bool failmatch(const struct failtest_call *call,
			
 
				+	       const char *file, int line, enum failtest_call_type type);
			
 
				+enum failtest_result block_repeat_failures(struct tlist_calls *history);
			
 
				+
			
 
				+/* Set this to suppress failure. */
			
 
				+extern bool failtest_suppress;
			
 
				+
			
 
				+#endif /* NTDB_TEST_FAILTEST_HELPER_H */
			
--- a/ccan/ntdb/test/helpapi-external-agent.c
+++ b/ccan/ntdb/test/helpapi-external-agent.c
@@ -0,0 +1,7 @@
 
				+#include "external-agent.h"
			
 
				+
			
 
				+/* This isn't possible via the ntdb API, but this makes it link. */
			
 
				+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return FAILED;
			
 
				+}
			
--- a/ccan/ntdb/test/helprun-external-agent.c
+++ b/ccan/ntdb/test/helprun-external-agent.c
@@ -0,0 +1,7 @@
 
				+#include "external-agent.h"
			
 
				+#include "private.h"
			
 
				+
			
 
				+enum agent_return external_agent_needs_rec(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	return ntdb_needs_recovery(ntdb) ? SUCCESS : FAILED;
			
 
				+}
			
--- a/ccan/ntdb/test/helprun-layout.c
+++ b/ccan/ntdb/test/helprun-layout.c
@@ -0,0 +1,362 @@
 
				+/* NTDB tools to create various canned database layouts. */
			
 
				+#include "layout.h"
			
 
				+#include <stdlib.h>
			
 
				+#include <string.h>
			
 
				+#include <assert.h>
			
 
				+#include <ccan/err/err.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+struct ntdb_layout *new_ntdb_layout(void)
			
 
				+{
			
 
				+	struct ntdb_layout *layout = malloc(sizeof(*layout));
			
 
				+	layout->num_elems = 0;
			
 
				+	layout->elem = NULL;
			
 
				+	return layout;
			
 
				+}
			
 
				+
			
 
				+static void add(struct ntdb_layout *layout, union ntdb_layout_elem elem)
			
 
				+{
			
 
				+	layout->elem = realloc(layout->elem,
			
 
				+			       sizeof(layout->elem[0])
			
 
				+			       * (layout->num_elems+1));
			
 
				+	layout->elem[layout->num_elems++] = elem;
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_add_freetable(struct ntdb_layout *layout)
			
 
				+{
			
 
				+	union ntdb_layout_elem elem;
			
 
				+	elem.base.type = FREETABLE;
			
 
				+	add(layout, elem);
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_add_free(struct ntdb_layout *layout, ntdb_len_t len,
			
 
				+			 unsigned ftable)
			
 
				+{
			
 
				+	union ntdb_layout_elem elem;
			
 
				+	elem.base.type = FREE;
			
 
				+	elem.free.len = len;
			
 
				+	elem.free.ftable_num = ftable;
			
 
				+	add(layout, elem);
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_add_capability(struct ntdb_layout *layout,
			
 
				+			       uint64_t type,
			
 
				+			       bool write_breaks,
			
 
				+			       bool check_breaks,
			
 
				+			       bool open_breaks,
			
 
				+			       ntdb_len_t extra)
			
 
				+{
			
 
				+	union ntdb_layout_elem elem;
			
 
				+	elem.base.type = CAPABILITY;
			
 
				+	elem.capability.type = type;
			
 
				+	if (write_breaks)
			
 
				+		elem.capability.type |= NTDB_CAP_NOWRITE;
			
 
				+	if (open_breaks)
			
 
				+		elem.capability.type |= NTDB_CAP_NOOPEN;
			
 
				+	if (check_breaks)
			
 
				+		elem.capability.type |= NTDB_CAP_NOCHECK;
			
 
				+	elem.capability.extra = extra;
			
 
				+	add(layout, elem);
			
 
				+}
			
 
				+
			
 
				+static NTDB_DATA dup_key(NTDB_DATA key)
			
 
				+{
			
 
				+	NTDB_DATA ret;
			
 
				+	ret.dsize = key.dsize;
			
 
				+	ret.dptr = malloc(ret.dsize);
			
 
				+	memcpy(ret.dptr, key.dptr, ret.dsize);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_add_used(struct ntdb_layout *layout,
			
 
				+			 NTDB_DATA key, NTDB_DATA data,
			
 
				+			 ntdb_len_t extra)
			
 
				+{
			
 
				+	union ntdb_layout_elem elem;
			
 
				+	elem.base.type = DATA;
			
 
				+	elem.used.key = dup_key(key);
			
 
				+	elem.used.data = dup_key(data);
			
 
				+	elem.used.extra = extra;
			
 
				+	add(layout, elem);
			
 
				+}
			
 
				+
			
 
				+static ntdb_len_t free_record_len(ntdb_len_t len)
			
 
				+{
			
 
				+	return sizeof(struct ntdb_used_record) + len;
			
 
				+}
			
 
				+
			
 
				+static ntdb_len_t data_record_len(struct tle_used *used)
			
 
				+{
			
 
				+	ntdb_len_t len;
			
 
				+	len = sizeof(struct ntdb_used_record)
			
 
				+		+ used->key.dsize + used->data.dsize + used->extra;
			
 
				+	assert(len >= sizeof(struct ntdb_free_record));
			
 
				+	return len;
			
 
				+}
			
 
				+
			
 
				+static ntdb_len_t capability_len(struct tle_capability *cap)
			
 
				+{
			
 
				+	return sizeof(struct ntdb_capability) + cap->extra;
			
 
				+}
			
 
				+
			
 
				+static ntdb_len_t freetable_len(struct tle_freetable *ftable)
			
 
				+{
			
 
				+	return sizeof(struct ntdb_freetable);
			
 
				+}
			
 
				+
			
 
				+static void set_free_record(void *mem, ntdb_len_t len)
			
 
				+{
			
 
				+	/* We do all the work in add_to_freetable */
			
 
				+}
			
 
				+
			
 
				+static void add_zero_pad(struct ntdb_used_record *u, size_t len, size_t extra)
			
 
				+{
			
 
				+	if (extra)
			
 
				+		((char *)(u + 1))[len] = '\0';
			
 
				+}
			
 
				+
			
 
				+static void set_data_record(void *mem, struct ntdb_context *ntdb,
			
 
				+			    struct tle_used *used)
			
 
				+{
			
 
				+	struct ntdb_used_record *u = mem;
			
 
				+
			
 
				+	set_header(ntdb, u, NTDB_USED_MAGIC, used->key.dsize, used->data.dsize,
			
 
				+		   used->key.dsize + used->data.dsize + used->extra);
			
 
				+	memcpy(u + 1, used->key.dptr, used->key.dsize);
			
 
				+	memcpy((char *)(u + 1) + used->key.dsize,
			
 
				+	       used->data.dptr, used->data.dsize);
			
 
				+	add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra);
			
 
				+}
			
 
				+
			
 
				+static void set_capability(void *mem, struct ntdb_context *ntdb,
			
 
				+			   struct tle_capability *cap, struct ntdb_header *hdr,
			
 
				+			   ntdb_off_t last_cap)
			
 
				+{
			
 
				+	struct ntdb_capability *c = mem;
			
 
				+	ntdb_len_t len = sizeof(*c) - sizeof(struct ntdb_used_record) + cap->extra;
			
 
				+
			
 
				+	c->type = cap->type;
			
 
				+	c->next = 0;
			
 
				+	set_header(ntdb, &c->hdr, NTDB_CAP_MAGIC, 0, len, len);
			
 
				+
			
 
				+	/* Append to capability list. */
			
 
				+	if (!last_cap) {
			
 
				+		hdr->capabilities = cap->base.off;
			
 
				+	} else {
			
 
				+		c = (struct ntdb_capability *)((char *)hdr + last_cap);
			
 
				+		c->next = cap->base.off;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void set_freetable(void *mem, struct ntdb_context *ntdb,
			
 
				+			 struct tle_freetable *freetable, struct ntdb_header *hdr,
			
 
				+			 ntdb_off_t last_ftable)
			
 
				+{
			
 
				+	struct ntdb_freetable *ftable = mem;
			
 
				+	memset(ftable, 0, sizeof(*ftable));
			
 
				+	set_header(ntdb, &ftable->hdr, NTDB_FTABLE_MAGIC, 0,
			
 
				+			sizeof(*ftable) - sizeof(ftable->hdr),
			
 
				+			sizeof(*ftable) - sizeof(ftable->hdr));
			
 
				+
			
 
				+	if (last_ftable) {
			
 
				+		ftable = (struct ntdb_freetable *)((char *)hdr + last_ftable);
			
 
				+		ftable->next = freetable->base.off;
			
 
				+	} else {
			
 
				+		hdr->free_table = freetable->base.off;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void add_to_freetable(struct ntdb_context *ntdb,
			
 
				+			     ntdb_off_t eoff,
			
 
				+			     ntdb_off_t elen,
			
 
				+			     unsigned ftable,
			
 
				+			     struct tle_freetable *freetable)
			
 
				+{
			
 
				+	ntdb->ftable_off = freetable->base.off;
			
 
				+	ntdb->ftable = ftable;
			
 
				+	add_free_record(ntdb, eoff, sizeof(struct ntdb_used_record) + elen,
			
 
				+			NTDB_LOCK_WAIT, false);
			
 
				+}
			
 
				+
			
 
				+/* Get bits from a value. */
			
 
				+static uint32_t bits(uint64_t val, unsigned start, unsigned num)
			
 
				+{
			
 
				+	assert(num <= 32);
			
 
				+	return (val >> start) & ((1U << num) - 1);
			
 
				+}
			
 
				+
			
 
				+static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
			
 
				+				ntdb_off_t new_off, uint32_t hash)
			
 
				+{
			
 
				+	ntdb_off_t extra;
			
 
				+
			
 
				+	assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
			
 
				+	assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
			
 
				+	/* We pack extra hash bits into the upper bits of the offset. */
			
 
				+	extra = bits(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
			
 
				+	extra <<= (64 - NTDB_OFF_UPPER_STEAL);
			
 
				+
			
 
				+	return new_off | extra;
			
 
				+}
			
 
				+
			
 
				+static ntdb_off_t hbucket_off(ntdb_len_t idx)
			
 
				+{
			
 
				+	return sizeof(struct ntdb_header) + sizeof(struct ntdb_used_record)
			
 
				+		+ idx * sizeof(ntdb_off_t);
			
 
				+}
			
 
				+
			
 
				+/* FIXME: Our hash table handling here is primitive: we don't expand! */
			
 
				+static void add_to_hashtable(struct ntdb_context *ntdb,
			
 
				+			     ntdb_off_t eoff,
			
 
				+			     NTDB_DATA key)
			
 
				+{
			
 
				+	ntdb_off_t b_off;
			
 
				+	uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
			
 
				+
			
 
				+	b_off = hbucket_off(h & ((1 << ntdb->hash_bits)-1));
			
 
				+	if (ntdb_read_off(ntdb, b_off) != 0)
			
 
				+		abort();
			
 
				+
			
 
				+	ntdb_write_off(ntdb, b_off, encode_offset(ntdb, eoff, h));
			
 
				+}
			
 
				+
			
 
				+static struct tle_freetable *find_ftable(struct ntdb_layout *layout, unsigned num)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	for (i = 0; i < layout->num_elems; i++) {
			
 
				+		if (layout->elem[i].base.type != FREETABLE)
			
 
				+			continue;
			
 
				+		if (num == 0)
			
 
				+			return &layout->elem[i].ftable;
			
 
				+		num--;
			
 
				+	}
			
 
				+	abort();
			
 
				+}
			
 
				+
			
 
				+/* FIXME: Support NTDB_CONVERT */
			
 
				+struct ntdb_context *ntdb_layout_get(struct ntdb_layout *layout,
			
 
				+				   void (*freefn)(void *),
			
 
				+				   union ntdb_attribute *attr)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	ntdb_off_t off, hdrlen, len, last_ftable, last_cap;
			
 
				+	char *mem;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+
			
 
				+	/* Now populate our header, cribbing from a real NTDB header. */
			
 
				+	ntdb = ntdb_open("layout", NTDB_INTERNAL, O_RDWR, 0, attr);
			
 
				+
			
 
				+	off = sizeof(struct ntdb_header) + sizeof(struct ntdb_used_record)
			
 
				+		+ (sizeof(ntdb_off_t) << ntdb->hash_bits);
			
 
				+	hdrlen = off;
			
 
				+
			
 
				+	/* First pass of layout: calc lengths */
			
 
				+	for (i = 0; i < layout->num_elems; i++) {
			
 
				+		union ntdb_layout_elem *e = &layout->elem[i];
			
 
				+		e->base.off = off;
			
 
				+		switch (e->base.type) {
			
 
				+		case FREETABLE:
			
 
				+			len = freetable_len(&e->ftable);
			
 
				+			break;
			
 
				+		case FREE:
			
 
				+			len = free_record_len(e->free.len);
			
 
				+			break;
			
 
				+		case DATA:
			
 
				+			len = data_record_len(&e->used);
			
 
				+			break;
			
 
				+		case CAPABILITY:
			
 
				+			len = capability_len(&e->capability);
			
 
				+			break;
			
 
				+		default:
			
 
				+			abort();
			
 
				+		}
			
 
				+		off += len;
			
 
				+	}
			
 
				+
			
 
				+	mem = malloc(off);
			
 
				+	/* Fill with some weird pattern. */
			
 
				+	memset(mem, 0x99, off);
			
 
				+	memcpy(mem, ntdb->file->map_ptr, hdrlen);
			
 
				+
			
 
				+	/* Mug the ntdb we have to make it use this. */
			
 
				+	freefn(ntdb->file->map_ptr);
			
 
				+	ntdb->file->map_ptr = mem;
			
 
				+	ntdb->file->map_size = off;
			
 
				+
			
 
				+	last_ftable = 0;
			
 
				+	last_cap = 0;
			
 
				+	for (i = 0; i < layout->num_elems; i++) {
			
 
				+		union ntdb_layout_elem *e = &layout->elem[i];
			
 
				+		switch (e->base.type) {
			
 
				+		case FREETABLE:
			
 
				+			set_freetable(mem + e->base.off, ntdb, &e->ftable,
			
 
				+				     (struct ntdb_header *)mem, last_ftable);
			
 
				+			last_ftable = e->base.off;
			
 
				+			break;
			
 
				+		case FREE:
			
 
				+			set_free_record(mem + e->base.off, e->free.len);
			
 
				+			break;
			
 
				+		case DATA:
			
 
				+			set_data_record(mem + e->base.off, ntdb, &e->used);
			
 
				+			break;
			
 
				+		case CAPABILITY:
			
 
				+			set_capability(mem + e->base.off, ntdb, &e->capability,
			
 
				+				       (struct ntdb_header *)mem, last_cap);
			
 
				+			last_cap = e->base.off;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	/* Must have a free table! */
			
 
				+	assert(last_ftable);
			
 
				+
			
 
				+	/* Now fill the free and hash tables. */
			
 
				+	for (i = 0; i < layout->num_elems; i++) {
			
 
				+		union ntdb_layout_elem *e = &layout->elem[i];
			
 
				+		switch (e->base.type) {
			
 
				+		case FREE:
			
 
				+			add_to_freetable(ntdb, e->base.off, e->free.len,
			
 
				+					 e->free.ftable_num,
			
 
				+					 find_ftable(layout, e->free.ftable_num));
			
 
				+			break;
			
 
				+		case DATA:
			
 
				+			add_to_hashtable(ntdb, e->base.off, e->used.key);
			
 
				+			break;
			
 
				+		default:
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	ntdb->ftable_off = find_ftable(layout, 0)->base.off;
			
 
				+	return ntdb;
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_write(struct ntdb_layout *layout, void (*freefn)(void *),
			
 
				+		       union ntdb_attribute *attr, const char *filename)
			
 
				+{
			
 
				+	struct ntdb_context *ntdb = ntdb_layout_get(layout, freefn, attr);
			
 
				+	int fd;
			
 
				+
			
 
				+	fd = open(filename, O_WRONLY|O_TRUNC|O_CREAT,  0600);
			
 
				+	if (fd < 0)
			
 
				+		err(1, "opening %s for writing", filename);
			
 
				+	if (write(fd, ntdb->file->map_ptr, ntdb->file->map_size)
			
 
				+	    != ntdb->file->map_size)
			
 
				+		err(1, "writing %s", filename);
			
 
				+	close(fd);
			
 
				+	ntdb_close(ntdb);
			
 
				+}
			
 
				+
			
 
				+void ntdb_layout_free(struct ntdb_layout *layout)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; i < layout->num_elems; i++) {
			
 
				+		if (layout->elem[i].base.type == DATA) {
			
 
				+			free(layout->elem[i].used.key.dptr);
			
 
				+			free(layout->elem[i].used.data.dptr);
			
 
				+		}
			
 
				+	}
			
 
				+	free(layout->elem);
			
 
				+	free(layout);
			
 
				+}
			
--- a/ccan/ntdb/test/layout.h
+++ b/ccan/ntdb/test/layout.h
@@ -0,0 +1,79 @@
 
				+#ifndef NTDB_TEST_LAYOUT_H
			
 
				+#define NTDB_TEST_LAYOUT_H
			
 
				+#include "private.h"
			
 
				+
			
 
				+struct ntdb_layout *new_ntdb_layout(void);
			
 
				+void ntdb_layout_add_freetable(struct ntdb_layout *layout);
			
 
				+void ntdb_layout_add_free(struct ntdb_layout *layout, ntdb_len_t len,
			
 
				+			 unsigned ftable);
			
 
				+void ntdb_layout_add_used(struct ntdb_layout *layout,
			
 
				+			 NTDB_DATA key, NTDB_DATA data,
			
 
				+			 ntdb_len_t extra);
			
 
				+void ntdb_layout_add_capability(struct ntdb_layout *layout,
			
 
				+			       uint64_t type,
			
 
				+			       bool write_breaks,
			
 
				+			       bool check_breaks,
			
 
				+			       bool open_breaks,
			
 
				+			       ntdb_len_t extra);
			
 
				+
			
 
				+#if 0 /* FIXME: Allow allocation of subtables */
			
 
				+void ntdb_layout_add_hashtable(struct ntdb_layout *layout,
			
 
				+			      int htable_parent, /* -1 == toplevel */
			
 
				+			      unsigned int bucket,
			
 
				+			      ntdb_len_t extra);
			
 
				+#endif
			
 
				+/* freefn is needed if we're using failtest_free. */
			
 
				+struct ntdb_context *ntdb_layout_get(struct ntdb_layout *layout,
			
 
				+				   void (*freefn)(void *),
			
 
				+				   union ntdb_attribute *attr);
			
 
				+void ntdb_layout_write(struct ntdb_layout *layout, void (*freefn)(void *),
			
 
				+		       union ntdb_attribute *attr, const char *filename);
			
 
				+
			
 
				+void ntdb_layout_free(struct ntdb_layout *layout);
			
 
				+
			
 
				+enum layout_type {
			
 
				+	FREETABLE, FREE, DATA, CAPABILITY
			
 
				+};
			
 
				+
			
 
				+/* Shared by all union members. */
			
 
				+struct tle_base {
			
 
				+	enum layout_type type;
			
 
				+	ntdb_off_t off;
			
 
				+};
			
 
				+
			
 
				+struct tle_freetable {
			
 
				+	struct tle_base base;
			
 
				+};
			
 
				+
			
 
				+struct tle_free {
			
 
				+	struct tle_base base;
			
 
				+	ntdb_len_t len;
			
 
				+	unsigned ftable_num;
			
 
				+};
			
 
				+
			
 
				+struct tle_used {
			
 
				+	struct tle_base base;
			
 
				+	NTDB_DATA key;
			
 
				+	NTDB_DATA data;
			
 
				+	ntdb_len_t extra;
			
 
				+};
			
 
				+
			
 
				+struct tle_capability {
			
 
				+	struct tle_base base;
			
 
				+	uint64_t type;
			
 
				+	ntdb_len_t extra;
			
 
				+};
			
 
				+
			
 
				+union ntdb_layout_elem {
			
 
				+	struct tle_base base;
			
 
				+	struct tle_freetable ftable;
			
 
				+	struct tle_free free;
			
 
				+	struct tle_used used;
			
 
				+	struct tle_capability capability;
			
 
				+};
			
 
				+
			
 
				+struct ntdb_layout {
			
 
				+	unsigned int num_elems;
			
 
				+	union ntdb_layout_elem *elem;
			
 
				+};
			
 
				+#endif /* NTDB_TEST_LAYOUT_H */
			
--- a/ccan/ntdb/test/lock-tracking.c
+++ b/ccan/ntdb/test/lock-tracking.c
@@ -0,0 +1,147 @@
 
				+/* We save the locks so we can reacquire them. */
			
 
				+#include "private.h" /* For NTDB_HASH_LOCK_START, etc. */
			
 
				+#include <unistd.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <stdarg.h>
			
 
				+#include <stdlib.h>
			
 
				+#include "tap-interface.h"
			
 
				+#include "lock-tracking.h"
			
 
				+
			
 
				+struct lock {
			
 
				+	struct lock *next;
			
 
				+	unsigned int off;
			
 
				+	unsigned int len;
			
 
				+	int type;
			
 
				+};
			
 
				+static struct lock *locks;
			
 
				+int locking_errors = 0;
			
 
				+bool suppress_lockcheck = false;
			
 
				+bool nonblocking_locks;
			
 
				+int locking_would_block = 0;
			
 
				+void (*unlock_callback)(int fd);
			
 
				+
			
 
				+/* Test stand-in for fcntl(): forwards the call and, for F_SETLK/F_SETLKW,
			
 
				+ * mirrors successful lock changes in `locks`; suspicious requests bump
			
 
				+ * locking_errors unless suppress_lockcheck.  Returns fcntl()'s result. */
			
 
				+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
			
 
				+{
			
 
				+	va_list ap;
			
 
				+	int ret, arg3;
			
 
				+	struct flock *fl;
			
 
				+	bool may_block = false;
			
 
				+
			
 
				+	if (cmd != F_SETLK && cmd != F_SETLKW) {
			
 
				+		/* This may be totally bogus, but we don't know in general. */
			
 
				+		va_start(ap, cmd);
			
 
				+		arg3 = va_arg(ap, int);
			
 
				+		va_end(ap);
			
 
				+		return fcntl(fd, cmd, arg3);
			
 
				+	}
			
 
				+	va_start(ap, cmd);
			
 
				+	fl = va_arg(ap, struct flock *);
			
 
				+	va_end(ap);
			
 
				+
			
 
				+	/* With nonblocking_locks set, a waiting lock becomes a try-lock. */
			
 
				+	if (cmd == F_SETLKW && nonblocking_locks) {
			
 
				+		cmd = F_SETLK;
			
 
				+		may_block = true;
			
 
				+	}
			
 
				+	ret = fcntl(fd, cmd, fl);
			
 
				+
			
 
				+	/* Detect when we failed, but might have been OK if we waited. */
			
 
				+	if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES))
			
 
				+		locking_would_block++;
			
 
				+
			
 
				+	if (fl->l_type == F_UNLCK) {
			
 
				+		struct lock **l, *old = NULL;
			
 
				+		/* Drop the exactly-matching entry, but only if unlock worked. */
			
 
				+		for (l = &locks; *l; l = &(*l)->next) {
			
 
				+			if ((*l)->off == fl->l_start
			
 
				+			    && (*l)->len == fl->l_len) {
			
 
				+				if (ret == 0) {
			
 
				+					old = *l;
			
 
				+					*l = (*l)->next;
			
 
				+					free(old);
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+		if (!old && !suppress_lockcheck) {
			
 
				+			diag("Unknown unlock %u@%u - %i",
			
 
				+			     (unsigned)fl->l_len, (unsigned)fl->l_start, ret);
			
 
				+			locking_errors++;
			
 
				+		}
			
 
				+	} else {
			
 
				+		struct lock *new, *i;
			
 
				+		unsigned int fl_end = fl->l_start + fl->l_len;
			
 
				+		if (fl->l_len == 0) /* l_len 0 runs to EOF: treat as unbounded */
			
 
				+			fl_end = (unsigned int)-1;
			
 
				+
			
 
				+		/* Check for overlaps: we shouldn't do this. */
			
 
				+		for (i = locks; i; i = i->next) {
			
 
				+			unsigned int i_end = i->off + i->len;
			
 
				+			if (i->len == 0)
			
 
				+				i_end = (unsigned int)-1;
			
 
				+
			
 
				+			if (fl->l_start >= i->off && fl->l_start < i_end)
			
 
				+				break;
			
 
				+			if (fl_end > i->off && fl_end < i_end)
			
 
				+				break;
			
 
				+
			
 
				+			/* ntdb_allrecord_lock does this, handle adjacent: */
			
 
				+			if (fl->l_start > NTDB_HASH_LOCK_START
			
 
				+			    && fl->l_start == i_end && fl->l_type == i->type) {
			
 
				+				if (ret == 0)
			
 
				+					i->len = fl->l_len ? i->len + fl->l_len : 0;
			
 
				+				goto done;
			
 
				+			}
			
 
				+		}
			
 
				+		if (i) {
			
 
				+			/* Special case: upgrade of allrecord lock. */
			
 
				+			if (i->type == F_RDLCK && fl->l_type == F_WRLCK
			
 
				+			    && i->off == NTDB_HASH_LOCK_START
			
 
				+			    && fl->l_start == NTDB_HASH_LOCK_START
			
 
				+			    && i->len == 0 && fl->l_len == 0) {
			
 
				+				if (ret == 0)
			
 
				+					i->type = F_WRLCK;
			
 
				+				goto done;
			
 
				+			}
			
 
				+			if (!suppress_lockcheck) {
			
 
				+				diag("%s lock %u@%u overlaps %u@%u",
			
 
				+				     fl->l_type == F_WRLCK ? "write" : "read",
			
 
				+				     (unsigned)fl->l_len, (unsigned)fl->l_start,
			
 
				+				     i->len, i->off);
			
 
				+				locking_errors++;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if (ret == 0) {
			
 
				+			new = malloc(sizeof *new);
			
 
				+			if (!new) {
			
 
				+				diag("Out of memory tracking lock");
			
 
				+				abort();
			
 
				+			}
			
 
				+			new->off = fl->l_start;
			
 
				+			new->len = fl->l_len;
			
 
				+			new->type = fl->l_type;
			
 
				+			new->next = locks;
			
 
				+			locks = new;
			
 
				+		}
			
 
				+	}
			
 
				+done:
			
 
				+	if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
			
 
				+		unlock_callback(fd);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+unsigned int forget_locking(void)
			
 
				+{
			
 
				+	unsigned int dropped = 0;
			
 
				+	struct lock *following;
			
 
				+	/* Free every tracked lock, counting how many were still recorded. */
			
 
				+	for (; locks; locks = following, dropped++) {
			
 
				+		following = locks->next;
			
 
				+		free(locks);
			
 
				+	}
			
 
				+	return dropped;
			
 
				+}
			
--- a/ccan/ntdb/test/lock-tracking.h
+++ b/ccan/ntdb/test/lock-tracking.h
@@ -0,0 +1,25 @@
 
				+#ifndef LOCK_TRACKING_H
			
 
				+#define LOCK_TRACKING_H
			
 
				+#include <stdbool.h>
			
 
				+
			
 
				+/* Set this if you want a callback after fcntl unlock. */
			
 
				+extern void (*unlock_callback)(int fd);
			
 
				+
			
 
				+/* Replacement fcntl. */
			
 
				+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
			
 
				+
			
 
				+/* Discard locking info: returns number of locks outstanding. */
			
 
				+unsigned int forget_locking(void);
			
 
				+
			
 
				+/* Number of errors in locking. */
			
 
				+extern int locking_errors;
			
 
				+
			
 
				+/* Suppress lock checking. */
			
 
				+extern bool suppress_lockcheck;
			
 
				+
			
 
				+/* Make all locks non-blocking. */
			
 
				+extern bool nonblocking_locks;
			
 
				+
			
 
				+/* Number of times we failed a lock because we made it non-blocking. */
			
 
				+extern int locking_would_block;
			
 
				+#endif /* LOCK_TRACKING_H */
			
--- a/ccan/ntdb/test/logging.c
+++ b/ccan/ntdb/test/logging.c
@@ -0,0 +1,30 @@
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+unsigned tap_log_messages;
			
 
				+const char *log_prefix = "";
			
 
				+char *log_last = NULL;
			
 
				+bool suppress_logging;
			
 
				+
			
 
				+union ntdb_attribute tap_log_attr = {
			
 
				+	.log = { .base = { .attr = NTDB_ATTRIBUTE_LOG },
			
 
				+		 .fn = tap_log_fn }
			
 
				+};
			
 
				+
			
 
				+/* NTDB log hook for tests: prints the message via diag(), keeps a copy of
			
 
				+ * the latest one in log_last and counts it; no-op if suppress_logging. */
			
 
				+void tap_log_fn(struct ntdb_context *ntdb,
			
 
				+		enum ntdb_log_level level,
			
 
				+		enum NTDB_ERROR ecode,
			
 
				+		const char *message, void *priv)
			
 
				+{
			
 
				+	if (suppress_logging)
			
 
				+		return;
			
 
				+	diag("ntdb log level %u: %s: %s%s",
			
 
				+	     (unsigned)level, ntdb_errorstr(ecode), log_prefix, message);
			
 
				+	free(log_last); /* free(NULL) is a no-op, so no guard is needed */
			
 
				+	log_last = strdup(message);
			
 
				+	tap_log_messages++;
			
 
				+}
			
--- a/ccan/ntdb/test/logging.h
+++ b/ccan/ntdb/test/logging.h
@@ -0,0 +1,17 @@
 
				+#ifndef NTDB_TEST_LOGGING_H
			
 
				+#define NTDB_TEST_LOGGING_H
			
 
				+#include "ntdb.h"
			
 
				+#include <stdbool.h>
			
 
				+#include <string.h>
			
 
				+
			
 
				+extern bool suppress_logging;
			
 
				+extern const char *log_prefix;
			
 
				+extern unsigned tap_log_messages;
			
 
				+extern union ntdb_attribute tap_log_attr;
			
 
				+extern char *log_last;
			
 
				+
			
 
				+void tap_log_fn(struct ntdb_context *ntdb,
			
 
				+		enum ntdb_log_level level,
			
 
				+		enum NTDB_ERROR ecode,
			
 
				+		const char *message, void *priv);
			
 
				+#endif /* NTDB_TEST_LOGGING_H */
			
--- a/ccan/ntdb/test/no-fsync.h
+++ b/ccan/ntdb/test/no-fsync.h
@@ -0,0 +1,6 @@
 
				+#ifndef NTDB_NO_FSYNC_H
			
 
				+#define NTDB_NO_FSYNC_H
			
 
				+/* Obey $TDB_NO_FSYNC, a bit like tdb does (only note our NTDB_NOSYNC
			
 
				+ * does less) */
			
 
				+#define MAYBE_NOSYNC (getenv("TDB_NO_FSYNC") ? NTDB_NOSYNC : 0)
			
 
				+#endif
			
--- a/ccan/ntdb/test/ntdb-source.h
+++ b/ccan/ntdb/test/ntdb-source.h
@@ -0,0 +1,11 @@
 
				+#include "config.h"
			
 
				+#include "check.c"
			
 
				+#include "free.c"
			
 
				+#include "hash.c"
			
 
				+#include "io.c"
			
 
				+#include "lock.c"
			
 
				+#include "open.c"
			
 
				+#include "summary.c"
			
 
				+#include "ntdb.c"
			
 
				+#include "transaction.c"
			
 
				+#include "traverse.c"
			
--- a/ccan/ntdb/test/python-api.py
+++ b/ccan/ntdb/test/python-api.py
@@ -0,0 +1,154 @@
 
				+#!/usr/bin/env python
			
 
				+# Some simple tests for the Python bindings for TDB
			
 
				+# Note that this tests the interface of the Python bindings
			
 
				+# It does not test tdb itself.
			
 
				+#
			
 
				+# Copyright (C) 2007-2013 Jelmer Vernooij <jelmer@samba.org>
			
 
				+# Published under the GNU LGPLv3 or later
			
 
				+
			
 
				+import ntdb
			
 
				+from unittest import TestCase
			
 
				+import os, tempfile
			
 
				+
			
 
				+
			
 
				+class OpenTdbTests(TestCase):
			
 
				+
			
 
				+    def test_nonexistent_read(self):
			
 
				+        self.assertRaises(IOError, ntdb.Ntdb, "/some/nonexistent/file", 0,
			
 
				+                ntdb.DEFAULT, os.O_RDWR)
			
 
				+
			
 
				+class CloseTdbTests(TestCase):
			
 
				+
			
 
				+    def test_double_close(self):
			
 
				+        self.ntdb = ntdb.Ntdb(tempfile.mkstemp()[1], ntdb.DEFAULT,
			
 
				+                           os.O_CREAT|os.O_RDWR)
			
 
				+        self.assertNotEqual(None, self.ntdb)
			
 
				+
			
 
				+        # ensure that double close does not crash python
			
 
				+        self.ntdb.close()
			
 
				+        self.ntdb.close()
			
 
				+
			
 
				+        # Check that further operations do not crash python
			
 
				+        self.assertRaises(RuntimeError, lambda: self.ntdb.transaction_start())
			
 
				+
			
 
				+        self.assertRaises(RuntimeError, lambda: self.ntdb["bar"])
			
 
				+
			
 
				+
			
 
				+class InternalTdbTests(TestCase):
			
 
				+
			
 
				+    def test_repr(self):
			
 
				+        self.ntdb = ntdb.Ntdb()
			
 
				+
			
 
				+        # repr used to crash on internal db
			
 
				+        self.assertEquals(repr(self.ntdb), "Ntdb(<internal>)")
			
 
				+
			
 
				+
			
 
				+class SimpleTdbTests(TestCase):
			
 
				+
			
 
				+    def setUp(self):
			
 
				+        super(SimpleTdbTests, self).setUp()
			
 
				+        self.ntdb = ntdb.Ntdb(tempfile.mkstemp()[1], ntdb.DEFAULT,
			
 
				+                           os.O_CREAT|os.O_RDWR)
			
 
				+        self.assertNotEqual(None, self.ntdb)
			
 
				+
			
 
				+    def tearDown(self):
			
 
				+        del self.ntdb
			
 
				+
			
 
				+    def test_repr(self):
			
 
				+        self.assertTrue(repr(self.ntdb).startswith("Ntdb('"))
			
 
				+
			
 
				+    def test_lockall(self):
			
 
				+        self.ntdb.lock_all()
			
 
				+
			
 
				+    def test_unlockall(self):
			
 
				+        self.ntdb.lock_all()
			
 
				+        self.ntdb.unlock_all()
			
 
				+
			
 
				+    def test_lockall_read(self):
			
 
				+        self.ntdb.read_lock_all()
			
 
				+        self.ntdb.read_unlock_all()
			
 
				+
			
 
				+    def test_store(self):
			
 
				+        self.ntdb.store("bar", "bla")
			
 
				+        self.assertEquals("bla", self.ntdb.get("bar"))
			
 
				+
			
 
				+    def test_getitem(self):
			
 
				+        self.ntdb["bar"] = "foo"
			
 
				+        self.assertEquals("foo", self.ntdb["bar"])
			
 
				+
			
 
				+    def test_delete(self):
			
 
				+        self.ntdb["bar"] = "foo"
			
 
				+        del self.ntdb["bar"]
			
 
				+        self.assertRaises(KeyError, lambda: self.ntdb["bar"])
			
 
				+
			
 
				+    def test_contains(self):
			
 
				+        self.ntdb["bla"] = "bloe"
			
 
				+        self.assertTrue("bla" in self.ntdb)
			
 
				+
			
 
				+    def test_keyerror(self):
			
 
				+        self.assertRaises(KeyError, lambda: self.ntdb["bla"])
			
 
				+
			
 
				+    def test_name(self):
			
 
				+        self.ntdb.filename
			
 
				+
			
 
				+    def test_iterator(self):
			
 
				+        self.ntdb["bla"] = "1"
			
 
				+        self.ntdb["brainslug"] = "2"
			
 
				+        l = list(self.ntdb)
			
 
				+        l.sort()
			
 
				+        self.assertEquals(["bla", "brainslug"], l)
			
 
				+
			
 
				+    def test_transaction_cancel(self):
			
 
				+        self.ntdb["bloe"] = "2"
			
 
				+        self.ntdb.transaction_start()
			
 
				+        self.ntdb["bloe"] = "1"
			
 
				+        self.ntdb.transaction_cancel()
			
 
				+        self.assertEquals("2", self.ntdb["bloe"])
			
 
				+
			
 
				+    def test_transaction_commit(self):
			
 
				+        self.ntdb["bloe"] = "2"
			
 
				+        self.ntdb.transaction_start()
			
 
				+        self.ntdb["bloe"] = "1"
			
 
				+        self.ntdb.transaction_commit()
			
 
				+        self.assertEquals("1", self.ntdb["bloe"])
			
 
				+
			
 
				+    def test_transaction_prepare_commit(self):
			
 
				+        self.ntdb["bloe"] = "2"
			
 
				+        self.ntdb.transaction_start()
			
 
				+        self.ntdb["bloe"] = "1"
			
 
				+        self.ntdb.transaction_prepare_commit()
			
 
				+        self.ntdb.transaction_commit()
			
 
				+        self.assertEquals("1", self.ntdb["bloe"])
			
 
				+
			
 
				+    def test_iterkeys(self):
			
 
				+        self.ntdb["bloe"] = "2"
			
 
				+        self.ntdb["bla"] = "25"
			
 
				+        i = self.ntdb.iterkeys()
			
 
				+        self.assertEquals(set(["bloe", "bla"]), set([i.next(), i.next()]))
			
 
				+
			
 
				+    def test_clear(self):
			
 
				+        self.ntdb["bloe"] = "2"
			
 
				+        self.ntdb["bla"] = "25"
			
 
				+        self.assertEquals(2, len(list(self.ntdb)))
			
 
				+        self.ntdb.clear()
			
 
				+        self.assertEquals(0, len(list(self.ntdb)))
			
 
				+
			
 
				+    def test_len(self):
			
 
				+        self.assertEquals(0, len(list(self.ntdb)))
			
 
				+        self.ntdb["entry"] = "value"
			
 
				+        self.assertEquals(1, len(list(self.ntdb)))
			
 
				+
			
 
				+    def test_add_flags(self):
			
 
				+        self.ntdb.add_flag(ntdb.NOMMAP)
			
 
				+        self.ntdb.remove_flag(ntdb.NOMMAP)
			
 
				+
			
 
				+
			
 
				+class VersionTests(TestCase):
			
 
				+
			
 
				+    def test_present(self):
			
 
				+        self.assertTrue(isinstance(ntdb.__version__, str))
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    import unittest
			
 
				+    unittest.TestProgram()
			
--- a/ccan/ntdb/test/run-001-encode.c
+++ b/ccan/ntdb/test/run-001-encode.c
@@ -0,0 +1,39 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	struct ntdb_context ntdb = { .log_fn = tap_log_fn };
			
 
				+
			
 
				+	plan_tests(64 + 32 + 48*5 + 1);
			
 
				+
			
 
				+	/* We should be able to encode any data value. */
			
 
				+	for (i = 0; i < 64; i++)
			
 
				+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, 0, 1ULL << i,
			
 
				+			       1ULL << i) == 0);
			
 
				+
			
 
				+	/* And any key and data with < 64 bits between them. */
			
 
				+	for (i = 0; i < 32; i++) {
			
 
				+		ntdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
			
 
				+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, klen, dlen,
			
 
				+			       klen + dlen)  == 0);
			
 
				+	}
			
 
				+
			
 
				+	/* We should neatly encode all values. */
			
 
				+	for (i = 0; i < 48; i++) {
			
 
				+		uint64_t klen = 1ULL << (i < 16 ? i : 15);
			
 
				+		uint64_t dlen = 1ULL << i;
			
 
				+		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
			
 
				+		ok1(set_header(&ntdb, &rec, NTDB_USED_MAGIC, klen, dlen,
			
 
				+			       klen+dlen+xlen) == 0);
			
 
				+		ok1(rec_key_length(&rec) == klen);
			
 
				+		ok1(rec_data_length(&rec) == dlen);
			
 
				+		ok1(rec_extra_padding(&rec) == xlen);
			
 
				+		ok1(rec_magic(&rec) == NTDB_USED_MAGIC);
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-001-fls.c
+++ b/ccan/ntdb/test/run-001-fls.c
@@ -0,0 +1,33 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+
			
 
				+static unsigned int dumb_fls(uint64_t num)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 63; i >= 0; i--) {
			
 
				+		if (num & (1ULL << i))
			
 
				+			break;
			
 
				+	}
			
 
				+	return i + 1;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+
			
 
				+	plan_tests(64 * 64 + 2);
			
 
				+
			
 
				+	ok1(fls64(0) == 0);
			
 
				+	ok1(dumb_fls(0) == 0);
			
 
				+
			
 
				+	for (i = 0; i < 64; i++) {
			
 
				+		for (j = 0; j < 64; j++) {
			
 
				+			uint64_t val = (1ULL << i) | (1ULL << j);
			
 
				+			ok(fls64(val) == dumb_fls(val),
			
 
				+			   "%llu -> %u should be %u", (long long)val,
			
 
				+			   fls64(val), dumb_fls(val));
			
 
				+		}
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-01-new_database.c
+++ b/ccan/ntdb/test/run-01-new_database.c
@@ -0,0 +1,40 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-new_database.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			failtest_exit(exit_status());
			
 
				+
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		failtest_suppress = false;
			
 
				+		ntdb_close(ntdb);
			
 
				+		if (!ok1(tap_log_messages == 0))
			
 
				+			break;
			
 
				+	}
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-02-expand.c
+++ b/ccan/ntdb/test/run-02-expand.c
@@ -0,0 +1,68 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	uint64_t val;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1);
			
 
				+
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		failtest_suppress = true;
			
 
				+		ntdb = ntdb_open("run-expand.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			break;
			
 
				+
			
 
				+		val = ntdb->file->map_size;
			
 
				+		/* Need some hash lock for expand. */
			
 
				+		ok1(ntdb_lock_hash(ntdb, 0, F_WRLCK) == 0);
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_expand(ntdb, 1) == 0)) {
			
 
				+			failtest_suppress = true;
			
 
				+			ntdb_close(ntdb);
			
 
				+			break;
			
 
				+		}
			
 
				+		failtest_suppress = true;
			
 
				+
			
 
				+		ok1(ntdb->file->map_size >= val + 1 * NTDB_EXTENSION_FACTOR);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, 0, F_WRLCK) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		val = ntdb->file->map_size;
			
 
				+		ok1(ntdb_lock_hash(ntdb, 0, F_WRLCK) == 0);
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_expand(ntdb, 1024) == 0)) {
			
 
				+			failtest_suppress = true;
			
 
				+			ntdb_close(ntdb);
			
 
				+			break;
			
 
				+		}
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_unlock_hash(ntdb, 0, F_WRLCK) == 0);
			
 
				+		ok1(ntdb->file->map_size >= val + 1024 * NTDB_EXTENSION_FACTOR);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-03-coalesce.c
+++ b/ccan/ntdb/test/run-03-coalesce.c
@@ -0,0 +1,178 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+#include "layout.h"
			
 
				+
			
 
				+static ntdb_len_t free_record_length(struct ntdb_context *ntdb, ntdb_off_t off)
			
 
				+{
			
 
				+	struct ntdb_free_record f;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+
			
 
				+	ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
			
 
				+	if (ecode != NTDB_SUCCESS)
			
 
				+		return ecode;
			
 
				+	if (frec_magic(&f) != NTDB_FREE_MAGIC)
			
 
				+		return NTDB_ERR_CORRUPT;
			
 
				+	return frec_len(&f);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	ntdb_off_t b_off, test;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	struct ntdb_layout *layout;
			
 
				+	NTDB_DATA data, key;
			
 
				+	ntdb_len_t len;
			
 
				+
			
 
				+	/* FIXME: Test NTDB_CONVERT */
			
 
				+	/* FIXME: Test lock order fail. */
			
 
				+
			
 
				+	plan_tests(42);
			
 
				+	data = ntdb_mkdata("world", 5);
			
 
				+	key = ntdb_mkdata("hello", 5);
			
 
				+
			
 
				+	/* No coalescing can be done due to EOF */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	len = 15560;
			
 
				+	ntdb_layout_add_free(layout, len, 0);
			
 
				+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
			
 
				+	/* NOMMAP is for lockcheck. */
			
 
				+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_RDWR, 0, &tap_log_attr);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == len);
			
 
				+
			
 
				+	/* Figure out which bucket free entry is. */
			
 
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
			
 
				+	/* Lock and fail to coalesce. */
			
 
				+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
			
 
				+	test = layout->elem[1].base.off;
			
 
				+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, len, &test)
			
 
				+	    == 0);
			
 
				+	ntdb_unlock_free_bucket(ntdb, b_off);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == len);
			
 
				+	ok1(test == layout->elem[1].base.off);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	/* No coalescing can be done due to used record */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_free(layout, 15528, 0);
			
 
				+	ntdb_layout_add_used(layout, key, data, 6);
			
 
				+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
			
 
				+	/* NOMMAP is for lockcheck. */
			
 
				+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_RDWR, 0, &tap_log_attr);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 15528);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Figure out which bucket free entry is. */
			
 
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(15528));
			
 
				+	/* Lock and fail to coalesce. */
			
 
				+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
			
 
				+	test = layout->elem[1].base.off;
			
 
				+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 15528, &test)
			
 
				+	    == 0);
			
 
				+	ntdb_unlock_free_bucket(ntdb, b_off);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 15528);
			
 
				+	ok1(test == layout->elem[1].base.off);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	/* Coalescing can be done due to two free records, then EOF */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_free(layout, 1024, 0);
			
 
				+	ntdb_layout_add_free(layout, 14520, 0);
			
 
				+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
			
 
				+	/* NOMMAP is for lockcheck. */
			
 
				+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_RDWR, 0, &tap_log_attr);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 14520);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Figure out which bucket (first) free entry is. */
			
 
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
			
 
				+	/* Lock and coalesce. */
			
 
				+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
			
 
				+	test = layout->elem[2].base.off;
			
 
				+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 14520);
			
 
				+	/* Should tell us it's erased this one... */
			
 
				+	ok1(test == NTDB_ERR_NOEXIST);
			
 
				+	ok1(ntdb->file->allrecord_lock.count == 0 && ntdb->file->num_lockrecs == 0);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 14520);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	/* Coalescing can be done due to two free records, then data */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_free(layout, 1024, 0);
			
 
				+	ntdb_layout_add_free(layout, 14488, 0);
			
 
				+	ntdb_layout_add_used(layout, key, data, 6);
			
 
				+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
			
 
				+	/* NOMMAP is for lockcheck. */
			
 
				+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_RDWR, 0, &tap_log_attr);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 14488);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Figure out which bucket free entry is. */
			
 
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
			
 
				+	/* Lock and coalesce. */
			
 
				+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
			
 
				+	test = layout->elem[2].base.off;
			
 
				+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 14488);
			
 
				+	ok1(ntdb->file->allrecord_lock.count == 0 && ntdb->file->num_lockrecs == 0);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 14488);
			
 
				+	ok1(test == NTDB_ERR_NOEXIST);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	/* Coalescing can be done due to three free records, then EOF */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_free(layout, 1024, 0);
			
 
				+	ntdb_layout_add_free(layout, 512, 0);
			
 
				+	ntdb_layout_add_free(layout, 13992, 0);
			
 
				+	ntdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.ntdb");
			
 
				+	/* NOMMAP is for lockcheck. */
			
 
				+	ntdb = ntdb_open("run-03-coalesce.ntdb", NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_RDWR, 0, &tap_log_attr);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off) == 1024);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[2].base.off) == 512);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[3].base.off) == 13992);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	/* Figure out which bucket free entry is. */
			
 
				+	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(1024));
			
 
				+	/* Lock and coalesce. */
			
 
				+	ok1(ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT) == 0);
			
 
				+	test = layout->elem[2].base.off;
			
 
				+	ok1(coalesce(ntdb, layout->elem[1].base.off, b_off, 1024, &test)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 512
			
 
				+	    + sizeof(struct ntdb_used_record) + 13992);
			
 
				+	ok1(ntdb->file->allrecord_lock.count == 0
			
 
				+	    && ntdb->file->num_lockrecs == 0);
			
 
				+	ok1(free_record_length(ntdb, layout->elem[1].base.off)
			
 
				+	    == 1024 + sizeof(struct ntdb_used_record) + 512
			
 
				+	    + sizeof(struct ntdb_used_record) + 13992);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-04-basichash.c
+++ b/ccan/ntdb/test/run-04-basichash.c
@@ -0,0 +1,321 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+/* We rig the hash so all records clash. */
			
 
				+static uint32_t clash(const void *key, size_t len, uint32_t seed, void *priv)
			
 
				+{
			
 
				+	/* Only the leading unsigned int of the key is read; len, seed and
			
 
				+	 * priv are ignored.  The shift leaves the low 20 bits zero. */
			
 
				+	const unsigned int *first_word = key;
			
 
				+
			
 
				+	return *first_word << 20;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	/*
			
 
				+	 * Drives find_and_lock(), add_to_hash() and delete_from_hash()
			
 
				+	 * directly, once per entry in flags[], with clash() installed as
			
 
				+	 * the hash function.  key and dbuf both point at v, so assigning
			
 
				+	 * v selects which record the next call operates on.
			
 
				+	 */
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	unsigned int v;
			
 
				+	struct ntdb_used_record rec;
			
 
				+	NTDB_DATA key = { (unsigned char *)&v, sizeof(v) };
			
 
				+	NTDB_DATA dbuf = { (unsigned char *)&v, sizeof(v) };
			
 
				+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+						.fn = clash } };
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT,
			
 
				+	};
			
 
				+
			
 
				+	/* Chain the tap logging attribute behind the hash attribute. */
			
 
				+	hattr.base.next = &tap_log_attr;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 137 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		struct hash_info h;
			
 
				+		ntdb_off_t new_off, new_off2, off;
			
 
				+
			
 
				+		ntdb = ntdb_open("run-04-basichash.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		v = 0;
			
 
				+		/* Should not find it. */
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in top table, bucket 0. */
			
 
				+		ok1(h.table == NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == (1 << ntdb->hash_bits));
			
 
				+		ok1(h.bucket == 0);
			
 
				+		ok1(h.old_val == 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1(h.h == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		/* FIXME: Check lock length */
			
 
				+
			
 
				+		/* Allocate a new record. */
			
 
				+		new_off = alloc(ntdb, key.dsize, dbuf.dsize,
			
 
				+				NTDB_USED_MAGIC, false);
			
 
				+		ok1(!NTDB_OFF_IS_ERR(new_off));
			
 
				+
			
 
				+		/* We should be able to add it now. */
			
 
				+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
			
 
				+
			
 
				+		/* Make sure we fill it in for later finding. */
			
 
				+		off = new_off + sizeof(struct ntdb_used_record);
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
			
 
				+		off += key.dsize;
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
			
 
				+
			
 
				+		/* We should be able to unlock that OK. */
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Database should be consistent. */
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Now, this should give a successful lookup. */
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located it in top table, bucket 0. */
			
 
				+		ok1(h.table == NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == (1 << ntdb->hash_bits));
			
 
				+		ok1(h.bucket == 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1(h.h == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		/* FIXME: Check lock length */
			
 
				+
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Database should be consistent. */
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Test expansion. */
			
 
				+		v = 1;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located clash in toplevel bucket 0. */
			
 
				+		ok1(h.table == NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == (1 << ntdb->hash_bits));
			
 
				+		ok1(h.bucket == 0);
			
 
				+		ok1((h.old_val & NTDB_OFF_MASK) == new_off);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		/* FIXME: Check lock length */
			
 
				+
			
 
				+		new_off2 = alloc(ntdb, key.dsize, dbuf.dsize,
			
 
				+				 NTDB_USED_MAGIC, false);
			
 
				+		ok1(!NTDB_OFF_IS_ERR(new_off2));
			
 
				+
			
 
				+		off = new_off2 + sizeof(struct ntdb_used_record);
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
			
 
				+		off += key.dsize;
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
			
 
				+
			
 
				+		/* We should be able to add it now. */
			
 
				+		ok1(add_to_hash(ntdb, &h, new_off2) == 0);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Should be happy with expansion. */
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Should be able to find both. */
			
 
				+		v = 1;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off2);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in chain. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 1);
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		v = 0;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in chain. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		/* FIXME: Check lock length */
			
 
				+
			
 
				+		/* Simple delete should work. */
			
 
				+		ok1(delete_from_hash(ntdb, &h) == 0);
			
 
				+		ok1(add_free_record(ntdb, new_off,
			
 
				+				    sizeof(struct ntdb_used_record)
			
 
				+				    + rec_key_length(&rec)
			
 
				+				    + rec_data_length(&rec)
			
 
				+				    + rec_extra_padding(&rec),
			
 
				+				    NTDB_LOCK_NOWAIT, false) == 0);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Should still be able to find other record. */
			
 
				+		v = 1;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off2);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in chain. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 1);
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Now should find empty space. */
			
 
				+		v = 0;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in chain, bucket 0. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 0);
			
 
				+		ok1(h.old_val == 0);
			
 
				+		/* NOTE(review): this lookup is not followed by
			
 
				+		 * ntdb_unlock_hash() before the next find_and_lock();
			
 
				+		 * confirm that is intentional. */
			
 
				+
			
 
				+		/* Adding another record should work. */
			
 
				+		v = 2;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have located space in chain, bucket 0. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 0);
			
 
				+		ok1(h.old_val == 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+
			
 
				+		new_off = alloc(ntdb, key.dsize, dbuf.dsize,
			
 
				+				NTDB_USED_MAGIC, false);
			
 
				+		ok1(!NTDB_OFF_IS_ERR(new_off));
			
 
				+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		off = new_off + sizeof(struct ntdb_used_record);
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
			
 
				+		off += key.dsize;
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
			
 
				+
			
 
				+		/* Adding another record should cause expansion. */
			
 
				+		v = 3;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should not have located space in chain. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 2);
			
 
				+		ok1(h.bucket == 2);
			
 
				+		ok1(h.old_val != 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+
			
 
				+		new_off = alloc(ntdb, key.dsize, dbuf.dsize,
			
 
				+				NTDB_USED_MAGIC, false);
			
 
				+		ok1(!NTDB_OFF_IS_ERR(new_off));
			
 
				+		off = new_off + sizeof(struct ntdb_used_record);
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
			
 
				+		off += key.dsize;
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
			
 
				+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Retrieve it and check. */
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have appended to chain, bucket 2. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 3);
			
 
				+		ok1(h.bucket == 2);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* YA record: relocation. */
			
 
				+		v = 4;
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == 0);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should not have located space in chain. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 3);
			
 
				+		ok1(h.bucket == 3);
			
 
				+		ok1(h.old_val != 0);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+
			
 
				+		new_off = alloc(ntdb, key.dsize, dbuf.dsize,
			
 
				+				NTDB_USED_MAGIC, false);
			
 
				+		ok1(!NTDB_OFF_IS_ERR(new_off));
			
 
				+		off = new_off + sizeof(struct ntdb_used_record);
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, key.dptr, key.dsize));
			
 
				+		off += key.dsize;
			
 
				+		ok1(!ntdb->io->twrite(ntdb, off, dbuf.dptr, dbuf.dsize));
			
 
				+		ok1(add_to_hash(ntdb, &h, new_off) == 0);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		/* Retrieve it and check. */
			
 
				+		ok1(find_and_lock(ntdb, key, F_WRLCK, &h, &rec, NULL) == new_off);
			
 
				+		/* Should have created correct hash. */
			
 
				+		ok1(h.h == ntdb_hash(ntdb, key.dptr, key.dsize));
			
 
				+		/* Should have appended to chain, bucket 3. */
			
 
				+		ok1(h.table > NTDB_HASH_OFFSET);
			
 
				+		ok1(h.table_size == 4);
			
 
				+		ok1(h.bucket == 3);
			
 
				+
			
 
				+		/* Should have lock on bucket 0 */
			
 
				+		ok1((h.h & ((1 << ntdb->hash_bits)-1)) == 0);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK) || ntdb->file->num_lockrecs == 1);
			
 
				+		ok1((ntdb->flags & NTDB_NOLOCK)
			
 
				+		    || ntdb->file->lockrecs[0].off == NTDB_HASH_LOCK_START);
			
 
				+		ok1(ntdb_unlock_hash(ntdb, h.h, F_WRLCK) == 0);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	/* The +1 in plan_tests(): nothing should have been logged. */
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-05-readonly-open.c
+++ b/ccan/ntdb/test/run-05-readonly-open.c
@@ -0,0 +1,79 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4), d;
			
 
				+	union ntdb_attribute seed_attr;
			
 
				+	unsigned int msgs = 0; /* log messages expected so far */
			
 
				+	/* Creates a database holding one record, reopens it O_RDONLY, and checks that fetch works while stores return NTDB_ERR_RDONLY. */
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	/* Seed attribute (seed 0) chained in front of tap_log_attr; passed to the creating open only. */
			
 
				+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+	seed_attr.base.next = &tap_log_attr;
			
 
				+	seed_attr.seed.seed = 0;
			
 
				+	/* 11 ok1() checks are planned per flag combination. */
			
 
				+	failtest_suppress = true;
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 11);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-05-readonly-open.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600,
			
 
				+				 &seed_attr);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+		/* Reopen the same file read-only, with failtest_suppress cleared. */
			
 
				+		failtest_suppress = false;
			
 
				+		ntdb = ntdb_open("run-05-readonly-open.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDONLY, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			break;
			
 
				+		ok1(tap_log_messages == msgs);
			
 
				+		/* Fetch should succeed, stores should fail. */
			
 
				+		if (!ok1(ntdb_fetch(ntdb, key, &d) == 0))
			
 
				+			goto fail;
			
 
				+		ok1(ntdb_deq(d, data));
			
 
				+		free(d.dptr);
			
 
				+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY)
			
 
				+			 == NTDB_ERR_RDONLY))
			
 
				+			goto fail;
			
 
				+		ok1(tap_log_messages == ++msgs); /* the rejected store is expected to log exactly one message */
			
 
				+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT)
			
 
				+			 == NTDB_ERR_RDONLY))
			
 
				+			goto fail;
			
 
				+		ok1(tap_log_messages == ++msgs); /* and one more for the second rejected store */
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+		ok1(tap_log_messages == msgs); /* check and close must not log anything further */
			
 
				+		/* SIGH: failtest bug, it doesn't save the ntdb file because
			
 
				+		 * we have it read-only.  If we go around again, it gets
			
 
				+		 * changed underneath us and things get screwy. */
			
 
				+		if (failtest_has_failed())
			
 
				+			break;
			
 
				+	}
			
 
				+	failtest_exit(exit_status());
			
 
				+	/* Reached by goto when a fetch or store check fails with the database still open. */
			
 
				+fail:
			
 
				+	failtest_suppress = true;
			
 
				+	ntdb_close(ntdb);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-10-simple-store.c
+++ b/ccan/ntdb/test/run-10-simple-store.c
@@ -0,0 +1,65 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+	/* Checks ntdb_store() on a fresh database: NTDB_MODIFY gives NTDB_ERR_NOEXIST, NTDB_INSERT succeeds, a repeated NTDB_INSERT gives NTDB_ERR_EXISTS. */
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	/* failtest_suppress is cleared only around each ntdb_store() call; 7 checks per flag combination plus the final log check. */
			
 
				+	failtest_suppress = true;
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-10-simple-store.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			break;
			
 
				+		/* Modify should fail. */
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_MODIFY)
			
 
				+			 == NTDB_ERR_NOEXIST))
			
 
				+			goto fail;
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		/* Insert should succeed. */
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0))
			
 
				+			goto fail;
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		/* Second insert should fail. */
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_store(ntdb, key, data, NTDB_INSERT)
			
 
				+			 == NTDB_ERR_EXISTS))
			
 
				+			goto fail;
			
 
				+		failtest_suppress = true;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	failtest_exit(exit_status());
			
 
				+	/* Reached by goto when a store check fails with the database still open. */
			
 
				+fail:
			
 
				+	failtest_suppress = true;
			
 
				+	ntdb_close(ntdb);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-11-simple-fetch.c
+++ b/ccan/ntdb/test/run-11-simple-fetch.c
@@ -0,0 +1,65 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+	/* Checks ntdb_fetch(): NTDB_ERR_NOEXIST on a fresh database, then NTDB_SUCCESS with matching data once the record is inserted. */
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	/* failtest_suppress is cleared only around the two ntdb_fetch() calls; 8 checks per flag combination plus the final log check. */
			
 
				+	failtest_suppress = true;
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-11-simple-fetch.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (ntdb) {
			
 
				+			NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
			
 
				+
			
 
				+			/* fetch should fail. */
			
 
				+			failtest_suppress = false;
			
 
				+			if (!ok1(ntdb_fetch(ntdb, key, &d) == NTDB_ERR_NOEXIST))
			
 
				+				goto fail;
			
 
				+			failtest_suppress = true;
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			/* Insert should succeed. */
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			/* Fetch should now work. */
			
 
				+			failtest_suppress = false;
			
 
				+			if (!ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS))
			
 
				+				goto fail;
			
 
				+			failtest_suppress = true;
			
 
				+			ok1(ntdb_deq(d, data));
			
 
				+			free(d.dptr); /* release the buffer filled in by the fetch above */
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			ntdb_close(ntdb);
			
 
				+		}
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	failtest_exit(exit_status());
			
 
				+	/* Reached by goto when a fetch check fails with the database still open. */
			
 
				+fail:
			
 
				+	failtest_suppress = true;
			
 
				+	ntdb_close(ntdb);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-12-check.c
+++ b/ccan/ntdb/test/run-12-check.c
@@ -0,0 +1,52 @@
 
				+#include "private.h"
			
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	/*
			
 
				+	 * Stores one record per flag combination and then runs
			
 
				+	 * ntdb_check() with failtest_suppress cleared, so that
			
 
				+	 * ntdb_check() is the only call made while it is off.
			
 
				+	 */
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT,
			
 
				+			NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+
			
 
				+	failtest_suppress = true;
			
 
				+	/* 3 checks per flag combination plus the final log check. */
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-12-check.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		/* Do not hand a NULL context to ntdb_store(). */
			
 
				+		if (!ok1(ntdb))
			
 
				+			break;
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+
			
 
				+		/* This is what we really want to test: ntdb_check(). */
			
 
				+		failtest_suppress = false;
			
 
				+		if (!ok1(ntdb_check(ntdb, NULL, NULL) == 0))
			
 
				+			goto fail;
			
 
				+		failtest_suppress = true;
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/* Reached by goto when ntdb_check() fails with the database open. */
			
 
				+fail:
			
 
				+	failtest_suppress = true;
			
 
				+	ntdb_close(ntdb);
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-15-append.c
+++ b/ccan/ntdb/test/run-15-append.c
@@ -0,0 +1,130 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/ilog/ilog.h>
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define MAX_SIZE 13100
			
 
				+#define SIZE_STEP 131
			
 
				+
			
 
				+static ntdb_off_t ntdb_offset(struct ntdb_context *ntdb, NTDB_DATA key)
			
 
				+{
			
 
				+	/* Returns what find_and_lock() reports for key, releasing the
			
 
				+	 * read lock first; an error offset is turned into 0. */
			
 
				+	struct hash_info hinfo;
			
 
				+	struct ntdb_used_record unused_rec;
			
 
				+	ntdb_off_t found;
			
 
				+
			
 
				+	found = find_and_lock(ntdb, key, F_RDLCK, &hinfo, &unused_rec, NULL);
			
 
				+	if (!NTDB_OFF_IS_ERR(found)) {
			
 
				+		ntdb_unlock_hash(ntdb, hinfo.h, F_RDLCK);
			
 
				+		return found;
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	/*
			
 
				+	 * Grows a single record in SIZE_STEP increments, first with
			
 
				+	 * ntdb_store(NTDB_REPLACE) and then with ntdb_append(), counting
			
 
				+	 * how often the offset reported by ntdb_offset() changes.  A final
			
 
				+	 * pass appends MAX_SIZE bytes to an empty database in one call.
			
 
				+	 */
			
 
				+	unsigned int i, j, moves;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	unsigned char *buffer;
			
 
				+	ntdb_off_t oldoff = 0, newoff;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data;
			
 
				+
			
 
				+	/* Reference pattern that every fetched record is compared with. */
			
 
				+	buffer = malloc(MAX_SIZE);
			
 
				+	if (!buffer)
			
 
				+		return 1;
			
 
				+	for (i = 0; i < MAX_SIZE; i++)
			
 
				+		buffer[i] = i;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0])
			
 
				+		   * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
			
 
				+		   + 1);
			
 
				+
			
 
				+	/* Using ntdb_store. */
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-append.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* NOTE(review): oldoff carries over from the previous
			
 
				+		 * database instead of being reset here; confirm that is
			
 
				+		 * intended. */
			
 
				+		moves = 0;
			
 
				+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
			
 
				+			data.dptr = buffer;
			
 
				+			data.dsize = j;
			
 
				+			ok1(ntdb_store(ntdb, key, data, NTDB_REPLACE) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+			ok1(data.dsize == j);
			
 
				+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+			free(data.dptr);
			
 
				+			newoff = ntdb_offset(ntdb, key);
			
 
				+			if (newoff != oldoff)
			
 
				+				moves++;
			
 
				+			oldoff = newoff;
			
 
				+		}
			
 
				+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
			
 
				+				   && ntdb->file->num_lockrecs == 0));
			
 
				+		/* We should increase by 50% each time... */
			
 
				+		ok(moves <= ilog64(j / SIZE_STEP)*2,
			
 
				+		   "Moved %u times", moves);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	/* Using ntdb_append. */
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		size_t prev_len = 0;
			
 
				+		ntdb = ntdb_open("run-append.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		moves = 0;
			
 
				+		for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
			
 
				+			/* Append only the bytes beyond what is stored. */
			
 
				+			data.dptr = buffer + prev_len;
			
 
				+			data.dsize = j - prev_len;
			
 
				+			ok1(ntdb_append(ntdb, key, data) == 0);
			
 
				+			ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+			ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+			ok1(data.dsize == j);
			
 
				+			ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+			free(data.dptr);
			
 
				+			prev_len = data.dsize;
			
 
				+			newoff = ntdb_offset(ntdb, key);
			
 
				+			if (newoff != oldoff)
			
 
				+				moves++;
			
 
				+			oldoff = newoff;
			
 
				+		}
			
 
				+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
			
 
				+				   && ntdb->file->num_lockrecs == 0));
			
 
				+		/* We should increase by 50% each time... */
			
 
				+		ok(moves <= ilog64(j / SIZE_STEP)*2,
			
 
				+		   "Moved %u times", moves);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-append.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Huge initial store. */
			
 
				+		data.dptr = buffer;
			
 
				+		data.dsize = MAX_SIZE;
			
 
				+		ok1(ntdb_append(ntdb, key, data) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS);
			
 
				+		ok1(data.dsize == MAX_SIZE);
			
 
				+		ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
			
 
				+		free(data.dptr);
			
 
				+		ok1(!ntdb->file || (ntdb->file->allrecord_lock.count == 0
			
 
				+				   && ntdb->file->num_lockrecs == 0));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	free(buffer);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-25-hashoverload.c
+++ b/ccan/ntdb/test/run-25-hashoverload.c
@@ -0,0 +1,93 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define OVERLOAD 100
			
 
				+
			
 
				/*
				 * Degenerate hash callback: every argument is ignored and the result is
				 * always 0, so all keys hash to the same value.
				 */
				static uint32_t badhash(const void *key, size_t len, uint32_t seed, void *priv)
				{
					(void)key;
					(void)len;
					(void)seed;
					(void)priv;

					return 0;
				}
			
 
				+
			
 
				+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf, void *p)
			
 
				+{
			
 
				+	if (p)
			
 
				+		return ntdb_delete(ntdb, key);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
			
 
				+	NTDB_DATA dbuf = { (unsigned char *)&j, sizeof(j) };
			
 
				+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+						.fn = badhash } };
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT,
			
 
				+	};
			
 
				+
			
 
				+	hattr.base.next = &tap_log_attr;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * (7 * OVERLOAD + 11) + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		NTDB_DATA d = { NULL, 0 }; /* Bogus GCC warning */
			
 
				+
			
 
				+		ntdb = ntdb_open("run-25-hashoverload.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Overload a bucket. */
			
 
				+		for (j = 0; j < OVERLOAD; j++) {
			
 
				+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
			
 
				+		}
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Check we can find them all. */
			
 
				+		for (j = 0; j < OVERLOAD; j++) {
			
 
				+			ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+			ok1(d.dsize == sizeof(j));
			
 
				+			ok1(d.dptr != NULL);
			
 
				+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
			
 
				+			free(d.dptr);
			
 
				+		}
			
 
				+
			
 
				+		/* Traverse through them. */
			
 
				+		ok1(ntdb_traverse(ntdb, trav, NULL) == OVERLOAD);
			
 
				+
			
 
				+		/* Delete the first 99. */
			
 
				+		for (j = 0; j < OVERLOAD-1; j++)
			
 
				+			ok1(ntdb_delete(ntdb, key) == 0);
			
 
				+
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		ok1(ntdb_fetch(ntdb, key, &d) == NTDB_SUCCESS);
			
 
				+		ok1(d.dsize == sizeof(j));
			
 
				+		ok1(d.dptr != NULL);
			
 
				+		ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
			
 
				+		free(d.dptr);
			
 
				+
			
 
				+		/* Traverse through them. */
			
 
				+		ok1(ntdb_traverse(ntdb, trav, NULL) == 1);
			
 
				+
			
 
				+		/* Re-add */
			
 
				+		for (j = 0; j < OVERLOAD-1; j++) {
			
 
				+			ok1(ntdb_store(ntdb, key, dbuf, NTDB_INSERT) == 0);
			
 
				+		}
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Now try deleting as we go. */
			
 
				+		ok1(ntdb_traverse(ntdb, trav, trav) == OVERLOAD);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb_traverse(ntdb, trav, NULL) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-30-exhaust-before-expand.c
+++ b/ccan/ntdb/test/run-30-exhaust-before-expand.c
@@ -0,0 +1,76 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool empty_freetable(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	struct ntdb_freetable ftab;
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	/* Now, free table should be completely exhausted in zone 0 */
			
 
				+	if (ntdb_read_convert(ntdb, ntdb->ftable_off, &ftab, sizeof(ftab)) != 0)
			
 
				+		abort();
			
 
				+
			
 
				+	for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) {
			
 
				+		if (ftab.buckets[i])
			
 
				+			return false;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		NTDB_DATA k, d;
			
 
				+		uint64_t size;
			
 
				+		bool was_empty = false;
			
 
				+
			
 
				+		k.dptr = (void *)&j;
			
 
				+		k.dsize = sizeof(j);
			
 
				+
			
 
				+		ntdb = ntdb_open("run-30-exhaust-before-expand.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		/* There's one empty record in initial db. */
			
 
				+		ok1(!empty_freetable(ntdb));
			
 
				+
			
 
				+		size = ntdb->file->map_size;
			
 
				+
			
 
				+		/* Create one record to chew up most space. */
			
 
				+		d.dsize = size - NEW_DATABASE_HDR_SIZE(ntdb->hash_bits) - 32;
			
 
				+		d.dptr = calloc(d.dsize, 1);
			
 
				+		j = 0;
			
 
				+		ok1(ntdb_store(ntdb, k, d, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb->file->map_size == size);
			
 
				+		free(d.dptr);
			
 
				+
			
 
				+		/* Now insert minimal-length records until we expand. */
			
 
				+		for (j = 1; ntdb->file->map_size == size; j++) {
			
 
				+			was_empty = empty_freetable(ntdb);
			
 
				+			if (ntdb_store(ntdb, k, k, NTDB_INSERT) != 0)
			
 
				+				err(1, "Failed to store record %i", j);
			
 
				+		}
			
 
				+
			
 
				+		/* Would have been empty before expansion, but no longer. */
			
 
				+		ok1(was_empty);
			
 
				+		ok1(!empty_freetable(ntdb));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-35-convert.c
+++ b/ccan/ntdb/test/run-35-convert.c
@@ -0,0 +1,64 @@
 
				+#include "private.h"
			
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <ccan/failtest/failtest.h>
			
 
				+#include "logging.h"
			
 
				+#include "failtest_helper.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, messages = 0;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-35-convert.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		if (!ok1(ntdb))
			
 
				+			failtest_exit(exit_status());
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+		/* We can fail in log message formatting or open.  That's OK */
			
 
				+		if (failtest_has_failed()) {
			
 
				+			failtest_exit(exit_status());
			
 
				+		}
			
 
				+		/* If we say NTDB_CONVERT, it must be converted */
			
 
				+		ntdb = ntdb_open("run-35-convert.ntdb",
			
 
				+				 flags[i]|NTDB_CONVERT|MAYBE_NOSYNC,
			
 
				+				 O_RDWR, 0600, &tap_log_attr);
			
 
				+		if (flags[i] & NTDB_CONVERT) {
			
 
				+			if (!ntdb)
			
 
				+				failtest_exit(exit_status());
			
 
				+			ok1(ntdb_get_flags(ntdb) & NTDB_CONVERT);
			
 
				+			ntdb_close(ntdb);
			
 
				+		} else {
			
 
				+			if (!ok1(!ntdb && errno == EIO))
			
 
				+				failtest_exit(exit_status());
			
 
				+			ok1(tap_log_messages == ++messages);
			
 
				+			if (!ok1(log_last && strstr(log_last, "NTDB_CONVERT")))
			
 
				+				failtest_exit(exit_status());
			
 
				+		}
			
 
				+
			
 
				+		/* If don't say NTDB_CONVERT, it *may* be converted */
			
 
				+		ntdb = ntdb_open("run-35-convert.ntdb",
			
 
				+				 (flags[i] & ~NTDB_CONVERT)|MAYBE_NOSYNC,
			
 
				+				 O_RDWR, 0600, &tap_log_attr);
			
 
				+		if (!ntdb)
			
 
				+			failtest_exit(exit_status());
			
 
				+		ok1(ntdb_get_flags(ntdb) == (flags[i]|MAYBE_NOSYNC));
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-50-multiple-freelists.c
+++ b/ccan/ntdb/test/run-50-multiple-freelists.c
@@ -0,0 +1,70 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+#include "layout.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	ntdb_off_t off;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	struct ntdb_layout *layout;
			
 
				+	NTDB_DATA key, data;
			
 
				+	union ntdb_attribute seed;
			
 
				+
			
 
				+	/* This seed value previously tickled a layout.c bug. */
			
 
				+	seed.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+	seed.seed.seed = 0xb1142bc054d035b4ULL;
			
 
				+	seed.base.next = &tap_log_attr;
			
 
				+
			
 
				+	plan_tests(11);
			
 
				+	key = ntdb_mkdata("Hello", 5);
			
 
				+	data = ntdb_mkdata("world", 5);
			
 
				+
			
 
				+	/* Create a NTDB with three free tables. */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_free(layout, 80, 0);
			
 
				+	/* Used record prevent coalescing. */
			
 
				+	ntdb_layout_add_used(layout, key, data, 6);
			
 
				+	ntdb_layout_add_free(layout, 160, 1);
			
 
				+	key.dsize--;
			
 
				+	ntdb_layout_add_used(layout, key, data, 7);
			
 
				+	ntdb_layout_add_free(layout, 320, 2);
			
 
				+	key.dsize--;
			
 
				+	ntdb_layout_add_used(layout, key, data, 8);
			
 
				+	ntdb_layout_add_free(layout, 40, 0);
			
 
				+	ntdb = ntdb_layout_get(layout, free, &seed);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+	off = get_free(ntdb, 0, 80 - sizeof(struct ntdb_used_record), 0,
			
 
				+		       NTDB_USED_MAGIC);
			
 
				+	ok1(off == layout->elem[3].base.off);
			
 
				+	ok1(ntdb->ftable_off == layout->elem[0].base.off);
			
 
				+
			
 
				+	off = get_free(ntdb, 0, 160 - sizeof(struct ntdb_used_record), 0,
			
 
				+		       NTDB_USED_MAGIC);
			
 
				+	ok1(off == layout->elem[5].base.off);
			
 
				+	ok1(ntdb->ftable_off == layout->elem[1].base.off);
			
 
				+
			
 
				+	off = get_free(ntdb, 0, 320 - sizeof(struct ntdb_used_record), 0,
			
 
				+		       NTDB_USED_MAGIC);
			
 
				+	ok1(off == layout->elem[7].base.off);
			
 
				+	ok1(ntdb->ftable_off == layout->elem[2].base.off);
			
 
				+
			
 
				+	off = get_free(ntdb, 0, 40 - sizeof(struct ntdb_used_record), 0,
			
 
				+		       NTDB_USED_MAGIC);
			
 
				+	ok1(off == layout->elem[9].base.off);
			
 
				+	ok1(ntdb->ftable_off == layout->elem[0].base.off);
			
 
				+
			
 
				+	/* Now we fail. */
			
 
				+	off = get_free(ntdb, 0, 0, 1, NTDB_USED_MAGIC);
			
 
				+	ok1(off == 0);
			
 
				+
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-56-open-during-transaction.c
+++ b/ccan/ntdb/test/run-56-open-during-transaction.c
@@ -0,0 +1,165 @@
 
				+#include "private.h"
			
 
				+#include <unistd.h>
			
 
				+#include "lock-tracking.h"
			
 
				+
			
 
				+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
			
 
				+static ssize_t write_check(int fd, const void *buf, size_t count);
			
 
				+static int ftruncate_check(int fd, off_t length);
			
 
				+
			
 
				+#define pwrite pwrite_check
			
 
				+#define write write_check
			
 
				+#define fcntl fcntl_with_lockcheck
			
 
				+#define ftruncate ftruncate_check
			
 
				+
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include <stdbool.h>
			
 
				+#include <stdarg.h>
			
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static struct agent *agent;
			
 
				+static bool opened;
			
 
				+static int errors = 0;
			
 
				+#define TEST_DBNAME "run-56-open-during-transaction.ntdb"
			
 
				+
			
 
				+#undef write
			
 
				+#undef pwrite
			
 
				+#undef fcntl
			
 
				+#undef ftruncate
			
 
				+
			
 
				/*
				 * Compare the first len bytes of snapshot and latest.
				 *
				 * Returns true when the two ranges are byte-for-byte identical; a length
				 * of zero or less compares nothing and is reported as identical.
				 */
				static bool is_same(const char *snapshot, const char *latest, off_t len)
				{
					/* Nothing to compare; also keeps a negative off_t away from size_t. */
					if (len <= 0)
						return true;

					/* memcmp counts in size_t, so lengths above UINT_MAX are handled too. */
					return memcmp(snapshot, latest, (size_t)len) == 0;
				}
			
 
				+
			
 
				/*
				 * Check that the file behind fd still matches snapshot.
				 *
				 * One byte more than snapshot_len is requested from offset 0, so the
				 * read returns exactly snapshot_len only if the file has not grown or
				 * shrunk; the bytes read are then compared with is_same().
				 */
				static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
				{
					const off_t want = snapshot_len + 1;
					char *current = malloc(want);
					bool unchanged = false;

					if (pread(fd, current, want, 0) == snapshot_len)
						unchanged = is_same(snapshot, current, snapshot_len);

					free(current);
					return unchanged;
				}
			
 
				+
			
 
				+static void check_file_intact(int fd)
			
 
				+{
			
 
				+	enum agent_return ret;
			
 
				+	struct stat st;
			
 
				+	char *contents;
			
 
				+
			
 
				+	fstat(fd, &st);
			
 
				+	contents = malloc(st.st_size);
			
 
				+	if (pread(fd, contents, st.st_size, 0) != st.st_size) {
			
 
				+		diag("Read fail");
			
 
				+		errors++;
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/* Ask agent to open file. */
			
 
				+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
			
 
				+
			
 
				+	/* It's OK to open it, but it must not have changed! */
			
 
				+	if (!compare_file(fd, contents, st.st_size)) {
			
 
				+		diag("Agent changed file after opening %s",
			
 
				+		     agent_return_name(ret));
			
 
				+		errors++;
			
 
				+	}
			
 
				+
			
 
				+	if (ret == SUCCESS) {
			
 
				+		ret = external_agent_operation(agent, CLOSE, NULL);
			
 
				+		if (ret != SUCCESS) {
			
 
				+			diag("Agent failed to close ntdb: %s",
			
 
				+			     agent_return_name(ret));
			
 
				+			errors++;
			
 
				+		}
			
 
				+	} else if (ret != WOULD_HAVE_BLOCKED) {
			
 
				+		diag("Agent opening file gave %s",
			
 
				+		     agent_return_name(ret));
			
 
				+		errors++;
			
 
				+	}
			
 
				+
			
 
				+	free(contents);
			
 
				+}
			
 
				+
			
 
				+static void after_unlock(int fd)
			
 
				+{
			
 
				+	if (opened)
			
 
				+		check_file_intact(fd);
			
 
				+}
			
 
				+
			
 
				+static ssize_t pwrite_check(int fd,
			
 
				+			    const void *buf, size_t count, off_t offset)
			
 
				+{
			
 
				+	if (opened)
			
 
				+		check_file_intact(fd);
			
 
				+
			
 
				+	return pwrite(fd, buf, count, offset);
			
 
				+}
			
 
				+
			
 
				+static ssize_t write_check(int fd, const void *buf, size_t count)
			
 
				+{
			
 
				+	if (opened)
			
 
				+		check_file_intact(fd);
			
 
				+
			
 
				+	return write(fd, buf, count);
			
 
				+}
			
 
				+
			
 
				+static int ftruncate_check(int fd, off_t length)
			
 
				+{
			
 
				+	if (opened)
			
 
				+		check_file_intact(fd);
			
 
				+
			
 
				+	return ftruncate(fd, length);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	const int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	NTDB_DATA key, data;
			
 
				+
			
 
				+	plan_tests(sizeof(flags)/sizeof(flags[0]) * 5);
			
 
				+	agent = prepare_external_agent();
			
 
				+	if (!agent)
			
 
				+		err(1, "preparing agent");
			
 
				+
			
 
				+	unlock_callback = after_unlock;
			
 
				+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
			
 
				+		diag("Test with %s and %s\n",
			
 
				+		     (flags[i] & NTDB_CONVERT) ? "CONVERT" : "DEFAULT",
			
 
				+		     (flags[i] & NTDB_NOMMAP) ? "no mmap" : "mmap");
			
 
				+		unlink(TEST_DBNAME);
			
 
				+		ntdb = ntdb_open(TEST_DBNAME, flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+
			
 
				+		opened = true;
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		key = ntdb_mkdata("hi", strlen("hi"));
			
 
				+		data = ntdb_mkdata("world", strlen("world"));
			
 
				+
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == 0);
			
 
				+		ok(!errors, "We had %u open errors", errors);
			
 
				+
			
 
				+		opened = false;
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-57-die-during-transaction.c
+++ b/ccan/ntdb/test/run-57-die-during-transaction.c
@@ -0,0 +1,321 @@
 
				+#include "private.h"
			
 
				+#include <unistd.h>
			
 
				+#include "lock-tracking.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include <assert.h>
			
 
				+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
			
 
				+static ssize_t write_check(int fd, const void *buf, size_t count);
			
 
				+static int ftruncate_check(int fd, off_t length);
			
 
				+
			
 
				+#define pwrite pwrite_check
			
 
				+#define write write_check
			
 
				+#define fcntl fcntl_with_lockcheck
			
 
				+#define ftruncate ftruncate_check
			
 
				+
			
 
				+/* There's a malloc inside transaction_setup_recovery, and valgrind complains
			
 
				+ * when we longjmp and leak it. */
			
 
				+#define MAX_ALLOCATIONS 10
			
 
				+static void *allocated[MAX_ALLOCATIONS];
			
 
				+static unsigned max_alloc = 0;
			
 
				+
			
 
				+static void *malloc_noleak(size_t len)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; i < MAX_ALLOCATIONS; i++)
			
 
				+		if (!allocated[i]) {
			
 
				+			allocated[i] = malloc(len);
			
 
				+			if (i > max_alloc) {
			
 
				+				max_alloc = i;
			
 
				+				diag("max_alloc: %i", max_alloc);
			
 
				+			}
			
 
				+			return allocated[i];
			
 
				+		}
			
 
				+	diag("Too many allocations!");
			
 
				+	abort();
			
 
				+}
			
 
				+
			
 
				+static void *realloc_noleak(void *p, size_t size)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; i < MAX_ALLOCATIONS; i++) {
			
 
				+		if (allocated[i] == p) {
			
 
				+			if (i > max_alloc) {
			
 
				+				max_alloc = i;
			
 
				+				diag("max_alloc: %i", max_alloc);
			
 
				+			}
			
 
				+			return allocated[i] = realloc(p, size);
			
 
				+		}
			
 
				+	}
			
 
				+	diag("Untracked realloc!");
			
 
				+	abort();
			
 
				+}
			
 
				+
			
 
				+static void free_noleak(void *p)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	/* We don't catch asprintf, so don't complain if we miss one. */
			
 
				+	for (i = 0; i < MAX_ALLOCATIONS; i++) {
			
 
				+		if (allocated[i] == p) {
			
 
				+			allocated[i] = NULL;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	free(p);
			
 
				+}
			
 
				+
			
 
				+static void free_all(void)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; i < MAX_ALLOCATIONS; i++) {
			
 
				+		free(allocated[i]);
			
 
				+		allocated[i] = NULL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#define malloc malloc_noleak
			
 
				+#define free(x) free_noleak(x)
			
 
				+#define realloc realloc_noleak
			
 
				+
			
 
				+#include "ntdb-source.h"
			
 
				+
			
 
				+#undef malloc
			
 
				+#undef free
			
 
				+#undef realloc
			
 
				+#undef write
			
 
				+#undef pwrite
			
 
				+#undef fcntl
			
 
				+#undef ftruncate
			
 
				+
			
 
				+#include <stdbool.h>
			
 
				+#include <stdarg.h>
			
 
				+#include <ccan/err/err.h>
			
 
				+#include <setjmp.h>
			
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool in_transaction;
			
 
				+static int target, current;
			
 
				+static jmp_buf jmpbuf;
			
 
				+#define TEST_DBNAME "run-57-die-during-transaction.ntdb"
			
 
				+#define KEY_STRING "helloworld"
			
 
				+#define DATA_STRING "Helloworld"
			
 
				+
			
 
				+static void maybe_die(int fd)
			
 
				+{
			
 
				+	if (in_transaction && current++ == target) {
			
 
				+		longjmp(jmpbuf, 1);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				/*
				 * pwrite() wrapper with an interception point before the write and,
				 * only when the full count was written, another one after it.
				 * Returns pwrite()'s result.
				 */
				static ssize_t pwrite_check(int fd,
							    const void *buf, size_t count, off_t offset)
				{
					ssize_t written;

					maybe_die(fd);

					written = pwrite(fd, buf, count, offset);
					/* A short or failed write is handed straight back to the caller. */
					if ((size_t)written == count)
						maybe_die(fd);

					return written;
				}
			
 
				+
			
 
				/*
				 * write() wrapper with an interception point before the write and,
				 * only when the full count was written, another one after it.
				 * Returns write()'s result.
				 */
				static ssize_t write_check(int fd, const void *buf, size_t count)
				{
					ssize_t written;

					maybe_die(fd);

					written = write(fd, buf, count);
					/* A short or failed write is handed straight back to the caller. */
					if ((size_t)written == count)
						maybe_die(fd);

					return written;
				}
			
 
				+
			
 
				/*
				 * ftruncate() wrapper with an interception point on each side of the
				 * call, whatever its outcome.  Returns ftruncate()'s result.
				 */
				static int ftruncate_check(int fd, off_t length)
				{
					int rc;

					maybe_die(fd);
					rc = ftruncate(fd, length);
					maybe_die(fd);

					return rc;
				}
			
 
				+
			
 
				+static bool test_death(enum operation op, struct agent *agent,
			
 
				+		       bool pre_create_recovery)
			
 
				+{
			
 
				+	struct ntdb_context *ntdb = NULL;
			
 
				+	NTDB_DATA key, data;
			
 
				+	enum agent_return ret;
			
 
				+	int needed_recovery = 0;
			
 
				+
			
 
				+	current = target = 0;
			
 
				+	/* Big long data to force a change. */
			
 
				+	data = ntdb_mkdata(DATA_STRING, strlen(DATA_STRING));
			
 
				+
			
 
				+reset:
			
 
				+	unlink(TEST_DBNAME);
			
 
				+	ntdb = ntdb_open(TEST_DBNAME, NTDB_NOMMAP|MAYBE_NOSYNC,
			
 
				+			 O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
			
 
				+	if (!ntdb) {
			
 
				+		diag("Failed opening NTDB: %s", strerror(errno));
			
 
				+		return false;
			
 
				+	}
			
 
				+
			
 
				+	if (setjmp(jmpbuf) != 0) {
			
 
				+		/* We're partway through.  Simulate our death. */
			
 
				+		close(ntdb->file->fd);
			
 
				+		forget_locking();
			
 
				+		in_transaction = false;
			
 
				+
			
 
				+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
			
 
				+		if (ret == SUCCESS)
			
 
				+			needed_recovery++;
			
 
				+		else if (ret != FAILED) {
			
 
				+			diag("Step %u agent NEEDS_RECOVERY = %s", current,
			
 
				+			     agent_return_name(ret));
			
 
				+			return false;
			
 
				+		}
			
 
				+
			
 
				+		/* Could be key, or data. */
			
 
				+		ret = external_agent_operation(agent, op,
			
 
				+					       KEY_STRING "=" KEY_STRING);
			
 
				+		if (ret != SUCCESS) {
			
 
				+			ret = external_agent_operation(agent, op,
			
 
				+						       KEY_STRING
			
 
				+						       "=" DATA_STRING);
			
 
				+		}
			
 
				+		if (ret != SUCCESS) {
			
 
				+			diag("Step %u op %s failed = %s", current,
			
 
				+			     operation_name(op),
			
 
				+			     agent_return_name(ret));
			
 
				+			return false;
			
 
				+		}
			
 
				+
			
 
				+		ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
			
 
				+		if (ret != FAILED) {
			
 
				+			diag("Still needs recovery after step %u = %s",
			
 
				+			     current, agent_return_name(ret));
			
 
				+			return false;
			
 
				+		}
			
 
				+
			
 
				+		ret = external_agent_operation(agent, CHECK, "");
			
 
				+		if (ret != SUCCESS) {
			
 
				+			diag("Step %u check failed = %s", current,
			
 
				+			     agent_return_name(ret));
			
 
				+			return false;
			
 
				+		}
			
 
				+
			
 
				+		ret = external_agent_operation(agent, CLOSE, "");
			
 
				+		if (ret != SUCCESS) {
			
 
				+			diag("Step %u close failed = %s", current,
			
 
				+			     agent_return_name(ret));
			
 
				+			return false;
			
 
				+		}
			
 
				+
			
 
				+		/* Suppress logging as this tries to use closed fd. */
			
 
				+		suppress_logging = true;
			
 
				+		suppress_lockcheck = true;
			
 
				+		ntdb_close(ntdb);
			
 
				+		suppress_logging = false;
			
 
				+		suppress_lockcheck = false;
			
 
				+		target++;
			
 
				+		current = 0;
			
 
				+		free_all();
			
 
				+		goto reset;
			
 
				+	}
			
 
				+
			
 
				+	/* Put key for agent to fetch. */
			
 
				+	key = ntdb_mkdata(KEY_STRING, strlen(KEY_STRING));
			
 
				+
			
 
				+	if (pre_create_recovery) {
			
 
				+		/* Using a transaction now means we allocate the recovery
			
 
				+		 * area immediately.  That makes the later transaction smaller
			
 
				+		 * and thus tickles a bug we had. */
			
 
				+		if (ntdb_transaction_start(ntdb) != 0)
			
 
				+			return false;
			
 
				+	}
			
 
				+	if (ntdb_store(ntdb, key, key, NTDB_INSERT) != 0)
			
 
				+		return false;
			
 
				+	if (pre_create_recovery) {
			
 
				+		if (ntdb_transaction_commit(ntdb) != 0)
			
 
				+			return false;
			
 
				+	}
			
 
				+
			
 
				+	/* This is the key we insert in transaction. */
			
 
				+	key.dsize--;
			
 
				+
			
 
				+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
			
 
				+	if (ret != SUCCESS)
			
 
				+		errx(1, "Agent failed to open: %s", agent_return_name(ret));
			
 
				+
			
 
				+	ret = external_agent_operation(agent, FETCH, KEY_STRING "=" KEY_STRING);
			
 
				+	if (ret != SUCCESS)
			
 
				+		errx(1, "Agent failed find key: %s", agent_return_name(ret));
			
 
				+
			
 
				+	in_transaction = true;
			
 
				+	if (ntdb_transaction_start(ntdb) != 0)
			
 
				+		return false;
			
 
				+
			
 
				+	if (ntdb_store(ntdb, key, data, NTDB_INSERT) != 0)
			
 
				+		return false;
			
 
				+
			
 
				+	if (ntdb_transaction_commit(ntdb) != 0)
			
 
				+		return false;
			
 
				+
			
 
				+	in_transaction = false;
			
 
				+
			
 
				+	/* We made it! */
			
 
				+	diag("Completed %u runs", current);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ret = external_agent_operation(agent, CLOSE, "");
			
 
				+	if (ret != SUCCESS) {
			
 
				+		diag("Step %u close failed = %s", current,
			
 
				+		     agent_return_name(ret));
			
 
				+		return false;
			
 
				+	}
			
 
				+
			
 
				+	ok1(needed_recovery);
			
 
				+	ok1(locking_errors == 0);
			
 
				+	ok1(forget_locking() == 0);
			
 
				+	locking_errors = 0;
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
			
 
				+	struct agent *agent;
			
 
				+	int i, j;
			
 
				+
			
 
				+	plan_tests(24);
			
 
				+	unlock_callback = maybe_die;
			
 
				+
			
 
				+	external_agent_free = free_noleak;
			
 
				+	agent = prepare_external_agent();
			
 
				+	if (!agent)
			
 
				+		err(1, "preparing agent");
			
 
				+
			
 
				+	for (j = 0; j < 2; j++) {
			
 
				+		for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) {
			
 
				+			diag("Testing %s after death (%s recovery area)",
			
 
				+			     operation_name(ops[i]), j ? "with" : "without");
			
 
				+			ok1(test_death(ops[i], agent, j));
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	free_external_agent(agent);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-64-bit-tdb.c
+++ b/ccan/ntdb/test/run-64-bit-tdb.c
@@ -0,0 +1,88 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+/* The largest 32-bit value which is still a multiple of NTDB_PGSIZE */
			
 
				+#define ALMOST_4G ((uint32_t)-NTDB_PGSIZE)
			
 
				+/* And this pushes it over 32 bits */
			
 
				+#define A_LITTLE_BIT (NTDB_PGSIZE * 2)
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	if (sizeof(off_t) <= 4) {
			
 
				+		plan_tests(1);
			
 
				+		pass("No 64 bit off_t");
			
 
				+		return exit_status();
			
 
				+	}
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 16);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		off_t old_size;
			
 
				+		NTDB_DATA k, d;
			
 
				+		struct hash_info h;
			
 
				+		struct ntdb_used_record rec;
			
 
				+		ntdb_off_t off;
			
 
				+
			
 
				+		ntdb = ntdb_open("run-64-bit-ntdb.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		old_size = ntdb->file->map_size;
			
 
				+
			
 
				+		/* Add a fake record to chew up the existing free space. */
			
 
				+		k = ntdb_mkdata("fake", 4);
			
 
				+		d.dsize = ntdb->file->map_size
			
 
				+			- NEW_DATABASE_HDR_SIZE(ntdb->hash_bits) - 8;
			
 
				+		d.dptr = malloc(d.dsize);
			
 
				+		memset(d.dptr, 0, d.dsize);
			
 
				+		ok1(ntdb_store(ntdb, k, d, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb->file->map_size == old_size);
			
 
				+		free(d.dptr);
			
 
				+
			
 
				+		/* This makes a sparse file */
			
 
				+		ok1(ftruncate(ntdb->file->fd, ALMOST_4G) == 0);
			
 
				+		ok1(add_free_record(ntdb, old_size, ALMOST_4G - old_size,
			
 
				+				    NTDB_LOCK_WAIT, false) == NTDB_SUCCESS);
			
 
				+
			
 
				+		/* Now add a little record past the 4G barrier. */
			
 
				+		ok1(ntdb_expand_file(ntdb, A_LITTLE_BIT) == NTDB_SUCCESS);
			
 
				+		ok1(add_free_record(ntdb, ALMOST_4G, A_LITTLE_BIT,
			
 
				+				    NTDB_LOCK_WAIT, false)
			
 
				+		    == NTDB_SUCCESS);
			
 
				+
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+
			
 
				+		/* Test allocation path. */
			
 
				+		k = ntdb_mkdata("key", 4);
			
 
				+		d = ntdb_mkdata("data", 5);
			
 
				+		ok1(ntdb_store(ntdb, k, d, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+
			
 
				+		/* Make sure it put it at end as we expected. */
			
 
				+		off = find_and_lock(ntdb, k, F_RDLCK, &h, &rec, NULL);
			
 
				+		ok1(off >= ALMOST_4G);
			
 
				+		ntdb_unlock_hash(ntdb, h.h, F_RDLCK);
			
 
				+
			
 
				+		ok1(ntdb_fetch(ntdb, k, &d) == 0);
			
 
				+		ok1(d.dsize == 5);
			
 
				+		ok1(strcmp((char *)d.dptr, "data") == 0);
			
 
				+		free(d.dptr);
			
 
				+
			
 
				+		ok1(ntdb_delete(ntdb, k) == 0);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	/* We might get messages about mmap failing, so don't test
			
 
				+	 * tap_log_messages */
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-90-get-set-attributes.c
+++ b/ccan/ntdb/test/run-90-get-set-attributes.c
@@ -0,0 +1,161 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
			
 
				+		  void *unused)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static uint32_t hash_fn(const void *key, size_t len, uint32_t seed,
			
 
				+			void *priv)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	union ntdb_attribute seed_attr;
			
 
				+	union ntdb_attribute hash_attr;
			
 
				+	union ntdb_attribute lock_attr;
			
 
				+
			
 
				+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+	seed_attr.base.next = &hash_attr;
			
 
				+	seed_attr.seed.seed = 100;
			
 
				+
			
 
				+	hash_attr.base.attr = NTDB_ATTRIBUTE_HASH;
			
 
				+	hash_attr.base.next = &lock_attr;
			
 
				+	hash_attr.hash.fn = hash_fn;
			
 
				+	hash_attr.hash.data = &hash_attr;
			
 
				+
			
 
				+	lock_attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
			
 
				+	lock_attr.base.next = &tap_log_attr;
			
 
				+	lock_attr.flock.lock = mylock;
			
 
				+	lock_attr.flock.unlock = myunlock;
			
 
				+	lock_attr.flock.data = &lock_attr;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		union ntdb_attribute attr;
			
 
				+
			
 
				+		/* First open with no attributes. */
			
 
				+		ntdb = ntdb_open("run-90-get-set-attributes.ntdb",
			
 
				+				 flags[i] |MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
			
 
				+		ok1(ntdb);
			
 
				+
			
 
				+		/* Get log on no attributes will fail */
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == NTDB_ERR_NOEXIST);
			
 
				+		/* These always work. */
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_HASH;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_HASH);
			
 
				+		ok1(attr.hash.fn == ntdb_jenkins_hash);
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
			
 
				+		ok1(attr.flock.lock == ntdb_fcntl_lock);
			
 
				+		ok1(attr.flock.unlock == ntdb_fcntl_unlock);
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_SEED);
			
 
				+		/* This is possible, just astronomically unlikely. */
			
 
				+		ok1(attr.seed.seed != 0);
			
 
				+
			
 
				+		/* Unset attributes. */
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
			
 
				+
			
 
				+		/* Set them. */
			
 
				+		ok1(ntdb_set_attribute(ntdb, &tap_log_attr) == 0);
			
 
				+		ok1(ntdb_set_attribute(ntdb, &lock_attr) == 0);
			
 
				+		/* These should fail. */
			
 
				+		ok1(ntdb_set_attribute(ntdb, &seed_attr) == NTDB_ERR_EINVAL);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		ok1(ntdb_set_attribute(ntdb, &hash_attr) == NTDB_ERR_EINVAL);
			
 
				+		ok1(tap_log_messages == 2);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		/* Getting them should work as expected. */
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_LOG);
			
 
				+		ok1(attr.log.fn == tap_log_attr.log.fn);
			
 
				+		ok1(attr.log.data == tap_log_attr.log.data);
			
 
				+
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
			
 
				+		ok1(attr.flock.lock == mylock);
			
 
				+		ok1(attr.flock.unlock == myunlock);
			
 
				+		ok1(attr.flock.data == &lock_attr);
			
 
				+
			
 
				+		/* Unset them again. */
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		/* Now open with all attributes. */
			
 
				+		ntdb = ntdb_open("run-90-get-set-attributes.ntdb",
			
 
				+				 flags[i] | MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600,
			
 
				+				 &seed_attr);
			
 
				+
			
 
				+		ok1(ntdb);
			
 
				+
			
 
				+		/* Get will succeed */
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_LOG);
			
 
				+		ok1(attr.log.fn == tap_log_attr.log.fn);
			
 
				+		ok1(attr.log.data == tap_log_attr.log.data);
			
 
				+
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_HASH;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_HASH);
			
 
				+		ok1(attr.hash.fn == hash_fn);
			
 
				+		ok1(attr.hash.data == &hash_attr);
			
 
				+
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_FLOCK;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_FLOCK);
			
 
				+		ok1(attr.flock.lock == mylock);
			
 
				+		ok1(attr.flock.unlock == myunlock);
			
 
				+		ok1(attr.flock.data == &lock_attr);
			
 
				+
			
 
				+		attr.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+		ok1(ntdb_get_attribute(ntdb, &attr) == 0);
			
 
				+		ok1(attr.base.attr == NTDB_ATTRIBUTE_SEED);
			
 
				+		ok1(attr.seed.seed == seed_attr.seed.seed);
			
 
				+
			
 
				+		/* Unset attributes. */
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_HASH);
			
 
				+		ok1(tap_log_messages == 1);
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_SEED);
			
 
				+		ok1(tap_log_messages == 2);
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_FLOCK);
			
 
				+		ntdb_unset_attribute(ntdb, NTDB_ATTRIBUTE_LOG);
			
 
				+		ok1(tap_log_messages == 2);
			
 
				+		tap_log_messages = 0;
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-capabilities.c
+++ b/ccan/ntdb/test/run-capabilities.c
@@ -0,0 +1,283 @@
 
				+#include <ccan/failtest/failtest_override.h>
			
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+#include "layout.h"
			
 
				+#include "failtest_helper.h"
			
 
				+#include <stdarg.h>
			
 
				+
			
 
				+static size_t len_of(bool breaks_check, bool breaks_write, bool breaks_open)
			
 
				+{
			
 
				+	size_t len = 0;
			
 
				+	if (breaks_check)
			
 
				+		len += 8;
			
 
				+	if (breaks_write)
			
 
				+		len += 16;
			
 
				+	if (breaks_open)
			
 
				+		len += 32;
			
 
				+	return len;
			
 
				+}
			
 
				+
			
 
				+/* Creates a NTDB with various capabilities. */
			
 
				+static void create_ntdb(const char *name,
			
 
				+		       unsigned int cap,
			
 
				+		       bool breaks_check,
			
 
				+		       bool breaks_write,
			
 
				+		       bool breaks_open, ...)
			
 
				+{
			
 
				+	NTDB_DATA key, data;
			
 
				+	va_list ap;
			
 
				+	struct ntdb_layout *layout;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int fd, clen;
			
 
				+	union ntdb_attribute seed_attr;
			
 
				+
			
 
				+	/* Force a seed which doesn't allow records to clash! */
			
 
				+	seed_attr.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+	seed_attr.base.next = &tap_log_attr;
			
 
				+	seed_attr.seed.seed = 0;
			
 
				+
			
 
				+	key = ntdb_mkdata("Hello", 5);
			
 
				+	data = ntdb_mkdata("world", 5);
			
 
				+
			
 
				+	/* Create a NTDB with some data, and some capabilities */
			
 
				+	layout = new_ntdb_layout();
			
 
				+	ntdb_layout_add_freetable(layout);
			
 
				+	ntdb_layout_add_used(layout, key, data, 6);
			
 
				+	clen = len_of(breaks_check, breaks_write, breaks_open);
			
 
				+	ntdb_layout_add_free(layout, 15496 - clen, 0);
			
 
				+	ntdb_layout_add_capability(layout, cap,
			
 
				+				   breaks_write, breaks_check, breaks_open,
			
 
				+				   clen);
			
 
				+
			
 
				+	va_start(ap, breaks_open);
			
 
				+	while ((cap = va_arg(ap, int)) != 0) {
			
 
				+		breaks_check = va_arg(ap, int);
			
 
				+		breaks_write = va_arg(ap, int);
			
 
				+		breaks_open = va_arg(ap, int);
			
 
				+
			
 
				+		key.dsize--;
			
 
				+		ntdb_layout_add_used(layout, key, data, 11 - key.dsize);
			
 
				+		clen = len_of(breaks_check, breaks_write, breaks_open);
			
 
				+		ntdb_layout_add_free(layout, 16304 - clen, 0);
			
 
				+		ntdb_layout_add_capability(layout, cap,
			
 
				+					  breaks_write, breaks_check,
			
 
				+					  breaks_open, clen);
			
 
				+	}
			
 
				+	va_end(ap);
			
 
				+
			
 
				+	/* We open-code this, because we need to use the failtest write. */
			
 
				+	ntdb = ntdb_layout_get(layout, failtest_free, &seed_attr);
			
 
				+
			
 
				+	fd = open(name, O_RDWR|O_TRUNC|O_CREAT, 0600);
			
 
				+	if (fd < 0)
			
 
				+		err(1, "opening %s for writing", name);
			
 
				+	if (write(fd, ntdb->file->map_ptr, ntdb->file->map_size)
			
 
				+	    != ntdb->file->map_size)
			
 
				+		err(1, "writing %s", name);
			
 
				+	close(fd);
			
 
				+	ntdb_close(ntdb);
			
 
				+	ntdb_layout_free(layout);
			
 
				+}
			
 
				+
			
 
				+/* Note all the "goto out" early exits: they're to shorten failtest time. */
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	char *summary;
			
 
				+
			
 
				+	failtest_init(argc, argv);
			
 
				+	failtest_hook = block_repeat_failures;
			
 
				+	failtest_exit_check = exit_check_log;
			
 
				+	plan_tests(60);
			
 
				+
			
 
				+	failtest_suppress = true;
			
 
				+	/* Capability says you can ignore it? */
			
 
				+	create_ntdb("run-capabilities.ntdb", 1, false, false, false, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+			 &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+	/* Two capabilities say you can ignore them? */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, false, false, false, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+			 &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
			
 
				+	ok1(strstr(summary, "Capability 1\n"));
			
 
				+	free(summary);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+	/* Capability says you can't check. */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, true, false, false, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+			 &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	/* We expect a warning! */
			
 
				+	ok1(tap_log_messages == 1);
			
 
				+	ok1(strstr(log_last, "capabilit"));
			
 
				+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
			
 
				+	ok1(strstr(summary, "Capability 1\n"));
			
 
				+	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
			
 
				+	free(summary);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+	/* Capability says you can't write. */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, false, true, false, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+			 &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	/* We expect a message. */
			
 
				+	ok1(!ntdb);
			
 
				+	if (!ok1(tap_log_messages == 2))
			
 
				+		goto out;
			
 
				+	if (!ok1(strstr(log_last, "unknown")))
			
 
				+		goto out;
			
 
				+	ok1(strstr(log_last, "write"));
			
 
				+
			
 
				+	/* We can open it read-only though! */
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDONLY, 0,
			
 
				+			 &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 2);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	ok1(tap_log_messages == 2);
			
 
				+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
			
 
				+	ok1(strstr(summary, "Capability 1\n"));
			
 
				+	ok1(strstr(summary, "Capability 2 (read-only)\n"));
			
 
				+	free(summary);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+	/* Capability says you can't open. */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, false, false, true, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+		       &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	/* We expect a message. */
			
 
				+	ok1(!ntdb);
			
 
				+	if (!ok1(tap_log_messages == 3))
			
 
				+		goto out;
			
 
				+	if (!ok1(strstr(log_last, "unknown")))
			
 
				+		goto out;
			
 
				+
			
 
				+	/* Combine capabilities correctly. */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, true, false, false,
			
 
				+		   3, false, true, false, 0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+		       &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	/* We expect a message. */
			
 
				+	ok1(!ntdb);
			
 
				+	if (!ok1(tap_log_messages == 4))
			
 
				+		goto out;
			
 
				+	if (!ok1(strstr(log_last, "unknown")))
			
 
				+		goto out;
			
 
				+	ok1(strstr(log_last, "write"));
			
 
				+
			
 
				+	/* We can open it read-only though! */
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDONLY, 0,
			
 
				+		       &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 4);
			
 
				+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	/* We expect a warning! */
			
 
				+	ok1(tap_log_messages == 5);
			
 
				+	ok1(strstr(log_last, "unknown"));
			
 
				+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
			
 
				+	ok1(strstr(summary, "Capability 1\n"));
			
 
				+	ok1(strstr(summary, "Capability 2 (uncheckable)\n"));
			
 
				+	ok1(strstr(summary, "Capability 3 (read-only)\n"));
			
 
				+	free(summary);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+	/* Two capability flags in one. */
			
 
				+	create_ntdb("run-capabilities.ntdb",
			
 
				+		   1, false, false, false,
			
 
				+		   2, true, true, false,
			
 
				+		   0);
			
 
				+
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDWR, 0,
			
 
				+		       &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	/* We expect a message. */
			
 
				+	ok1(!ntdb);
			
 
				+	if (!ok1(tap_log_messages == 6))
			
 
				+		goto out;
			
 
				+	if (!ok1(strstr(log_last, "unknown")))
			
 
				+		goto out;
			
 
				+	ok1(strstr(log_last, "write"));
			
 
				+
			
 
				+	/* We can open it read-only though! */
			
 
				+	failtest_suppress = false;
			
 
				+	ntdb = ntdb_open("run-capabilities.ntdb", MAYBE_NOSYNC, O_RDONLY, 0,
			
 
				+		       &tap_log_attr);
			
 
				+	failtest_suppress = true;
			
 
				+	if (!ok1(ntdb))
			
 
				+		goto out;
			
 
				+	ok1(tap_log_messages == 6);
			
 
				+	ok1(ntdb_get_flags(ntdb) & NTDB_CANT_CHECK);
			
 
				+	ok1(ntdb_check(ntdb, NULL, NULL) == NTDB_SUCCESS);
			
 
				+	/* We expect a warning! */
			
 
				+	ok1(tap_log_messages == 7);
			
 
				+	ok1(strstr(log_last, "unknown"));
			
 
				+	ok1(ntdb_summary(ntdb, 0, &summary) == NTDB_SUCCESS);
			
 
				+	ok1(strstr(summary, "Capability 1\n"));
			
 
				+	ok1(strstr(summary, "Capability 2 (uncheckable,read-only)\n"));
			
 
				+	free(summary);
			
 
				+	ntdb_close(ntdb);
			
 
				+
			
 
				+out:
			
 
				+	failtest_exit(exit_status());
			
 
				+
			
 
				+	/*
			
 
				+	 * We will never reach this but the compiler complains if we do not
			
 
				+	 * return in this function.
			
 
				+	 */
			
 
				+	return EFAULT;
			
 
				+}
			
--- a/ccan/ntdb/test/run-expand-in-transaction.c
+++ b/ccan/ntdb/test/run-expand-in-transaction.c
@@ -0,0 +1,47 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = ntdb_mkdata("key", 3);
			
 
				+	NTDB_DATA data = ntdb_mkdata("data", 4);
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		size_t size;
			
 
				+		NTDB_DATA k, d;
			
 
				+		ntdb = ntdb_open("run-expand-in-transaction.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		size = ntdb->file->map_size;
			
 
				+		/* Add a fake record to chew up the existing free space. */
			
 
				+		k = ntdb_mkdata("fake", 4);
			
 
				+		d.dsize = ntdb->file->map_size
			
 
				+			- NEW_DATABASE_HDR_SIZE(ntdb->hash_bits) - 8;
			
 
				+		d.dptr = malloc(d.dsize);
			
 
				+		memset(d.dptr, 0, d.dsize);
			
 
				+		ok1(ntdb_store(ntdb, k, d, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb->file->map_size == size);
			
 
				+		free(d.dptr);
			
 
				+		ok1(ntdb_transaction_start(ntdb) == 0);
			
 
				+		ok1(ntdb_store(ntdb, key, data, NTDB_INSERT) == 0);
			
 
				+		ok1(ntdb->file->map_size > size);
			
 
				+		ok1(ntdb_transaction_commit(ntdb) == 0);
			
 
				+		ok1(ntdb->file->map_size > size);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-features.c
+++ b/ccan/ntdb/test/run-features.c
@@ -0,0 +1,62 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	NTDB_DATA key = { (unsigned char *)&j, sizeof(j) };
			
 
				+	NTDB_DATA data = { (unsigned char *)&j, sizeof(j) };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		uint64_t features;
			
 
				+		ntdb = ntdb_open("run-features.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Put some stuff in there. */
			
 
				+		for (j = 0; j < 100; j++) {
			
 
				+			if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+				fail("Storing in ntdb");
			
 
				+		}
			
 
				+
			
 
				+		/* Mess with features fields in hdr. */
			
 
				+		features = (~NTDB_FEATURE_MASK ^ 1);
			
 
				+		ok1(ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
			
 
				+						    features_used),
			
 
				+				      &features, sizeof(features)) == 0);
			
 
				+		ok1(ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
			
 
				+						    features_offered),
			
 
				+				      &features, sizeof(features)) == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		ntdb = ntdb_open("run-features.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR, 0, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Should not have changed features offered. */
			
 
				+		ok1(ntdb_read_convert(ntdb, offsetof(struct ntdb_header,
			
 
				+						   features_offered),
			
 
				+				     &features, sizeof(features)) == 0);
			
 
				+		ok1(features == (~NTDB_FEATURE_MASK ^ 1));
			
 
				+
			
 
				+		/* Should have cleared unknown bits in features_used. */
			
 
				+		ok1(ntdb_read_convert(ntdb, offsetof(struct ntdb_header,
			
 
				+						   features_used),
			
 
				+				     &features, sizeof(features)) == 0);
			
 
				+		ok1(features == (1 & NTDB_FEATURE_MASK));
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-lockall.c
+++ b/ccan/ntdb/test/run-lockall.c
@@ -0,0 +1,74 @@
 
				+#include "private.h"
			
 
				+#include <unistd.h>
			
 
				+#include "lock-tracking.h"
			
 
				+
			
 
				+#define fcntl fcntl_with_lockcheck
			
 
				+#include "ntdb-source.h"
			
 
				+
			
 
				+#include "tap-interface.h"
			
 
				+#include <stdlib.h>
			
 
				+#include <stdbool.h>
			
 
				+#include <stdarg.h>
			
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define TEST_DBNAME "run-lockall.ntdb"
			
 
				+#define KEY_STR "key"
			
 
				+
			
 
				+#undef fcntl
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	struct agent *agent;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	int i;
			
 
				+
			
 
				+	plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1);
			
 
				+	agent = prepare_external_agent();
			
 
				+	if (!agent)
			
 
				+		err(1, "preparing agent");
			
 
				+
			
 
				+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
			
 
				+		enum agent_return ret;
			
 
				+		struct ntdb_context *ntdb;
			
 
				+
			
 
				+		ntdb = ntdb_open(TEST_DBNAME, flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ok1(ntdb);
			
 
				+
			
 
				+		ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
			
 
				+		ok1(ret == SUCCESS);
			
 
				+
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, STORE, KEY_STR "=" KEY_STR)
			
 
				+		    == WOULD_HAVE_BLOCKED);
			
 
				+		ok1(external_agent_operation(agent, FETCH, KEY_STR "=" KEY_STR)
			
 
				+		    == WOULD_HAVE_BLOCKED);
			
 
				+		/* Test nesting. */
			
 
				+		ok1(ntdb_lockall(ntdb) == NTDB_SUCCESS);
			
 
				+		ntdb_unlockall(ntdb);
			
 
				+		ntdb_unlockall(ntdb);
			
 
				+
			
 
				+		ok1(external_agent_operation(agent, STORE, KEY_STR "=" KEY_STR)
			
 
				+		    == SUCCESS);
			
 
				+
			
 
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, STORE, KEY_STR "=" KEY_STR)
			
 
				+		    == WOULD_HAVE_BLOCKED);
			
 
				+		ok1(external_agent_operation(agent, FETCH, KEY_STR "=" KEY_STR)
			
 
				+		    == SUCCESS);
			
 
				+		ok1(ntdb_lockall_read(ntdb) == NTDB_SUCCESS);
			
 
				+		ntdb_unlockall_read(ntdb);
			
 
				+		ntdb_unlockall_read(ntdb);
			
 
				+
			
 
				+		ok1(external_agent_operation(agent, STORE, KEY_STR "=" KEY_STR)
			
 
				+		    == SUCCESS);
			
 
				+		ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	free_external_agent(agent);
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-remap-in-read_traverse.c
+++ b/ccan/ntdb/test/run-remap-in-read_traverse.c
@@ -0,0 +1,57 @@
 
				+#include "ntdb-source.h"
			
 
				+/* We had a bug where we marked the ntdb read-only for a ntdb_traverse_read.
			
 
				+ * If we then expanded the ntdb, we would remap read-only, and later SEGV. */
			
 
				+#include "tap-interface.h"
			
 
				+#include "external-agent.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static bool file_larger(int fd, ntdb_len_t size)
			
 
				+{
			
 
				+	struct stat st;
			
 
				+
			
 
				+	fstat(fd, &st);
			
 
				+	return st.st_size != size;
			
 
				+}
			
 
				+
			
 
				+static unsigned add_records_to_grow(struct agent *agent, int fd, ntdb_len_t size)
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+
			
 
				+	for (i = 0; !file_larger(fd, size); i++) {
			
 
				+		char data[50];
			
 
				+		sprintf(data, "%i=%i", i, i);
			
 
				+		if (external_agent_operation(agent, STORE, data) != SUCCESS)
			
 
				+			return 0;
			
 
				+	}
			
 
				+	diag("Added %u records to grow file", i);
			
 
				+	return i;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct agent *agent;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	NTDB_DATA d = ntdb_mkdata("hello", 5);
			
 
				+	const char filename[] = "run-remap-in-read_traverse.ntdb";
			
 
				+
			
 
				+	plan_tests(4);
			
 
				+
			
 
				+	agent = prepare_external_agent();
			
 
				+
			
 
				+	ntdb = ntdb_open(filename, MAYBE_NOSYNC,
			
 
				+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+
			
 
				+	ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS);
			
 
				+	i = add_records_to_grow(agent, ntdb->file->fd, ntdb->file->map_size);
			
 
				+
			
 
				+	/* Do a traverse. */
			
 
				+	ok1(ntdb_traverse(ntdb, NULL, NULL) == i);
			
 
				+
			
 
				+	/* Now store something! */
			
 
				+	ok1(ntdb_store(ntdb, d, d, NTDB_INSERT) == 0);
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	ntdb_close(ntdb);
			
 
				+	free_external_agent(agent);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-seed.c
+++ b/ccan/ntdb/test/run-seed.c
@@ -0,0 +1,61 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+static int log_count = 0;
			
 
				+
			
 
				+/* Normally we get a log when setting random seed. */
			
 
				+static void my_log_fn(struct ntdb_context *ntdb,
			
 
				+		      enum ntdb_log_level level,
			
 
				+		      enum NTDB_ERROR ecode,
			
 
				+		      const char *message, void *priv)
			
 
				+{
			
 
				+	log_count++;
			
 
				+}
			
 
				+
			
 
				+static union ntdb_attribute log_attr = {
			
 
				+	.log = { .base = { .attr = NTDB_ATTRIBUTE_LOG },
			
 
				+		 .fn = my_log_fn }
			
 
				+};
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	union ntdb_attribute attr;
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	attr.seed.base.attr = NTDB_ATTRIBUTE_SEED;
			
 
				+	attr.seed.base.next = &log_attr;
			
 
				+	attr.seed.seed = 42;
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3);
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		struct ntdb_header hdr;
			
 
				+		int fd;
			
 
				+		ntdb = ntdb_open("run-seed.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue;
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(ntdb->hash_seed == 42);
			
 
				+		ok1(log_count == 0);
			
 
				+		ntdb_close(ntdb);
			
 
				+
			
 
				+		if (flags[i] & NTDB_INTERNAL)
			
 
				+			continue;
			
 
				+
			
 
				+		fd = open("run-seed.ntdb", O_RDONLY);
			
 
				+		ok1(fd >= 0);
			
 
				+		ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
			
 
				+		if (flags[i] & NTDB_CONVERT)
			
 
				+			ok1(bswap_64(hdr.hash_seed) == 42);
			
 
				+		else
			
 
				+			ok1(hdr.hash_seed == 42);
			
 
				+		close(fd);
			
 
				+	}
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-tdb_errorstr.c
+++ b/ccan/ntdb/test/run-tdb_errorstr.c
@@ -0,0 +1,52 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	enum NTDB_ERROR e;
			
 
				+	plan_tests(NTDB_ERR_RDONLY*-1 + 2);
			
 
				+
			
 
				+	for (e = NTDB_SUCCESS; e >= NTDB_ERR_RDONLY; e--) {
			
 
				+		switch (e) {
			
 
				+		case NTDB_SUCCESS:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Success"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_IO:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "IO Error"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_LOCK:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Locking error"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_OOM:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Out of memory"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_EXISTS:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Record exists"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_EINVAL:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Invalid parameter"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_NOEXIST:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Record does not exist"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_RDONLY:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "write not permitted"));
			
 
				+			break;
			
 
				+		case NTDB_ERR_CORRUPT:
			
 
				+			ok1(!strcmp(ntdb_errorstr(e),
			
 
				+				    "Corrupt database"));
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	ok1(!strcmp(ntdb_errorstr(e), "Invalid error code"));
			
 
				+
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-tdb_foreach.c
+++ b/ccan/ntdb/test/run-tdb_foreach.c
@@ -0,0 +1,90 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+/* ntdb_foreach() callback: decrements *count and returns 1 once it
			
 
				+ * reaches zero, 0 otherwise.  The database handle is not used. */
			
 
				+static int drop_count(struct ntdb_context *ntdb, unsigned int *count)
			
 
				+{
			
 
				+	*count -= 1;
			
 
				+	return *count == 0 ? 1 : 0;
			
 
				+}
			
 
				+
			
 
				+/* ntdb_foreach() callback: marks found[n] for the database whose name is
			
 
				+ * "run-ntdb_foreachN.ntdb".  Aborts on an unknown name or when the same
			
 
				+ * database is reported twice.  Always returns 0. */
			
 
				+static int set_found(struct ntdb_context *ntdb, bool found[3])
			
 
				+{
			
 
				+	static const char *const expected[3] = {
			
 
				+		"run-ntdb_foreach0.ntdb",
			
 
				+		"run-ntdb_foreach1.ntdb",
			
 
				+		"run-ntdb_foreach2.ntdb",
			
 
				+	};
			
 
				+	const char *name = ntdb_name(ntdb);
			
 
				+	unsigned int idx;
			
 
				+
			
 
				+	for (idx = 0; idx < 3; idx++) {
			
 
				+		if (strcmp(name, expected[idx]) == 0)
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	/* idx == 3 means no name matched; it is tested first so that
			
 
				+	 * found[] is never indexed out of range. */
			
 
				+	if (idx == 3 || found[idx])
			
 
				+		abort();
			
 
				+
			
 
				+	found[idx] = true;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[]) /* Checks which open databases ntdb_foreach() visits as three databases are opened and closed. */
			
 
				+{
			
 
				+	unsigned int i, count;
			
 
				+	bool found[3]; /* found[n] is set by set_found() for run-ntdb_foreachN.ntdb */
			
 
				+	struct ntdb_context *ntdb0, *ntdb1, *ntdb;
			
 
				+	int flags[] = { NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_CONVERT, NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8); /* eight ok1() checks per flag combination */
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb0 = ntdb_open("run-ntdb_foreach0.ntdb",
			
 
				+				  flags[i]|MAYBE_NOSYNC,
			
 
				+				  O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ntdb1 = ntdb_open("run-ntdb_foreach1.ntdb",
			
 
				+				  flags[i]|MAYBE_NOSYNC,
			
 
				+				  O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
			
 
				+		ntdb = ntdb_open("run-ntdb_foreach2.ntdb",
			
 
				+				 flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); /* NOTE(review): none of the three handles is checked for NULL */
			
 
				+
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(found[0] && found[1] && found[2]); /* all three are open */
			
 
				+
			
 
				+		/* Test premature iteration termination */
			
 
				+		count = 1; /* drop_count() returns 1 on its first call */
			
 
				+		ntdb_foreach(drop_count, &count);
			
 
				+		ok1(count == 0); /* a second call would have wrapped count below zero */
			
 
				+
			
 
				+		ntdb_close(ntdb1);
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(found[0] && !found[1] && found[2]); /* database 1 is closed */
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(found[0] && !found[1] && !found[2]); /* only database 0 is still open */
			
 
				+
			
 
				+		ntdb1 = ntdb_open("run-ntdb_foreach1.ntdb",
			
 
				+				  flags[i]|MAYBE_NOSYNC,
			
 
				+				  O_RDWR, 0600, &tap_log_attr); /* reopen the existing file: no O_CREAT|O_TRUNC this time */
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(found[0] && found[1] && !found[2]);
			
 
				+
			
 
				+		ntdb_close(ntdb0);
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(!found[0] && found[1] && !found[2]);
			
 
				+
			
 
				+		ntdb_close(ntdb1);
			
 
				+		memset(found, 0, sizeof(found));
			
 
				+		ntdb_foreach(set_found, found);
			
 
				+		ok1(!found[0] && !found[1] && !found[2]); /* nothing is open any more */
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+	}
			
 
				+
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/run-traverse.c
+++ b/ccan/ntdb/test/run-traverse.c
@@ -0,0 +1,203 @@
 
				+#include "ntdb-source.h"
			
 
				+#include "tap-interface.h"
			
 
				+#include "logging.h"
			
 
				+
			
 
				+#define NUM_RECORDS 1000 /* number of records store_records() writes; the traversal checks compare against it */
			
 
				+
			
 
				+/* Hash callback that ignores its seed argument and always hashes with the
			
 
				+ * 64-bit seed p points to, so a run reuses the seed a failure was seen
			
 
				+ * with. */
			
 
				+static uint32_t fixedhash(const void *key, size_t len, uint32_t seed, void *p)
			
 
				+{
			
 
				+	const uint64_t *fixed_seed = p;
			
 
				+
			
 
				+	return hash64_stable((const unsigned char *)key, len, *fixed_seed);
			
 
				+}
			
 
				+
			
 
				+/* Stores NUM_RECORDS records whose key and value are both the raw bytes of
			
 
				+ * the record number (0 .. NUM_RECORDS-1).  Returns false as soon as a
			
 
				+ * store fails, true otherwise. */
			
 
				+static bool store_records(struct ntdb_context *ntdb)
			
 
				+{
			
 
				+	int n = 0;
			
 
				+	NTDB_DATA key, data;
			
 
				+
			
 
				+	/* Key and value both alias n, so they follow it as it counts up. */
			
 
				+	key.dptr = (unsigned char *)&n;
			
 
				+	key.dsize = sizeof(n);
			
 
				+	data.dptr = (unsigned char *)&n;
			
 
				+	data.dsize = sizeof(n);
			
 
				+
			
 
				+	while (n < NUM_RECORDS) {
			
 
				+		if (ntdb_store(ntdb, key, data, NTDB_REPLACE) != 0)
			
 
				+			return false;
			
 
				+		n++;
			
 
				+	}
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+/* State shared between main() and the trav() callback. */
			
 
				+struct trav_data {
			
 
				+	/* Number of times trav() has been called. */
			
 
				+	unsigned int calls;
			
 
				+	/* trav() returns 1 on the call that makes calls equal to this. */
			
 
				+	unsigned int call_limit;
			
 
				+	/* Smallest value trav() has seen. */
			
 
				+	int low;
			
 
				+	/* Largest value trav() has seen. */
			
 
				+	int high;
			
 
				+	/* Set when a record's key and data do not match. */
			
 
				+	bool mismatch;
			
 
				+	/* When true, trav() deletes each record it visits. */
			
 
				+	bool delete;
			
 
				+	/* Result of the most recent ntdb_delete() made by trav(). */
			
 
				+	enum NTDB_ERROR delete_error;
			
 
				+};
			
 
				+
			
 
				+/* ntdb_traverse() callback.  Counts the call, checks that key and data are
			
 
				+ * one int each with identical bytes, tracks the lowest and highest value
			
 
				+ * seen and, when td->delete is set, deletes the record.
			
 
				+ *
			
 
				+ * Returns -1 on a mismatch or a failed delete, 1 on the call that reaches
			
 
				+ * td->call_limit, and 0 otherwise. */
			
 
				+static int trav(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
			
 
				+		struct trav_data *td)
			
 
				+{
			
 
				+	int val;
			
 
				+	bool sizes_ok;
			
 
				+
			
 
				+	td->calls++;
			
 
				+
			
 
				+	/* The contents are only compared once both sizes are known good. */
			
 
				+	sizes_ok = key.dsize == sizeof(val) && dbuf.dsize == sizeof(val);
			
 
				+	if (!sizes_ok || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
			
 
				+		td->mismatch = true;
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	memcpy(&val, dbuf.dptr, dbuf.dsize);
			
 
				+	if (td->low > val)
			
 
				+		td->low = val;
			
 
				+	if (td->high < val)
			
 
				+		td->high = val;
			
 
				+
			
 
				+	if (td->delete) {
			
 
				+		td->delete_error = ntdb_delete(ntdb, key);
			
 
				+		if (td->delete_error != NTDB_SUCCESS)
			
 
				+			return -1;
			
 
				+	}
			
 
				+
			
 
				+	return td->calls == td->call_limit ? 1 : 0;
			
 
				+}
			
 
				+
			
 
				+struct trav_grow_data { /* state shared between main() and the trav_grow() callback */
			
 
				+	unsigned int calls; /* number of times trav_grow() has been called */
			
 
				+	unsigned int num_large; /* calls that saw data longer than one int, i.e. a record already appended to */
			
 
				+	bool mismatch; /* set when a record's key does not match the start of its data */
			
 
				+	enum NTDB_ERROR error; /* result of the most recent ntdb_append() made by trav_grow() */
			
 
				+};
			
 
				+
			
 
				+/* ntdb_traverse() callback that appends 128 zero bytes to every record it
			
 
				+ * is shown.  It counts the call, checks that the key is one int whose
			
 
				+ * bytes match the start of the data, and counts records that already
			
 
				+ * carry extra bytes.
			
 
				+ *
			
 
				+ * Returns -1 on a mismatch or a failed append, 0 otherwise. */
			
 
				+static int trav_grow(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA dbuf,
			
 
				+		     struct trav_grow_data *tgd)
			
 
				+{
			
 
				+	unsigned char padding[128] = { 0 };
			
 
				+	NTDB_DATA extra;
			
 
				+	bool well_formed;
			
 
				+
			
 
				+	tgd->calls++;
			
 
				+
			
 
				+	well_formed = key.dsize == sizeof(int)
			
 
				+		&& dbuf.dsize >= sizeof(int)
			
 
				+		&& memcmp(key.dptr, dbuf.dptr, key.dsize) == 0;
			
 
				+	if (!well_formed) {
			
 
				+		tgd->mismatch = true;
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	/* We must have seen this before! */
			
 
				+	if (dbuf.dsize > sizeof(int))
			
 
				+		tgd->num_large++;
			
 
				+
			
 
				+	/* Make a big difference to the database. */
			
 
				+	extra.dptr = padding;
			
 
				+	extra.dsize = sizeof(padding);
			
 
				+	tgd->error = ntdb_append(ntdb, key, extra);
			
 
				+
			
 
				+	return tgd->error == NTDB_SUCCESS ? 0 : -1;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[]) /* Exercises ntdb_traverse(): full, short, deleting and growing traversals for each flag combination. */
			
 
				+{
			
 
				+	unsigned int i;
			
 
				+	int num;
			
 
				+	struct trav_data td;
			
 
				+	struct trav_grow_data tgd;
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	uint64_t seed = 16014841315512641303ULL; /* reaches fixedhash() through hattr.hash.data */
			
 
				+	int flags[] = { NTDB_INTERNAL, NTDB_DEFAULT, NTDB_NOMMAP,
			
 
				+			NTDB_INTERNAL|NTDB_CONVERT, NTDB_CONVERT,
			
 
				+			NTDB_NOMMAP|NTDB_CONVERT };
			
 
				+	union ntdb_attribute hattr = { .hash = { .base = { NTDB_ATTRIBUTE_HASH },
			
 
				+						.fn = fixedhash,
			
 
				+						.data = &seed } };
			
 
				+
			
 
				+	hattr.base.next = &tap_log_attr; /* chain the logging attribute behind the hash attribute */
			
 
				+
			
 
				+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1); /* 32 ok1() checks per flag combination plus the final log check */
			
 
				+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			
 
				+		ntdb = ntdb_open("run-traverse.ntdb", flags[i]|MAYBE_NOSYNC,
			
 
				+				 O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
			
 
				+		ok1(ntdb);
			
 
				+		if (!ntdb)
			
 
				+			continue; /* the remaining checks for this flag set are skipped, so the plan will not be met */
			
 
				+
			
 
				+		ok1(ntdb_traverse(ntdb, NULL, NULL) == 0); /* nothing has been stored yet */
			
 
				+
			
 
				+		ok1(store_records(ntdb));
			
 
				+		num = ntdb_traverse(ntdb, NULL, NULL);
			
 
				+		ok1(num == NUM_RECORDS);
			
 
				+
			
 
				+		/* Full traverse. */
			
 
				+		td.calls = 0;
			
 
				+		td.call_limit = UINT_MAX; /* never reached here, so trav() does not stop early */
			
 
				+		td.low = INT_MAX;
			
 
				+		td.high = INT_MIN;
			
 
				+		td.mismatch = false;
			
 
				+		td.delete = false; /* delete_error is only written by trav() when delete is set */
			
 
				+
			
 
				+		num = ntdb_traverse(ntdb, trav, &td);
			
 
				+		ok1(num == NUM_RECORDS);
			
 
				+		ok1(!td.mismatch);
			
 
				+		ok1(td.calls == NUM_RECORDS);
			
 
				+		ok1(td.low == 0);
			
 
				+		ok1(td.high == NUM_RECORDS-1);
			
 
				+
			
 
				+		/* Short traverse. */
			
 
				+		td.calls = 0;
			
 
				+		td.call_limit = NUM_RECORDS / 2; /* trav() returns 1 on this call */
			
 
				+		td.low = INT_MAX;
			
 
				+		td.high = INT_MIN;
			
 
				+		td.mismatch = false;
			
 
				+		td.delete = false;
			
 
				+
			
 
				+		num = ntdb_traverse(ntdb, trav, &td);
			
 
				+		ok1(num == NUM_RECORDS / 2);
			
 
				+		ok1(!td.mismatch);
			
 
				+		ok1(td.calls == NUM_RECORDS / 2);
			
 
				+		ok1(td.low <= NUM_RECORDS / 2);
			
 
				+		ok1(td.high > NUM_RECORDS / 2);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(tap_log_messages == 0);
			
 
				+
			
 
				+		/* Deleting traverse (delete everything). */
			
 
				+		td.calls = 0;
			
 
				+		td.call_limit = UINT_MAX;
			
 
				+		td.low = INT_MAX;
			
 
				+		td.high = INT_MIN;
			
 
				+		td.mismatch = false;
			
 
				+		td.delete = true; /* trav() now deletes every record it visits */
			
 
				+		td.delete_error = NTDB_SUCCESS;
			
 
				+		num = ntdb_traverse(ntdb, trav, &td);
			
 
				+		ok1(num == NUM_RECORDS);
			
 
				+		ok1(td.delete_error == NTDB_SUCCESS);
			
 
				+		ok1(!td.mismatch);
			
 
				+		ok1(td.calls == NUM_RECORDS);
			
 
				+		ok1(td.low == 0);
			
 
				+		ok1(td.high == NUM_RECORDS - 1);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Now it's empty! */
			
 
				+		ok1(ntdb_traverse(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Re-add. */
			
 
				+		ok1(store_records(ntdb));
			
 
				+		ok1(ntdb_traverse(ntdb, NULL, NULL) == NUM_RECORDS);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+
			
 
				+		/* Grow.  This will cause us to be reshuffled. */
			
 
				+		tgd.calls = 0;
			
 
				+		tgd.num_large = 0;
			
 
				+		tgd.mismatch = false;
			
 
				+		tgd.error = NTDB_SUCCESS;
			
 
				+		ok1(ntdb_traverse(ntdb, trav_grow, &tgd) > 1);
			
 
				+		ok1(tgd.error == 0);
			
 
				+		ok1(!tgd.mismatch);
			
 
				+		ok1(ntdb_check(ntdb, NULL, NULL) == 0);
			
 
				+		ok1(tgd.num_large < tgd.calls); /* at least one call saw a record that had not been grown yet */
			
 
				+		diag("growing db: %u calls, %u repeats",
			
 
				+		     tgd.calls, tgd.num_large);
			
 
				+
			
 
				+		ntdb_close(ntdb);
			
 
				+	}
			
 
				+
			
 
				+	ok1(tap_log_messages == 0);
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/ntdb/test/tap-interface.c
+++ b/ccan/ntdb/test/tap-interface.c
@@ -0,0 +1,3 @@
 
				+#include "tap-interface.h"
			
 
				+
			
 
				+unsigned tap_ok_count, tap_ok_target = -1U; /* passes recorded so far (starts at 0) and the planned total, which stays at -1U until plan_tests() sets it */
			
--- a/ccan/ntdb/test/tap-interface.h
+++ b/ccan/ntdb/test/tap-interface.h
@@ -0,0 +1,42 @@
 
				+/*
			
 
				+   Unix SMB/CIFS implementation.
			
 
				+   Simplistic implementation of tap interface.
			
 
				+
			
 
				+   Copyright (C) Rusty Russell 2012
			
 
				+
			
 
				+     ** NOTE! The following LGPL license applies to the talloc
			
 
				+     ** library. This does NOT imply that all of Samba is released
			
 
				+     ** under the LGPL
			
 
				+
			
 
				+   This library is free software; you can redistribute it and/or
			
 
				+   modify it under the terms of the GNU Lesser General Public
			
 
				+   License as published by the Free Software Foundation; either
			
 
				+   version 3 of the License, or (at your option) any later version.
			
 
				+
			
 
				+   This library is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+   Lesser General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU Lesser General Public
			
 
				+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
			
 
				+*/
			
 
				+#include <stdio.h>
			
 
				+#include <ccan/err/err.h>
			
 
				+#include "no-fsync.h"
			
 
				+
			
 
				+#ifndef __location__ /* define it here only when nothing else already has */
			
 
				+#define __TAP_STRING_LINE1__(s)    #s /* stringify the argument exactly as written */
			
 
				+#define __TAP_STRING_LINE2__(s)   __TAP_STRING_LINE1__(s) /* extra level so the argument is macro-expanded before it is stringified */
			
 
				+#define __TAP_STRING_LINE3__  __TAP_STRING_LINE2__(__LINE__) /* the current line number as a string literal */
			
 
				+#define __location__ __FILE__ ":" __TAP_STRING_LINE3__ /* "file:line" built by string-literal concatenation */
			
 
				+#endif
			
 
				+
			
 
				+extern unsigned tap_ok_count, tap_ok_target; /* passes recorded so far, and the total set by plan_tests() */
			
 
				+#define plan_tests(num) do { tap_ok_target = (num); } while(0) /* record how many passes are expected */
			
 
				+#define ok(e, ...) ((e) ? (printf("."), tap_ok_count++, true) : (warnx(__VA_ARGS__), false)) /* on success print "." and count the pass; on failure warnx() the given message; evaluates to true or false; uses true/false without including <stdbool.h> itself */
			
 
				+#define ok1(e) ok((e), "%s:%s", __location__, #e) /* ok() whose failure message is the location plus the expression text */
			
 
				+#define pass(...) (printf("."), tap_ok_count++) /* count a pass unconditionally; the arguments are ignored */
			
 
				+#define fail(...) warnx(__VA_ARGS__) /* report a failure message; no counter is changed */
			
 
				+#define diag(...) do { printf(__VA_ARGS__); printf("\n"); } while(0) /* printf-style diagnostic followed by a newline */
			
 
				+#define exit_status() (tap_ok_count == tap_ok_target ? 0 : 1) /* 0 only when the pass count equals the planned total */
			
--- a/ccan/ntdb/tools/Makefile
+++ b/ccan/ntdb/tools/Makefile
@@ -0,0 +1,16 @@
 
				+# Builds the ntdb command-line tools and benchmarks against the objects
			
 
				+# listed in OBJS.
			
 
				+OBJS:=../../ntdb.o ../../hash.o ../../tally.o
			
 
				+CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
			
 
				+LDFLAGS:=-L../../..
			
 
				+
			
 
				+# default and clean name actions, not files: keep them working even if a
			
 
				+# file with one of those names appears in this directory.
			
 
				+.PHONY: default clean
			
 
				+
			
 
				+default: ntdbtorture ntdbtool ntdbdump ntdbrestore mkntdb speed growtdb-bench
			
 
				+
			
 
				+ntdbdump: ntdbdump.c $(OBJS)
			
 
				+ntdbrestore: ntdbrestore.c $(OBJS)
			
 
				+ntdbtorture: ntdbtorture.c $(OBJS)
			
 
				+ntdbtool: ntdbtool.c $(OBJS)
			
 
				+mkntdb: mkntdb.c $(OBJS)
			
 
				+speed: speed.c $(OBJS)
			
 
				+growtdb-bench: growtdb-bench.c $(OBJS)
			
 
				+
			
 
				+clean:
			
 
				+	rm -f ntdbtorture ntdbdump ntdbrestore ntdbtool mkntdb speed growtdb-bench
			
--- a/ccan/ntdb/tools/growtdb-bench.c
+++ b/ccan/ntdb/tools/growtdb-bench.c
@@ -0,0 +1,127 @@
 
				+#include "ntdb.h"
			
 
				+#include <stdlib.h>
			
 
				+#include <string.h>
			
 
				+#include <stdio.h>
			
 
				+#include <unistd.h>
			
 
				+#include <ccan/err/err.h>
			
 
				+
			
 
				+static void logfn(struct ntdb_context *ntdb,
			
 
				+		  enum ntdb_log_level level,
			
 
				+		  enum NTDB_ERROR ecode,
			
 
				+		  const char *message,
			
 
				+		  void *data)
			
 
				+{
			
 
				+	fprintf(stderr, "ntdb:%s:%s:%s\n",
			
 
				+		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
			
 
				+}
			
 
				+
			
 
				+/* Benchmark for a growing database.
			
 
				+ *
			
 
				+ * Usage: growtdb-bench <users> <groups>
			
 
				+ *
			
 
				+ * Creates /tmp/growtdb.ntdb, inserts <users> records while appending to a
			
 
				+ * single index record, then for each of <groups> groups appends to every
			
 
				+ * user record and to the group record.  After each phase the database is
			
 
				+ * checked and /proc/<pid>/statm is printed.
			
 
				+ *
			
 
				+ * Exits with 1 on a usage or ntdb error, returns -1 when an allocation
			
 
				+ * fails, and returns 0 on success. */
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	unsigned int i, j, users, groups;
			
 
				+	NTDB_DATA idxkey, idxdata;
			
 
				+	NTDB_DATA k, d, gk;
			
 
				+	char cmd[100];
			
 
				+	struct ntdb_context *ntdb;
			
 
				+	enum NTDB_ERROR ecode;
			
 
				+	union ntdb_attribute log;
			
 
				+
			
 
				+	if (argc != 3) {
			
 
				+		printf("Usage: growtdb-bench <users> <groups>\n");
			
 
				+		exit(1);
			
 
				+	}
			
 
				+	users = atoi(argv[1]);
			
 
				+	groups = atoi(argv[2]);
			
 
				+
			
 
				+	/* Shell command that prints this process's memory statistics. */
			
 
				+	snprintf(cmd, sizeof(cmd), "cat /proc/%i/statm", (int)getpid());
			
 
				+
			
 
				+	log.base.attr = NTDB_ATTRIBUTE_LOG;
			
 
				+	log.base.next = NULL;
			
 
				+	log.log.fn = logfn;
			
 
				+
			
 
				+	ntdb = ntdb_open("/tmp/growtdb.ntdb", NTDB_DEFAULT,
			
 
				+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
			
 
				+	/* Nothing below can work without an open database. */
			
 
				+	if (ntdb == NULL)
			
 
				+		errx(1, "ntdb_open of /tmp/growtdb.ntdb failed");
			
 
				+
			
 
				+	idxkey.dptr = (unsigned char *)"User index";
			
 
				+	idxkey.dsize = strlen("User index");
			
 
				+	idxdata.dsize = 51;
			
 
				+	idxdata.dptr = calloc(idxdata.dsize, 1);
			
 
				+	if (idxdata.dptr == NULL) {
			
 
				+		fprintf(stderr, "Unable to allocate memory for idxdata.dptr\n");
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	/* Create users. */
			
 
				+	k.dsize = 48;
			
 
				+	k.dptr = calloc(k.dsize, 1);
			
 
				+	if (k.dptr == NULL) {
			
 
				+		fprintf(stderr, "Unable to allocate memory for k.dptr\n");
			
 
				+		return -1;
			
 
				+	}
			
 
				+	d.dsize = 64;
			
 
				+	d.dptr = calloc(d.dsize, 1);
			
 
				+	if (d.dptr == NULL) {
			
 
				+		fprintf(stderr, "Unable to allocate memory for d.dptr\n");
			
 
				+		return -1;
			
 
				+	}
			
 
				+
			
 
				+	if ((ecode = ntdb_transaction_start(ntdb)) != 0)
			
 
				+		errx(1, "ntdb transaction start failed: %s",
			
 
				+		     ntdb_errorstr(ecode));
			
 
				+	for (i = 0; i < users; i++) {
			
 
				+		/* The user number forms the first bytes of the key. */
			
 
				+		memcpy(k.dptr, &i, sizeof(i));
			
 
				+		ecode = ntdb_store(ntdb, k, d, NTDB_INSERT);
			
 
				+		if (ecode != NTDB_SUCCESS)
			
 
				+			errx(1, "ntdb insert failed: %s", ntdb_errorstr(ecode));
			
 
				+
			
 
				+		/* This simulates a growing index record. */
			
 
				+		ecode = ntdb_append(ntdb, idxkey, idxdata);
			
 
				+		if (ecode != NTDB_SUCCESS)
			
 
				+			errx(1, "ntdb append failed: %s", ntdb_errorstr(ecode));
			
 
				+	}
			
 
				+	if ((ecode = ntdb_transaction_commit(ntdb)) != 0)
			
 
				+		errx(1, "ntdb commit1 failed: %s", ntdb_errorstr(ecode));
			
 
				+
			
 
				+	if ((ecode = ntdb_check(ntdb, NULL, NULL)) != 0)
			
 
				+		errx(1, "ntdb_check failed after initial insert!");
			
 
				+
			
 
				+	system(cmd);
			
 
				+
			
 
				+	/* Now put them all in groups: add 32 bytes to each record for
			
 
				+	 * a group. */
			
 
				+	gk.dsize = 48;
			
 
				+	/* Size the buffer from gk itself rather than from k. */
			
 
				+	gk.dptr = calloc(gk.dsize, 1);
			
 
				+	if (gk.dptr == NULL) {
			
 
				+		fprintf(stderr, "Unable to allocate memory for gk.dptr\n");
			
 
				+		return -1;
			
 
				+	}
			
 
				+	/* A non-zero last byte keeps group keys distinct from user keys. */
			
 
				+	gk.dptr[gk.dsize-1] = 1;
			
 
				+
			
 
				+	d.dsize = 32;
			
 
				+	for (i = 0; i < groups; i++) {
			
 
				+		if ((ecode = ntdb_transaction_start(ntdb)) != 0)
			
 
				+			errx(1, "ntdb transaction start failed: %s",
			
 
				+			     ntdb_errorstr(ecode));
			
 
				+		/* Create the "group". */
			
 
				+		memcpy(gk.dptr, &i, sizeof(i));
			
 
				+		ecode = ntdb_store(ntdb, gk, d, NTDB_INSERT);
			
 
				+		if (ecode != NTDB_SUCCESS)
			
 
				+			errx(1, "ntdb insert failed: %s", ntdb_errorstr(ecode));
			
 
				+
			
 
				+		/* Now populate it. */
			
 
				+		for (j = 0; j < users; j++) {
			
 
				+			/* Append to the user. */
			
 
				+			memcpy(k.dptr, &j, sizeof(j));
			
 
				+			if ((ecode = ntdb_append(ntdb, k, d)) != 0)
			
 
				+				errx(1, "ntdb append failed: %s",
			
 
				+				     ntdb_errorstr(ecode));
			
 
				+
			
 
				+			/* Append to the group. */
			
 
				+			if ((ecode = ntdb_append(ntdb, gk, d)) != 0)
			
 
				+				errx(1, "ntdb append failed: %s",
			
 
				+				     ntdb_errorstr(ecode));
			
 
				+		}
			
 
				+		if ((ecode = ntdb_transaction_commit(ntdb)) != 0)
			
 
				+			errx(1, "ntdb commit2 failed: %s", ntdb_errorstr(ecode));
			
 
				+		if ((ecode = ntdb_check(ntdb, NULL, NULL)) != 0)
			
 
				+			errx(1, "ntdb_check failed after iteration %u!", i);
			
 
				+		system(cmd);
			
 
				+	}
			
 
				+
			
 
				+	/* Release the database and the key/data buffers. */
			
 
				+	ntdb_close(ntdb);
			
 
				+	free(gk.dptr);
			
 
				+	free(d.dptr);
			
 
				+	free(k.dptr);
			
 
				+	free(idxdata.dptr);
			
 
				+
			
 
				+	return 0;
			
 
				+}