Pchen0
/
ccan
mirror of https://git.ozlabs.org/~ccan/ccan


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725
							#include <ccan/tdb/tdb.h>
#include <ccan/grab_file/grab_file.h>
#include <ccan/hash/hash.h>
#include <ccan/talloc/talloc.h>
#include <ccan/str_talloc/str_talloc.h>
#include <ccan/str/str.h>
#include <ccan/list/list.h>
#include <err.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <errno.h>
#include <signal.h>
#include <assert.h>

#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

/* Avoid mod by zero */
static unsigned int total_keys = 1;

/* #define DEBUG_DEPS 1 */

/* Traversals block transactions in the current implementation. */
#define TRAVERSALS_TAKE_TRANSACTION_LOCK 1

struct pipe {
	int fd[2];
};
static struct pipe *pipes;

static void __attribute__((noreturn)) fail(const char *filename,
					   unsigned int line,
					   const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	fprintf(stderr, "%s:%u: FAIL: ", filename, line);
	vfprintf(stderr, fmt, ap);
	fprintf(stderr, "\n");
	va_end(ap);
	exit(1);
}
	
/* Try or die. */
#define try(expr, expect)						\
	do {								\
		int ret = (expr);					\
		if (ret != (expect))					\
			fail(filename[file], i+1,			\
			     STRINGIFY(expr) "= %i", ret);		\
	} while (0)

/* Try or imitate results. */
#define unreliable(expr, expect, force, undo)				\
	do {								\
		int ret = expr;						\
		if (ret != expect) {					\
			fprintf(stderr, "%s:%u: %s gave %i not %i",	\
				filename[file], i+1, STRINGIFY(expr),	\
				ret, expect);				\
			if (expect == 0)				\
				force;					\
			else						\
				undo;					\
		}							\
	} while (0)

static bool key_eq(TDB_DATA a, TDB_DATA b)
{
	if (a.dsize != b.dsize)
		return false;
	return memcmp(a.dptr, b.dptr, a.dsize) == 0;
}

/* This is based on the hash algorithm from gdbm */
static unsigned int hash_key(TDB_DATA *key)
{
	uint32_t value;	/* Used to compute the hash value.  */
	uint32_t   i;	/* Used to cycle through random values. */

	/* Set the initial value from the key size. */
	for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
		value = (value + (key->dptr[i] << (i*5 % 24)));

	return (1103515243 * value + 12345);  
}

enum op_type {
	OP_TDB_LOCKALL,
	OP_TDB_LOCKALL_MARK,
	OP_TDB_LOCKALL_UNMARK,
	OP_TDB_LOCKALL_NONBLOCK,
	OP_TDB_UNLOCKALL,
	OP_TDB_LOCKALL_READ,
	OP_TDB_LOCKALL_READ_NONBLOCK,
	OP_TDB_UNLOCKALL_READ,
	OP_TDB_CHAINLOCK,
	OP_TDB_CHAINLOCK_NONBLOCK,
	OP_TDB_CHAINLOCK_MARK,
	OP_TDB_CHAINLOCK_UNMARK,
	OP_TDB_CHAINUNLOCK,
	OP_TDB_CHAINLOCK_READ,
	OP_TDB_CHAINUNLOCK_READ,
	OP_TDB_PARSE_RECORD,
	OP_TDB_EXISTS,
	OP_TDB_STORE,
	OP_TDB_APPEND,
	OP_TDB_GET_SEQNUM,
	OP_TDB_WIPE_ALL,
	OP_TDB_TRANSACTION_START,
	OP_TDB_TRANSACTION_CANCEL,
	OP_TDB_TRANSACTION_COMMIT,
	OP_TDB_TRAVERSE_READ_START,
	OP_TDB_TRAVERSE_START,
	OP_TDB_TRAVERSE_END,
	OP_TDB_TRAVERSE,
	OP_TDB_FIRSTKEY,
	OP_TDB_NEXTKEY,
	OP_TDB_FETCH,
	OP_TDB_DELETE,
};

struct op {
	unsigned int serial;
	enum op_type op;
	TDB_DATA key;
	TDB_DATA data;
	int ret;

	/* Who is waiting for us? */
	struct list_head post;
	/* What are we waiting for? */
	struct list_head pre;

	/* If I'm part of a group (traverse/transaction) where is
	 * start?  (Otherwise, 0) */
	unsigned int group_start;

	union {
		int flag; /* open and store */
		struct {  /* append */
			TDB_DATA pre;
			TDB_DATA post;
		} append;
		/* transaction/traverse start/chainlock */
		unsigned int group_len;
	};
};

static unsigned char hex_char(const char *filename, unsigned int line, char c)
{
	c = toupper(c);
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	if (c >= '0' && c <= '9')
		return c - '0';
	fail(filename, line, "invalid hex character '%c'", c);
}

/* TDB data is <size>:<%02x>* */
static TDB_DATA make_tdb_data(const void *ctx,
			      const char *filename, unsigned int line,
			      const char *word)
{
	TDB_DATA data;
	unsigned int i;
	const char *p;

	if (streq(word, "NULL"))
		return tdb_null;

	data.dsize = atoi(word);
	data.dptr = talloc_array(ctx, unsigned char, data.dsize);
	p = strchr(word, ':');
	if (!p)
		fail(filename, line, "invalid tdb data '%s'", word);
	p++;
	for (i = 0; i < data.dsize; i++)
		data.dptr[i] = hex_char(filename, line, p[i*2])*16
			+ hex_char(filename, line, p[i*2+1]);

	return data;
}

static void add_op(const char *filename, struct op **op, unsigned int i,
		   unsigned int serial, enum op_type type)
{
	struct op *new;
	*op = talloc_realloc(NULL, *op, struct op, i+1);
	new = (*op) + i;
	new->op = type;
	new->serial = serial;
	new->ret = 0;
	new->group_start = 0;
}

static void op_add_nothing(const char *filename,
			   struct op op[], unsigned int op_num, char *words[])
{
	if (words[2])
		fail(filename, op_num+1, "Expected no arguments");
	op[op_num].key = tdb_null;
}

static void op_add_key(const char *filename,
		       struct op op[], unsigned int op_num, char *words[])
{
	if (words[2] == NULL || words[3])
		fail(filename, op_num+1, "Expected just a key");

	op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
	total_keys++;
}

static void op_add_key_ret(const char *filename,
			   struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || !words[4] || words[5]
	    || !streq(words[3], "="))
		fail(filename, op_num+1, "Expected <key> = <ret>");
	op[op_num].ret = atoi(words[4]);
	op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
	/* May only be a unique key if it fails */
	if (op[op_num].ret != 0)
		total_keys++;
}

static void op_add_key_data(const char *filename,
			    struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || !words[4] || words[5]
	    || !streq(words[3], "="))
		fail(filename, op_num+1, "Expected <key> = <data>");
	op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
	/* May only be a unique key if it fails */
	if (!op[op_num].data.dptr)
		total_keys++;
}

/* We don't record the keys or data for a traverse, as we don't use them. */
static void op_add_traverse(const char *filename,
			    struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || !words[4] || words[5]
	    || !streq(words[3], "="))
		fail(filename, op_num+1, "Expected <key> = <data>");
	op[op_num].key = tdb_null;
}

/* Full traverse info is useful for debugging, but changing it to
 * "traversefn" without the data makes the traces *much* smaller! */
static void op_add_traversefn(const char *filename,
			    struct op op[], unsigned int op_num, char *words[])
{
	if (words[2])
		fail(filename, op_num+1, "Expected no values");
	op[op_num].key = tdb_null;
}

/* <serial> tdb_store <rec> <rec> <flag> = <ret> */
static void op_add_store(const char *filename,
			 struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
	    || words[7] || !streq(words[5], "="))
		fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");

	op[op_num].flag = strtoul(words[4], NULL, 0);
	op[op_num].ret = atoi(words[6]);
	op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
	total_keys++;
}

/* <serial> tdb_append <rec> <rec> = <rec> */
static void op_add_append(const char *filename,
			  struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
	    || !streq(words[4], "="))
		fail(filename, op_num+1, "Expect <key> <data> = <rec>");

	op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);

	op[op_num].append.post
		= make_tdb_data(op, filename, op_num+1, words[5]);

	/* By subtraction, figure out what previous data was. */
	op[op_num].append.pre.dptr = op[op_num].append.post.dptr;
	op[op_num].append.pre.dsize
		= op[op_num].append.post.dsize - op[op_num].data.dsize;
	total_keys++;
}

/* <serial> tdb_get_seqnum = <ret> */
static void op_add_seqnum(const char *filename,
			  struct op op[], unsigned int op_num, char *words[])
{
	if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
		fail(filename, op_num+1, "Expect = <ret>");

	op[op_num].key = tdb_null;
	op[op_num].ret = atoi(words[3]);
}

static void op_add_traverse_start(const char *filename,
				  struct op op[],
				  unsigned int op_num, char *words[])
{
	if (words[2])
		fail(filename, op_num+1, "Expect no arguments");

	op[op_num].key = tdb_null;
	op[op_num].group_len = 0;
}

static void op_add_transaction(const char *filename, struct op op[],
			       unsigned int op_num, char *words[])
{
	if (words[2])
		fail(filename, op_num+1, "Expect no arguments");

	op[op_num].key = tdb_null;
	op[op_num].group_len = 0;
}

static void op_add_chainlock(const char *filename,
			     struct op op[], unsigned int op_num, char *words[])
{
	if (words[2] == NULL || words[3])
		fail(filename, op_num+1, "Expected just a key");

	/* A chainlock key isn't a key in the normal sense; it doesn't
	 * have to be in the db at all.  Also, we don't want to hash this op. */
	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].key = tdb_null;
	op[op_num].group_len = 0;
}

static void op_add_chainlock_ret(const char *filename,
				 struct op op[], unsigned int op_num,
				 char *words[])
{
	if (!words[2] || !words[3] || !words[4] || words[5]
	    || !streq(words[3], "="))
		fail(filename, op_num+1, "Expected <key> = <ret>");
	op[op_num].ret = atoi(words[4]);
	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].key = tdb_null;
	op[op_num].group_len = 0;
	total_keys++;
}

static int op_find_start(struct op op[], unsigned int op_num, enum op_type type)
{
	unsigned int i;

	for (i = op_num-1; i > 0; i--) {
		if (op[i].op == type && !op[i].group_len)
			return i;
	}
	return 0;
}

static void op_analyze_transaction(const char *filename,
				   struct op op[], unsigned int op_num,
				   char *words[])
{
	unsigned int start, i;

	op[op_num].key = tdb_null;

	if (words[2])
		fail(filename, op_num+1, "Expect no arguments");

	start = op_find_start(op, op_num, OP_TDB_TRANSACTION_START);
	if (!start)
		fail(filename, op_num+1, "no transaction start found");

	op[start].group_len = op_num - start;

	/* This rolls in nested transactions.  I think that's right. */
	for (i = start; i <= op_num; i++)
		op[i].group_start = start;
}

/* We treat chainlocks a lot like transactions, even though that's overkill */
static void op_analyze_chainlock(const char *filename,
				 struct op op[], unsigned int op_num,
				 char *words[])
{
	unsigned int i, start;

	if (words[2] == NULL || words[3])
		fail(filename, op_num+1, "Expected just a key");

	op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
	op[op_num].key = tdb_null;
	total_keys++;

	start = op_find_start(op, op_num, OP_TDB_CHAINLOCK);
	if (!start)
		start = op_find_start(op, op_num, OP_TDB_CHAINLOCK_READ);
	if (!start)
		fail(filename, op_num+1, "no initial chainlock found");

	/* FIXME: We'd have to do something clever to make this work
	 * vs. deadlock. */
	if (!key_eq(op[start].data, op[op_num].data))
		fail(filename, op_num+1, "nested chainlock calls?");

	op[start].group_len = op_num - start;
	for (i = start; i <= op_num; i++)
		op[i].group_start = start;
}

static void op_analyze_traverse(const char *filename,
				struct op op[], unsigned int op_num,
				char *words[])
{
	int i, start;

	op[op_num].key = tdb_null;

	/* = %u means traverse function terminated. */
	if (words[2]) {
		if (!streq(words[2], "=") || !words[3] || words[4])
			fail(filename, op_num+1, "expect = <num>");
		op[op_num].ret = atoi(words[3]);
	} else
		op[op_num].ret = 0;

	start = op_find_start(op, op_num, OP_TDB_TRAVERSE_START);
	if (!start)
		start = op_find_start(op, op_num, OP_TDB_TRAVERSE_READ_START);
	if (!start)
		fail(filename, op_num+1, "no traversal start found");

	op[start].group_len = op_num - start;

	/* Don't roll in nested traverse/chainlock */
	for (i = start; i <= op_num; i++)
		if (!op[i].group_start)
			op[i].group_start = start;
}

/* Keep -Wmissing-declarations happy: */
const struct op_table *
find_keyword (register const char *str, register unsigned int len);

#include "keywords.c"

struct depend {
	/* We can have more than one */
	struct list_node pre_list;
	struct list_node post_list;
	unsigned int needs_file;
	unsigned int needs_opnum;
	unsigned int satisfies_file;
	unsigned int satisfies_opnum;
};

static void check_deps(const char *filename, struct op op[], unsigned int num)
{
#ifdef DEBUG_DEPS
	unsigned int i;

	for (i = 1; i < num; i++)
		if (!list_empty(&op[i].pre))
			fail(filename, i+1, "Still has dependencies");
#endif
}

static void dump_pre(char *filename[], struct op *op[],
		     unsigned int file, unsigned int i)
{
	struct depend *dep;

	printf("%s:%u (%u) still waiting for:\n", filename[file], i+1,
		op[file][i].serial);
	list_for_each(&op[file][i].pre, dep, pre_list)
		printf("    %s:%u (%u)\n",
		       filename[dep->satisfies_file], dep->satisfies_opnum+1,
		       op[dep->satisfies_file][dep->satisfies_opnum].serial);
	check_deps(filename[file], op[file], i);
}

/* We simply read/write pointers, since we all are children. */
static bool do_pre(struct tdb_context *tdb,
		   char *filename[], struct op *op[],
		   unsigned int file, int pre_fd, unsigned int i,
		   bool backoff)
{
	while (!list_empty(&op[file][i].pre)) {
		struct depend *dep;

#if DEBUG_DEPS
		printf("%s:%u:waiting for pre\n", filename[file], i+1);
		fflush(stdout);
#endif
		if (backoff)
			alarm(2);
		else
			alarm(10);
		while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
			if (errno == EINTR) {
				if (backoff) {
					warnx("%s:%u:avoiding deadlock",
					      filename[file], i+1);
					return false;
				}
				dump_pre(filename, op, file, i);
				exit(1);
			} else
				errx(1, "Reading from pipe");
		}
		alarm(0);

#if DEBUG_DEPS
		printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
		       dep->needs_opnum+1, filename[dep->satisfies_file],
		       dep->satisfies_opnum+1);
		fflush(stdout);
#endif
		/* This could be any op, not just this one. */
		talloc_free(dep);
	}
	return true;
}

static void do_post(char *filename[], struct op *op[],
		    unsigned int file, unsigned int i)
{
	struct depend *dep;

	list_for_each(&op[file][i].post, dep, post_list) {
#if DEBUG_DEPS
		printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
		       filename[dep->needs_file], dep->needs_opnum+1);
#endif
		if (write(pipes[dep->needs_file].fd[1], &dep, sizeof(dep))
		    != sizeof(dep))
			err(1, "%s:%u failed to tell file %s",
			    filename[file], i+1, filename[dep->needs_file]);
	}
}

static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
{
	return data.dsize;
}

static unsigned run_ops(struct tdb_context *tdb,
			int pre_fd,
			char *filename[],
			struct op *op[],
			unsigned int file,
			unsigned int start, unsigned int stop,
			bool backoff);

struct traverse_info {
	struct op **op;
	char **filename;
	unsigned file;
	int pre_fd;
	unsigned int start;
	unsigned int i;
};

/* More complex.  Just do whatever's they did at the n'th entry. */
static int nontrivial_traverse(struct tdb_context *tdb,
			       TDB_DATA key, TDB_DATA data,
			       void *_tinfo)
{
	struct traverse_info *tinfo = _tinfo;
	unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
	bool avoid_deadlock = false;

	if (tinfo->i == tinfo->start + trav_len) {
		/* This can happen if traverse expects to be empty. */
		if (trav_len == 1)
			return 1;
		fail(tinfo->filename[tinfo->file], tinfo->start + 1,
		     "traverse did not terminate");
	}

	if (tinfo->op[tinfo->file][tinfo->i].op != OP_TDB_TRAVERSE)
		fail(tinfo->filename[tinfo->file], tinfo->start + 1,
		     "%s:%u:traverse terminated early");

#if TRAVERSALS_TAKE_TRANSACTION_LOCK
	avoid_deadlock = true;
#endif

	/* Run any normal ops. */
	tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
			   tinfo->file, tinfo->i+1, tinfo->start + trav_len,
			   avoid_deadlock);

	/* We backed off, or we hit OP_TDB_TRAVERSE_END. */
	if (tinfo->op[tinfo->file][tinfo->i].op != OP_TDB_TRAVERSE)
		return 1;

	return 0;
}

static unsigned op_traverse(struct tdb_context *tdb,
			    int pre_fd,
			    char *filename[],
			    unsigned int file,
			    int (*traversefn)(struct tdb_context *,
					      tdb_traverse_func, void *),
			    struct op *op[],
			    unsigned int start)
{
	struct traverse_info tinfo = { op, filename, file, pre_fd,
				       start, start+1 };

	traversefn(tdb, nontrivial_traverse, &tinfo);

	/* Traversing in wrong order can have strange effects: eg. if
	 * original traverse went A (delete A), B, we might do B
	 * (delete A).  So if we have ops left over, we do it now. */
	while (tinfo.i != start + op[file][start].group_len) {
		if (op[file][tinfo.i].op == OP_TDB_TRAVERSE)
			tinfo.i++;
		else
			tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
					  tinfo.i,
					  start + op[file][start].group_len,
					  false);
	}

	return tinfo.i;
}

static void break_out(int sig)
{
}

static __attribute__((noinline))
unsigned run_ops(struct tdb_context *tdb,
		 int pre_fd,
		 char *filename[],
		 struct op *op[],
		 unsigned int file,
		 unsigned int start, unsigned int stop,
		 bool backoff)
{
	unsigned int i;
	struct sigaction sa;

	sa.sa_handler = break_out;
	sa.sa_flags = 0;

	sigaction(SIGALRM, &sa, NULL);
	for (i = start; i < stop; i++) {
		if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
			return i;

		switch (op[file][i].op) {
		case OP_TDB_LOCKALL:
			try(tdb_lockall(tdb), op[file][i].ret);
			break;
		case OP_TDB_LOCKALL_MARK:
			try(tdb_lockall_mark(tdb), op[file][i].ret);
			break;
		case OP_TDB_LOCKALL_UNMARK:
			try(tdb_lockall_unmark(tdb), op[file][i].ret);
			break;
		case OP_TDB_LOCKALL_NONBLOCK:
			unreliable(tdb_lockall_nonblock(tdb), op[file][i].ret,
				   tdb_lockall(tdb), tdb_unlockall(tdb));
			break;
		case OP_TDB_UNLOCKALL:
			try(tdb_unlockall(tdb), op[file][i].ret);
			break;
		case OP_TDB_LOCKALL_READ:
			try(tdb_lockall_read(tdb), op[file][i].ret);
			break;
		case OP_TDB_LOCKALL_READ_NONBLOCK:
			unreliable(tdb_lockall_read_nonblock(tdb),
				   op[file][i].ret,
				   tdb_lockall_read(tdb),
				   tdb_unlockall_read(tdb));
			break;
		case OP_TDB_UNLOCKALL_READ:
			try(tdb_unlockall_read(tdb), op[file][i].ret);
			break;
		case OP_TDB_CHAINLOCK:
			try(tdb_chainlock(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_CHAINLOCK_NONBLOCK:
			unreliable(tdb_chainlock_nonblock(tdb, op[file][i].key),
				   op[file][i].ret,
				   tdb_chainlock(tdb, op[file][i].key),
				   tdb_chainunlock(tdb, op[file][i].key));
			break;
		case OP_TDB_CHAINLOCK_MARK:
			try(tdb_chainlock_mark(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_CHAINLOCK_UNMARK:
			try(tdb_chainlock_unmark(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_CHAINUNLOCK:
			try(tdb_chainunlock(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_CHAINLOCK_READ:
			try(tdb_chainlock_read(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_CHAINUNLOCK_READ:
			try(tdb_chainunlock_read(tdb, op[file][i].key),
			    op[file][i].ret);
			break;
		case OP_TDB_PARSE_RECORD:
			try(tdb_parse_record(tdb, op[file][i].key, get_len,
					     NULL),
			    op[file][i].ret);
			break;
		case OP_TDB_EXISTS:
			try(tdb_exists(tdb, op[file][i].key), op[file][i].ret);
			break;
		case OP_TDB_STORE:
			try(tdb_store(tdb, op[file][i].key, op[file][i].data,
				      op[file][i].flag),
			    op[file][i].ret);
			break;
		case OP_TDB_APPEND:
			try(tdb_append(tdb, op[file][i].key, op[file][i].data),
			    op[file][i].ret);
			break;
		case OP_TDB_GET_SEQNUM:
			try(tdb_get_seqnum(tdb), op[file][i].ret);
			break;
		case OP_TDB_WIPE_ALL:
			try(tdb_wipe_all(tdb), op[file][i].ret);
			break;
		case OP_TDB_TRANSACTION_START:
			try(tdb_transaction_start(tdb), op[file][i].ret);
			break;
		case OP_TDB_TRANSACTION_CANCEL:
			try(tdb_transaction_cancel(tdb), op[file][i].ret);
			break;
		case OP_TDB_TRANSACTION_COMMIT:
			try(tdb_transaction_commit(tdb), op[file][i].ret);
			break;
		case OP_TDB_TRAVERSE_READ_START:
			i = op_traverse(tdb, pre_fd, filename, file,
					tdb_traverse_read, op, i);
			break;
		case OP_TDB_TRAVERSE_START:
			i = op_traverse(tdb, pre_fd, filename, file,
					tdb_traverse, op, i);
			break;
		case OP_TDB_TRAVERSE:
			/* Terminate: we're in a traverse, and we've
			 * done our ops. */
			return i;
		case OP_TDB_TRAVERSE_END:
			fail(filename[file], i+1, "unexpected end traverse");
		/* FIXME: These must be treated like traverse. */
		case OP_TDB_FIRSTKEY:
			if (!key_eq(tdb_firstkey(tdb), op[file][i].data))
				fail(filename[file], i+1, "bad firstkey");
			break;
		case OP_TDB_NEXTKEY:
			if (!key_eq(tdb_nextkey(tdb, op[file][i].key),
				    op[file][i].data))
				fail(filename[file], i+1, "bad nextkey");
			break;
		case OP_TDB_FETCH: {
			TDB_DATA f = tdb_fetch(tdb, op[file][i].key);
			if (!key_eq(f, op[file][i].data))
				fail(filename[file], i+1, "bad fetch %u",
				     f.dsize);
			break;
		}
		case OP_TDB_DELETE:
			try(tdb_delete(tdb, op[file][i].key), op[file][i].ret);
			break;
		}
		do_post(filename, op, file, i);
	}
	return i;
}

/* tdbtorture, in particular, can do a tdb_close with a transaction in
 * progress. */
static struct op *maybe_cancel_transaction(const char *filename,
					   struct op *op, unsigned int *num)
{
	unsigned int start = op_find_start(op, *num, OP_TDB_TRANSACTION_START);

	if (start) {
		char *words[] = { "<unknown>", "tdb_close", NULL };
		add_op(filename, &op, *num, op[start].serial,
		       OP_TDB_TRANSACTION_CANCEL);
		op_analyze_transaction(filename, op, *num, words);
		(*num)++;
	}
	return op;
}

static struct op *load_tracefile(const char *filename, unsigned int *num,
				 unsigned int *hashsize,
				 unsigned int *tdb_flags,
				 unsigned int *open_flags)
{
	unsigned int i;
	struct op *op = talloc_array(NULL, struct op, 1);
	char **words;
	char **lines;
	char *file;

	file = grab_file(NULL, filename, NULL);
	if (!file)
		err(1, "Reading %s", filename);

	lines = strsplit(file, file, "\n", NULL);
	if (!lines[0])
		errx(1, "%s is empty", filename);

	words = strsplit(lines, lines[0], " ", NULL);
	if (!streq(words[1], "tdb_open"))
		fail(filename, 1, "does not start with tdb_open");

	*hashsize = atoi(words[2]);
	*tdb_flags = strtoul(words[3], NULL, 0);
	*open_flags = strtoul(words[4], NULL, 0);

	for (i = 1; lines[i]; i++) {
		const struct op_table *opt;

		words = strsplit(lines, lines[i], " ", NULL);
		if (!words[0] || !words[1])
			fail(filename, i+1, "Expected serial number and op");
	       
		opt = find_keyword(words[1], strlen(words[1]));
		if (!opt) {
			if (streq(words[1], "tdb_close")) {
				if (lines[i+1])
					fail(filename, i+2,
					     "lines after tdb_close");
				*num = i;
				talloc_free(lines);
				return maybe_cancel_transaction(filename,
								op, num);
			}
			fail(filename, i+1, "Unknown operation '%s'", words[1]);
		}

		add_op(filename, &op, i, atoi(words[0]), opt->type);
		opt->enhance_op(filename, op, i, words);
	}

	fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
	      filename, i);
	talloc_free(lines);
	*num = i - 1;
	return maybe_cancel_transaction(filename, op, num);
}

/* We remember all the keys we've ever seen, and who has them. */
struct key_user {
	unsigned int file;
	unsigned int op_num;
};

struct keyinfo {
	TDB_DATA key;
	unsigned int num_users;
	struct key_user *user;
};

static const TDB_DATA must_not_exist;
static const TDB_DATA must_exist;
static const TDB_DATA not_exists_or_empty;

/* NULL means doesn't care if it exists or not, &must_exist means
 * it must exist but we don't care what, &must_not_exist means it must
 * not exist, otherwise the data it needs. */
static const TDB_DATA *needs(const struct op *op)
{
	switch (op->op) {
	/* FIXME: Pull forward deps, since we can deadlock */
	case OP_TDB_CHAINLOCK:
	case OP_TDB_CHAINLOCK_NONBLOCK:
	case OP_TDB_CHAINLOCK_MARK:
	case OP_TDB_CHAINLOCK_UNMARK:
	case OP_TDB_CHAINUNLOCK:
	case OP_TDB_CHAINLOCK_READ:
	case OP_TDB_CHAINUNLOCK_READ:
		return NULL;

	case OP_TDB_APPEND:
		if (op->append.pre.dsize == 0)
			return &not_exists_or_empty;
		return &op->append.pre;

	case OP_TDB_STORE:
		if (op->flag == TDB_INSERT) {
			if (op->ret < 0)
				return &must_exist;
			else
				return &must_not_exist;
		} else if (op->flag == TDB_MODIFY) {
			if (op->ret < 0)
				return &must_not_exist;
			else
				return &must_exist;
		}
		/* No flags?  Don't care */
		return NULL;

	case OP_TDB_EXISTS:
		if (op->ret == 1)
			return &must_exist;
		else
			return &must_not_exist;

	case OP_TDB_PARSE_RECORD:
		if (op->ret < 0)
			return &must_not_exist;
		return &must_exist;

	/* FIXME: handle these. */
	case OP_TDB_WIPE_ALL:
	case OP_TDB_FIRSTKEY:
	case OP_TDB_NEXTKEY:
	case OP_TDB_GET_SEQNUM:
	case OP_TDB_TRAVERSE:
	case OP_TDB_TRANSACTION_COMMIT:
	case OP_TDB_TRANSACTION_CANCEL:
	case OP_TDB_TRANSACTION_START:
		return NULL;

	case OP_TDB_FETCH:
		if (!op->data.dptr)
			return &must_not_exist;
		return &op->data;

	case OP_TDB_DELETE:
		if (op->ret < 0)
			return &must_not_exist;
		return &must_exist;

	default:
		errx(1, "Unexpected op %i", op->op);
	}
	
}

static bool starts_transaction(const struct op *op)
{
	return op->op == OP_TDB_TRANSACTION_START;
}

static bool in_transaction(const struct op op[], unsigned int i)
{
	return op[i].group_start && starts_transaction(&op[op[i].group_start]);
}

static bool successful_transaction(const struct op *op)
{
	return starts_transaction(op)
		&& op[op->group_len].op == OP_TDB_TRANSACTION_COMMIT;
}

static bool starts_traverse(const struct op *op)
{
	return op->op == OP_TDB_TRAVERSE_START
		|| op->op == OP_TDB_TRAVERSE_READ_START;
}

static bool in_traverse(const struct op op[], unsigned int i)
{
	return op[i].group_start && starts_traverse(&op[op[i].group_start]);
}

static bool starts_chainlock(const struct op *op)
{
	return op->op == OP_TDB_CHAINLOCK_READ || op->op == OP_TDB_CHAINLOCK;
}

static bool in_chainlock(const struct op op[], unsigned int i)
{
	return op[i].group_start && starts_chainlock(&op[op[i].group_start]);
}

/* What's the data after this op?  pre if nothing changed. */
static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
			     const struct op *op)
{
	if (starts_transaction(op) || starts_chainlock(op)) {
		unsigned int i;

		/* Cancelled transactions don't change anything. */
		if (op[op->group_len].op == OP_TDB_TRANSACTION_CANCEL)
			return pre;
		assert(op[op->group_len].op == OP_TDB_TRANSACTION_COMMIT
		       || op[op->group_len].op == OP_TDB_CHAINUNLOCK_READ
		       || op[op->group_len].op == OP_TDB_CHAINUNLOCK);

		for (i = 1; i < op->group_len; i++) {
			/* This skips nested transactions, too */
			if (key_eq(op[i].key, *key))
				pre = gives(key, pre, &op[i]);
		}
		return pre;
	}

	/* Failed ops don't change state of db. */
	if (op->ret < 0)
		return pre;

	if (op->op == OP_TDB_DELETE || op->op == OP_TDB_WIPE_ALL)
		return &tdb_null;

	if (op->op == OP_TDB_APPEND)
		return &op->append.post;

	if (op->op == OP_TDB_STORE)
		return &op->data;

	return pre;
}

static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
				unsigned int num)
{
	unsigned int i, j, h;
	struct keyinfo *hash;

	hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
	for (i = 0; i < num; i++) {
		for (j = 1; j < num_ops[i]; j++) {
			/* We can't do this on allocation, due to realloc. */
			list_head_init(&op[i][j].post);
			list_head_init(&op[i][j].pre);

			if (!op[i][j].key.dptr)
				continue;

			h = hash_key(&op[i][j].key) % (total_keys * 2);
			while (!key_eq(hash[h].key, op[i][j].key)) {
				if (!hash[h].key.dptr) {
					hash[h].key = op[i][j].key;
					break;
				}
				h = (h + 1) % (total_keys * 2);
			}
			/* Might as well save some memory if we can. */
			if (op[i][j].key.dptr != hash[h].key.dptr) {
				talloc_free(op[i][j].key.dptr);
				op[i][j].key.dptr = hash[h].key.dptr;
			}
			hash[h].user = talloc_realloc(hash, hash[h].user,
						     struct key_user,
						     hash[h].num_users+1);

			/* If it's in a transaction, it's the transaction which
			 * matters from an analysis POV. */
			if (in_transaction(op[i], j)
			    || in_chainlock(op[i], j)) {
				unsigned start = op[i][j].group_start;

				/* Don't include twice. */
				if (hash[h].num_users
				    && hash[h].user[hash[h].num_users-1].file
					== i
				    && hash[h].user[hash[h].num_users-1].op_num
					== start)
					continue;

				hash[h].user[hash[h].num_users].op_num = start;
			} else
				hash[h].user[hash[h].num_users].op_num = j;
			hash[h].user[hash[h].num_users].file = i;
			hash[h].num_users++;
		}
	}

	return hash;
}

static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
		      const struct op *op)
{
	const TDB_DATA *need = NULL;

	if (starts_transaction(op) || starts_chainlock(op)) {
		unsigned int i;

		/* Look through for an op in this transaction which
		 * needs this key. */
		for (i = 1; i < op->group_len; i++) {
			if (key_eq(op[i].key, *key)) {
				need = needs(&op[i]);
				/* tdb_exists() is special: there might be
				 * something in the transaction with more
				 * specific requirements.  Other ops don't have
				 * specific requirements (eg. store or delete),
				 * but they change the value so we can't get
				 * more information from future ops. */
				if (op[i].op != OP_TDB_EXISTS)
					break;
			}
		}
	} else
		need = needs(op);

	/* Don't need anything?  Cool. */
	if (!need)
		return true;

	/* This should be tdb_null or a real value. */
	assert(data != &must_exist);
	assert(data != &must_not_exist);
	assert(data != &not_exists_or_empty);

	/* Must not exist?  data must not exist. */
	if (need == &must_not_exist)
		return data == &tdb_null;

	/* Must exist? */
	if (need == &must_exist)
		return data != &tdb_null;

	/* Either noexist or empty. */
	if (need == &not_exists_or_empty)
		return data->dsize == 0;

	/* Needs something specific. */
	return key_eq(*data, *need);
}

static void move_to_front(struct key_user res[], unsigned off, unsigned elem)
{
	if (elem != off) {
		struct key_user tmp = res[elem];
		memmove(res + off + 1, res + off, (elem - off)*sizeof(res[0]));
		res[off] = tmp;
	}
}

static void restore_to_pos(struct key_user res[], unsigned off, unsigned elem)
{
	if (elem != off) {
		struct key_user tmp = res[off];
		memmove(res + off, res + off + 1, (elem - off)*sizeof(res[0]));
		res[elem] = tmp;
	}
}

static bool sort_deps(char *filename[], struct op *op[],
		      struct key_user res[],
		      unsigned off, unsigned num,
		      const TDB_DATA *key, const TDB_DATA *data,
		      unsigned num_files, unsigned fuzz)
{
	unsigned int i, files_done;
	struct op *this_op;
	bool done[num_files];

	/* None left?  We're sorted. */
	if (off == num)
		return true;

	/* Does this make serial numbers go backwards?  Allow a little fuzz. */
	if (off > 0) {
		int serial1 = op[res[off-1].file][res[off-1].op_num].serial;
		int serial2 = op[res[off].file][res[off].op_num].serial;

		if (serial1 - serial2 > (int)fuzz) {
#if DEBUG_DEPS
			printf("Serial jump too far (%u -> %u)\n",
			       serial1, serial2);
#endif
			return false;
		}
	}

	memset(done, 0, sizeof(done));

	/* Since ops within a trace file are ordered, we just need to figure
	 * out which file to try next.  Since we don't take into account
	 * inter-key relationships (which exist by virtue of trace file order),
	 * we minimize the chance of harm by trying to keep in serial order. */
	for (files_done = 0, i = off; i < num && files_done < num_files; i++) {
		if (done[res[i].file])
			continue;

		this_op = &op[res[i].file][res[i].op_num];

		/* Is what we have good enough for this op? */
		if (satisfies(key, data, this_op)) {
			move_to_front(res, off, i);
			if (sort_deps(filename, op, res, off+1, num,
				      key, gives(key, data, this_op),
				      num_files, fuzz))
				return true;
			restore_to_pos(res, off, i);
		}
		done[res[i].file] = true;
		files_done++;
	}

	/* No combination worked. */
	return false;
}

static void check_dep_sorting(struct key_user user[], unsigned num_users,
			      unsigned num_files)
{
#if DEBUG_DEPS
	unsigned int i;
	unsigned minima[num_files];

	memset(minima, 0, sizeof(minima));
	for (i = 0; i < num_users; i++) {
		assert(minima[user[i].file] < user[i].op_num);
		minima[user[i].file] = user[i].op_num;
	}
#endif
}

/* All these ops happen on the same key.  Which comes first?
 *
 * This can happen both because read ops or failed write ops don't
 * change serial number, and also due to race since we access the
 * number unlocked (the race can cause less detectable ordering problems,
 * in which case we'll deadlock and report: fix manually in that case).
 */
static void figure_deps(char *filename[], struct op *op[],
			const TDB_DATA *key, struct key_user user[],
			unsigned num_users, unsigned num_files)
{
	/* We assume database starts empty. */
	const struct TDB_DATA *data = &tdb_null;
	unsigned int fuzz;

	/* We prefer to keep strict serial order if possible: it's the
	 * most likely.  We get more lax if that fails. */
	for (fuzz = 0; fuzz < 100; fuzz = (fuzz + 1)*2) {
		if (sort_deps(filename, op, user, 0, num_users, key, data,
			      num_files, fuzz))
			break;
	}

	if (fuzz >= 100)
		fail(filename[user[0].file], user[0].op_num+1,
		     "Could not resolve inter-dependencies");

	check_dep_sorting(user, num_users, num_files);
}

static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
		     unsigned int num)
{
	unsigned int h;

	/* Gcc nexted function extension.  How cool is this? */
	int compare_serial(const void *_a, const void *_b)
	{
		const struct key_user *a = _a, *b = _b;

		/* First, maintain order within any trace file. */
		if (a->file == b->file)
			return a->op_num - b->op_num;

		/* Otherwise, arrange by serial order. */
		if (op[a->file][a->op_num].serial !=
		    op[b->file][b->op_num].serial)
			return op[a->file][a->op_num].serial
				- op[b->file][b->op_num].serial;

		/* Cancelled transactions are assumed to happen first. */
		if (starts_transaction(&op[a->file][a->op_num])
		    && !successful_transaction(&op[a->file][a->op_num]))
			return -1;
		if (starts_transaction(&op[b->file][b->op_num])
		    && !successful_transaction(&op[b->file][b->op_num]))
			return 1;

		/* No idea. */
		return 0;
	}

	/* Now sort into serial order. */
	for (h = 0; h < total_keys * 2; h++) {
		struct key_user *user = hash[h].user;

		qsort(user, hash[h].num_users, sizeof(user[0]), compare_serial);
		figure_deps(filename, op, &hash[h].key, user, hash[h].num_users,
			    num);
	}
}

static int destroy_depend(struct depend *dep)
{
	list_del(&dep->pre_list);
	list_del(&dep->post_list);
	return 0;
}

static void add_dependency(void *ctx,
			   struct op *op[],
			   char *filename[],
			   unsigned int needs_file,
			   unsigned int needs_opnum,
			   unsigned int satisfies_file,
			   unsigned int satisfies_opnum)
{
	struct depend *dep;

	/* We don't depend on ourselves. */
	if (needs_file == satisfies_file) {
		assert(satisfies_opnum < needs_opnum);
		return;
	}

#if DEBUG_DEPS
	printf("%s:%u: depends on %s:%u\n",
	       filename[needs_file], needs_opnum+1,
	       filename[satisfies_file], satisfies_opnum+1);
#endif

#if TRAVERSALS_TAKE_TRANSACTION_LOCK
	/* If something in a traverse depends on something in another
	 * traverse/transaction, it creates a dependency between the
	 * two groups. */
	if ((in_traverse(op[satisfies_file], satisfies_opnum)
	     && (starts_transaction(&op[needs_file][needs_opnum])
		 || starts_traverse(&op[needs_file][needs_opnum])))
	    || (in_traverse(op[needs_file], needs_opnum)
		&& (starts_transaction(&op[satisfies_file][satisfies_opnum])
		    || starts_traverse(&op[satisfies_file][satisfies_opnum])))){
		unsigned int sat;

		/* We are satisfied by end of group. */
		sat = op[satisfies_file][satisfies_opnum].group_start;
		satisfies_opnum = sat + op[satisfies_file][sat].group_len;
		/* And we need that done by start of our group. */
		needs_opnum = op[needs_file][needs_opnum].group_start;
	}

	/* There is also this case:
	 *  <traverse> <read foo> ...
	 *  <transaction> ... </transaction> <create foo>
	 * Where if we start the traverse then wait, we could block
	 * the transaction and deadlock.
	 *
	 * We try to address this by ensuring that where seqnum indicates it's
	 * possible, we wait for <create foo> before *starting* traverse.
	 */
	else if (in_traverse(op[needs_file], needs_opnum)) {
		struct op *need = &op[needs_file][needs_opnum];
		if (op[needs_file][need->group_start].serial >
		    op[satisfies_file][satisfies_opnum].serial) {
			needs_opnum = need->group_start;
		}
	}
#endif

 	/* If you depend on a transaction or chainlock, you actually
	 * depend on it ending. */
 	if (starts_transaction(&op[satisfies_file][satisfies_opnum])
	    || starts_chainlock(&op[satisfies_file][satisfies_opnum])) {
 		satisfies_opnum
 			+= op[satisfies_file][satisfies_opnum].group_len;
#if DEBUG_DEPS
		printf("-> Actually end of transaction %s:%u\n",
		       filename[satisfies_file], satisfies_opnum+1);
#endif
 	} else
		/* We should never create a dependency from middle of
		 * a transaction. */
 		assert(!in_transaction(op[satisfies_file], satisfies_opnum)
		       || op[satisfies_file][satisfies_opnum].op
 		       == OP_TDB_TRANSACTION_COMMIT
 		       || op[satisfies_file][satisfies_opnum].op
 		       == OP_TDB_TRANSACTION_CANCEL);

	assert(op[needs_file][needs_opnum].op != OP_TDB_TRAVERSE);
	assert(op[satisfies_file][satisfies_opnum].op != OP_TDB_TRAVERSE);

	dep = talloc(ctx, struct depend);
	dep->needs_file = needs_file;
	dep->needs_opnum = needs_opnum;
	dep->satisfies_file = satisfies_file;
	dep->satisfies_opnum = satisfies_opnum;
	list_add(&op[satisfies_file][satisfies_opnum].post, &dep->post_list);
	list_add(&op[needs_file][needs_opnum].pre, &dep->pre_list);
	talloc_set_destructor(dep, destroy_depend);
}

static bool changes_db(const TDB_DATA *key, const struct op *op)
{
	return gives(key, NULL, op) != NULL;
}

static void depend_on_previous(struct op *op[],
			       char *filename[],
			       unsigned int num,
			       struct key_user user[],
			       unsigned int i,
			       int prev)
{
	bool deps[num];
	int j;

	if (i == 0)
		return;

	if (prev == i - 1) {
		/* Just depend on previous. */
		add_dependency(NULL, op, filename,
			       user[i].file, user[i].op_num,
			       user[prev].file, user[prev].op_num);
		return;
	}

	/* We have to wait for the readers.  Find last one in *each* file. */
	memset(deps, 0, sizeof(deps));
	deps[user[i].file] = true;
	for (j = i - 1; j > prev; j--) {
		if (!deps[user[j].file]) {
			add_dependency(NULL, op, filename,
				       user[i].file, user[i].op_num,
				       user[j].file, user[j].op_num);
			deps[user[j].file] = true;
		}
	}
}

/* This is simple, but not complete.  We don't take into account
 * indirect dependencies. */
static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
				  unsigned int num)
{
	unsigned int i, j;

	/* There can only be one real dependency on each file */
	for (i = 0; i < num; i++) {
		for (j = 1; j < num_ops[i]; j++) {
			struct depend *dep, *next;
			struct depend *prev[num];

			memset(prev, 0, sizeof(prev));

			list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
				if (!prev[dep->satisfies_file]) {
					prev[dep->satisfies_file] = dep;
					continue;
				}
				if (prev[dep->satisfies_file]->satisfies_opnum
				    < dep->satisfies_opnum) {
					talloc_free(prev[dep->satisfies_file]);
					prev[dep->satisfies_file] = dep;
				} else
					talloc_free(dep);
			}
		}
	}

	for (i = 0; i < num; i++) {
		int deps[num];

		for (j = 0; j < num; j++)
			deps[j] = -1;

		for (j = 1; j < num_ops[i]; j++) {
			struct depend *dep, *next;

			list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
				if (deps[dep->satisfies_file]
				    >= (int)dep->satisfies_opnum)
					talloc_free(dep);
				else
					deps[dep->satisfies_file]
						= dep->satisfies_opnum;
			}
		}
	}
}

#if TRAVERSALS_TAKE_TRANSACTION_LOCK
struct traverse_dep {
	unsigned int file;
	unsigned int op_num;
};

/* Force an order among the traversals, so they don't deadlock (as much) */
static void make_traverse_depends(char *filename[],
				  struct op *op[], unsigned int num_ops[],
				  unsigned int num)
{
	unsigned int i, num_traversals = 0;
	int j;
	struct traverse_dep *dep;

	/* Sort by which one runs first. */
	int compare_traverse_dep(const void *_a, const void *_b)
	{
		const struct traverse_dep *ta = _a, *tb = _b;
		const struct op *a = &op[ta->file][ta->op_num],
			*b = &op[tb->file][tb->op_num];

		if (a->serial != b->serial)
			return a->serial - b->serial;

		/* If they have same serial, it means one didn't make any
		 * changes.  Thus sort by end in that case. */
		return a[a->group_len].serial - b[b->group_len].serial;
	}

	dep = talloc_array(NULL, struct traverse_dep, 1);

	/* Count them. */
	for (i = 0; i < num; i++) {
		for (j = 1; j < num_ops[i]; j++) {
 			/* Traverse start (ignore those in
			 * transactions; they're already covered by
			 * transaction dependencies). */
			if (starts_traverse(&op[i][j])
			    && !in_transaction(op[i], j)) {
				dep = talloc_realloc(NULL, dep,
						     struct traverse_dep,
						     num_traversals+1);
				dep[num_traversals].file = i;
				dep[num_traversals].op_num = j;
				num_traversals++;
			}
		}
	}
	qsort(dep, num_traversals, sizeof(dep[0]), compare_traverse_dep);

	for (i = 1; i < num_traversals; i++) {
		const struct op *prev = &op[dep[i-1].file][dep[i-1].op_num];
		const struct op *curr = &op[dep[i].file][dep[i].op_num];

		/* Read traverses don't depend on each other (read lock). */
		if (prev->op == OP_TDB_TRAVERSE_READ_START
		    && curr->op == OP_TDB_TRAVERSE_READ_START)
			continue;

		/* Only make dependency if it's clear. */
		if (compare_traverse_dep(&dep[i], &dep[i-1])) {
			/* i depends on end of traverse i-1. */
			add_dependency(NULL, op, filename,
				       dep[i].file, dep[i].op_num,
				       dep[i-1].file, dep[i-1].op_num
				       + prev->group_len);
		}
	}
	talloc_free(dep);
}
#endif

static void derive_dependencies(char *filename[],
				struct op *op[], unsigned int num_ops[],
				unsigned int num)
{
	struct keyinfo *hash;
	unsigned int h, i;

	/* Create hash table for faster key lookup. */
	hash = hash_ops(op, num_ops, num);

	/* Sort them by serial number. */
	sort_ops(hash, filename, op, num);

	/* Create dependencies back to the last change, rather than
	 * creating false dependencies by naively making each one
	 * depend on the previous.  This has two purposes: it makes
	 * later optimization simpler, and it also avoids deadlock with
	 * same sequence number ops inside traversals (if one
	 * traversal doesn't write anything, two ops can have the same
	 * sequence number yet we can create a traversal dependency
	 * the other way). */
	for (h = 0; h < total_keys * 2; h++) {
		int prev = -1;

		if (hash[h].num_users < 2)
			continue;

		for (i = 0; i < hash[h].num_users; i++) {
			if (changes_db(&hash[h].key, &op[hash[h].user[i].file]
				       [hash[h].user[i].op_num])) {
				depend_on_previous(op, filename, num,
						   hash[h].user, i, prev);
				prev = i;
			} else if (prev >= 0)
				add_dependency(hash, op, filename,
					       hash[h].user[i].file,
					       hash[h].user[i].op_num,
					       hash[h].user[prev].file,
					       hash[h].user[prev].op_num);
		}
	}

#if TRAVERSALS_TAKE_TRANSACTION_LOCK
	make_traverse_depends(filename, op, num_ops, num);
#endif

	optimize_dependencies(op, num_ops, num);
}

int main(int argc, char *argv[])
{
	struct timeval start, end;
	unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
	struct op *op[argc];
	int fds[2];
	char c;
	bool ok = true;

	if (argc < 3)
		errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);

	pipes = talloc_array(NULL, struct pipe, argc - 2);
	for (i = 0; i < argc - 2; i++) {
		printf("Loading tracefile %s...", argv[2+i]);
		fflush(stdout);
		op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
				       &tdb_flags[i], &open_flags[i]);
		if (pipe(pipes[i].fd) != 0)
			err(1, "creating pipe");
		printf("done\n");
	}

	printf("Calculating inter-dependencies...");
	fflush(stdout);
	derive_dependencies(argv+2, op, num_ops, i);
	printf("done\n");

	/* Don't fork for single arg case: simple debugging. */
	if (argc == 3) {
		struct tdb_context *tdb;
		tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0]|TDB_NOSYNC,
				  open_flags[0], 0600, NULL, hash_key);
		printf("Single threaded run...");
		fflush(stdout);

		run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
			false);
		check_deps(argv[2], op[0], num_ops[0]);

		printf("done\n");
		exit(0);
	}

	if (pipe(fds) != 0)
		err(1, "creating pipe");

	for (i = 0; i < argc - 2; i++) {
		struct tdb_context *tdb;

		switch (fork()) {
		case -1:
			err(1, "fork failed");
		case 0:
			close(fds[1]);
			tdb = tdb_open_ex(argv[1], hashsize[i],
					  tdb_flags[i]|TDB_NOSYNC,
					  open_flags[i], 0600, NULL, hash_key);
			if (!tdb)
				err(1, "Opening tdb %s", argv[1]);

			/* This catches parent exiting. */
			if (read(fds[0], &c, 1) != 1)
				exit(1);
			run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
				num_ops[i], false);
			check_deps(argv[2+i], op[i], num_ops[i]);
			exit(0);
		default:
			break;
		}
	}

	/* Let everything settle. */
	sleep(1);

	printf("Starting run...");
	fflush(stdout);
	gettimeofday(&start, NULL);
	/* Tell them all to go!  Any write of sufficient length will do. */
	if (write(fds[1], hashsize, i) != i)
		err(1, "Writing to wakeup pipe");

	for (i = 0; i < argc - 2; i++) {
		int status;
		wait(&status);
		if (!WIFEXITED(status)) {
			warnx("Child died with signal %i", WTERMSIG(status));
			ok = false;
		} else if (WEXITSTATUS(status) != 0)
			/* Assume child spat out error. */
			ok = false;
	}
	if (!ok)
		exit(1);

	gettimeofday(&end, NULL);
	printf("done\n");

	end.tv_sec -= start.tv_sec;
	printf("Time replaying: %lu usec\n",
	       end.tv_sec * 1000000UL + (end.tv_usec - start.tv_usec));
	
	exit(0);
}