replay_trace.c 47 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800
  1. #include <ccan/tdb/tdb.h>
  2. #include <ccan/grab_file/grab_file.h>
  3. #include <ccan/hash/hash.h>
  4. #include <ccan/talloc/talloc.h>
  5. #include <ccan/str_talloc/str_talloc.h>
  6. #include <ccan/str/str.h>
  7. #include <ccan/list/list.h>
  8. #include <err.h>
  9. #include <ctype.h>
  10. #include <string.h>
  11. #include <unistd.h>
  12. #include <sys/types.h>
  13. #include <sys/wait.h>
  14. #include <sys/time.h>
  15. #include <errno.h>
  16. #include <signal.h>
  17. #include <assert.h>
  18. #include <fcntl.h>
  19. #define STRINGIFY2(x) #x
  20. #define STRINGIFY(x) STRINGIFY2(x)
  21. /* Avoid mod by zero */
  22. static unsigned int total_keys = 1;
  23. /* #define DEBUG_DEPS 1 */
  24. /* Traversals block transactions in the current implementation. */
  25. #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
  26. struct pipe {
  27. int fd[2];
  28. };
  29. static struct pipe *pipes;
  30. static int backoff_fd = -1;
  31. static void __attribute__((noreturn)) fail(const char *filename,
  32. unsigned int line,
  33. const char *fmt, ...)
  34. {
  35. va_list ap;
  36. va_start(ap, fmt);
  37. fprintf(stderr, "%s:%u: FAIL: ", filename, line);
  38. vfprintf(stderr, fmt, ap);
  39. fprintf(stderr, "\n");
  40. va_end(ap);
  41. exit(1);
  42. }
  43. /* Try or die. */
  44. #define try(expr, expect) \
  45. do { \
  46. int ret = (expr); \
  47. if (ret != (expect)) \
  48. fail(filename[file], i+1, \
  49. STRINGIFY(expr) "= %i", ret); \
  50. } while (0)
  51. /* Try or imitate results. */
  52. #define unreliable(expr, expect, force, undo) \
  53. do { \
  54. int ret = expr; \
  55. if (ret != expect) { \
  56. fprintf(stderr, "%s:%u: %s gave %i not %i", \
  57. filename[file], i+1, STRINGIFY(expr), \
  58. ret, expect); \
  59. if (expect == 0) \
  60. force; \
  61. else \
  62. undo; \
  63. } \
  64. } while (0)
  65. static bool key_eq(TDB_DATA a, TDB_DATA b)
  66. {
  67. if (a.dsize != b.dsize)
  68. return false;
  69. return memcmp(a.dptr, b.dptr, a.dsize) == 0;
  70. }
  71. /* This is based on the hash algorithm from gdbm */
  72. static unsigned int hash_key(TDB_DATA *key)
  73. {
  74. uint32_t value; /* Used to compute the hash value. */
  75. uint32_t i; /* Used to cycle through random values. */
  76. /* Set the initial value from the key size. */
  77. for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
  78. value = (value + (key->dptr[i] << (i*5 % 24)));
  79. return (1103515243 * value + 12345);
  80. }
  81. enum op_type {
  82. OP_TDB_LOCKALL,
  83. OP_TDB_LOCKALL_MARK,
  84. OP_TDB_LOCKALL_UNMARK,
  85. OP_TDB_LOCKALL_NONBLOCK,
  86. OP_TDB_UNLOCKALL,
  87. OP_TDB_LOCKALL_READ,
  88. OP_TDB_LOCKALL_READ_NONBLOCK,
  89. OP_TDB_UNLOCKALL_READ,
  90. OP_TDB_CHAINLOCK,
  91. OP_TDB_CHAINLOCK_NONBLOCK,
  92. OP_TDB_CHAINLOCK_MARK,
  93. OP_TDB_CHAINLOCK_UNMARK,
  94. OP_TDB_CHAINUNLOCK,
  95. OP_TDB_CHAINLOCK_READ,
  96. OP_TDB_CHAINUNLOCK_READ,
  97. OP_TDB_PARSE_RECORD,
  98. OP_TDB_EXISTS,
  99. OP_TDB_STORE,
  100. OP_TDB_APPEND,
  101. OP_TDB_GET_SEQNUM,
  102. OP_TDB_WIPE_ALL,
  103. OP_TDB_TRANSACTION_START,
  104. OP_TDB_TRANSACTION_CANCEL,
  105. OP_TDB_TRANSACTION_PREPARE_COMMIT,
  106. OP_TDB_TRANSACTION_COMMIT,
  107. OP_TDB_TRAVERSE_READ_START,
  108. OP_TDB_TRAVERSE_START,
  109. OP_TDB_TRAVERSE_END,
  110. OP_TDB_TRAVERSE,
  111. OP_TDB_TRAVERSE_END_EARLY,
  112. OP_TDB_FIRSTKEY,
  113. OP_TDB_NEXTKEY,
  114. OP_TDB_FETCH,
  115. OP_TDB_DELETE,
  116. };
  117. struct op {
  118. unsigned int seqnum;
  119. enum op_type type;
  120. TDB_DATA key;
  121. TDB_DATA data;
  122. int ret;
  123. /* Who is waiting for us? */
  124. struct list_head post;
  125. /* What are we waiting for? */
  126. struct list_head pre;
  127. /* If I'm part of a group (traverse/transaction) where is
  128. * start? (Otherwise, 0) */
  129. unsigned int group_start;
  130. union {
  131. int flag; /* open and store */
  132. struct { /* append */
  133. TDB_DATA pre;
  134. TDB_DATA post;
  135. } append;
  136. /* transaction/traverse start/chainlock */
  137. unsigned int group_len;
  138. };
  139. };
  140. struct op_desc {
  141. unsigned int file;
  142. unsigned int op_num;
  143. };
  144. static unsigned char hex_char(const char *filename, unsigned int line, char c)
  145. {
  146. c = toupper(c);
  147. if (c >= 'A' && c <= 'F')
  148. return c - 'A' + 10;
  149. if (c >= '0' && c <= '9')
  150. return c - '0';
  151. fail(filename, line, "invalid hex character '%c'", c);
  152. }
  153. /* TDB data is <size>:<%02x>* */
  154. static TDB_DATA make_tdb_data(const void *ctx,
  155. const char *filename, unsigned int line,
  156. const char *word)
  157. {
  158. TDB_DATA data;
  159. unsigned int i;
  160. const char *p;
  161. if (streq(word, "NULL"))
  162. return tdb_null;
  163. data.dsize = atoi(word);
  164. data.dptr = talloc_array(ctx, unsigned char, data.dsize);
  165. p = strchr(word, ':');
  166. if (!p)
  167. fail(filename, line, "invalid tdb data '%s'", word);
  168. p++;
  169. for (i = 0; i < data.dsize; i++)
  170. data.dptr[i] = hex_char(filename, line, p[i*2])*16
  171. + hex_char(filename, line, p[i*2+1]);
  172. return data;
  173. }
  174. static void add_op(const char *filename, struct op **op, unsigned int i,
  175. unsigned int seqnum, enum op_type type)
  176. {
  177. struct op *new;
  178. *op = talloc_realloc(NULL, *op, struct op, i+1);
  179. new = (*op) + i;
  180. new->type = type;
  181. new->seqnum = seqnum;
  182. new->ret = 0;
  183. new->group_start = 0;
  184. }
  185. static void op_add_nothing(const char *filename,
  186. struct op op[], unsigned int op_num, char *words[])
  187. {
  188. if (words[2])
  189. fail(filename, op_num+1, "Expected no arguments");
  190. op[op_num].key = tdb_null;
  191. }
  192. static void op_add_key(const char *filename,
  193. struct op op[], unsigned int op_num, char *words[])
  194. {
  195. if (words[2] == NULL || words[3])
  196. fail(filename, op_num+1, "Expected just a key");
  197. op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
  198. total_keys++;
  199. }
  200. static void op_add_key_ret(const char *filename,
  201. struct op op[], unsigned int op_num, char *words[])
  202. {
  203. if (!words[2] || !words[3] || !words[4] || words[5]
  204. || !streq(words[3], "="))
  205. fail(filename, op_num+1, "Expected <key> = <ret>");
  206. op[op_num].ret = atoi(words[4]);
  207. op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
  208. /* May only be a unique key if it fails */
  209. if (op[op_num].ret != 0)
  210. total_keys++;
  211. }
  212. static void op_add_key_data(const char *filename,
  213. struct op op[], unsigned int op_num, char *words[])
  214. {
  215. if (!words[2] || !words[3] || !words[4] || words[5]
  216. || !streq(words[3], "="))
  217. fail(filename, op_num+1, "Expected <key> = <data>");
  218. op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
  219. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
  220. /* May only be a unique key if it fails */
  221. if (!op[op_num].data.dptr)
  222. total_keys++;
  223. }
  224. /* We don't record the keys or data for a traverse, as we don't use them. */
  225. static void op_add_traverse(const char *filename,
  226. struct op op[], unsigned int op_num, char *words[])
  227. {
  228. if (!words[2] || !words[3] || !words[4] || words[5]
  229. || !streq(words[3], "="))
  230. fail(filename, op_num+1, "Expected <key> = <data>");
  231. op[op_num].key = tdb_null;
  232. }
  233. /* Full traverse info is useful for debugging, but changing it to
  234. * "traversefn" without the data makes the traces *much* smaller! */
  235. static void op_add_traversefn(const char *filename,
  236. struct op op[], unsigned int op_num, char *words[])
  237. {
  238. if (words[2])
  239. fail(filename, op_num+1, "Expected no values");
  240. op[op_num].key = tdb_null;
  241. }
  242. /* <seqnum> tdb_store <rec> <rec> <flag> = <ret> */
  243. static void op_add_store(const char *filename,
  244. struct op op[], unsigned int op_num, char *words[])
  245. {
  246. if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
  247. || words[7] || !streq(words[5], "="))
  248. fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
  249. op[op_num].flag = strtoul(words[4], NULL, 0);
  250. op[op_num].ret = atoi(words[6]);
  251. op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
  252. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
  253. total_keys++;
  254. }
  255. /* <seqnum> tdb_append <rec> <rec> = <rec> */
  256. static void op_add_append(const char *filename,
  257. struct op op[], unsigned int op_num, char *words[])
  258. {
  259. if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
  260. || !streq(words[4], "="))
  261. fail(filename, op_num+1, "Expect <key> <data> = <rec>");
  262. op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
  263. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
  264. op[op_num].append.post
  265. = make_tdb_data(op, filename, op_num+1, words[5]);
  266. /* By subtraction, figure out what previous data was. */
  267. op[op_num].append.pre.dptr = op[op_num].append.post.dptr;
  268. op[op_num].append.pre.dsize
  269. = op[op_num].append.post.dsize - op[op_num].data.dsize;
  270. total_keys++;
  271. }
  272. /* <seqnum> tdb_get_seqnum = <ret> */
  273. static void op_add_seqnum(const char *filename,
  274. struct op op[], unsigned int op_num, char *words[])
  275. {
  276. if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
  277. fail(filename, op_num+1, "Expect = <ret>");
  278. op[op_num].key = tdb_null;
  279. op[op_num].ret = atoi(words[3]);
  280. }
  281. static void op_add_traverse_start(const char *filename,
  282. struct op op[],
  283. unsigned int op_num, char *words[])
  284. {
  285. if (words[2])
  286. fail(filename, op_num+1, "Expect no arguments");
  287. op[op_num].key = tdb_null;
  288. op[op_num].group_len = 0;
  289. }
  290. static void op_add_transaction(const char *filename, struct op op[],
  291. unsigned int op_num, char *words[])
  292. {
  293. if (words[2])
  294. fail(filename, op_num+1, "Expect no arguments");
  295. op[op_num].key = tdb_null;
  296. op[op_num].group_len = 0;
  297. }
  298. static void op_add_chainlock(const char *filename,
  299. struct op op[], unsigned int op_num, char *words[])
  300. {
  301. if (words[2] == NULL || words[3])
  302. fail(filename, op_num+1, "Expected just a key");
  303. /* A chainlock key isn't a key in the normal sense; it doesn't
  304. * have to be in the db at all. Also, we don't want to hash this op. */
  305. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
  306. op[op_num].key = tdb_null;
  307. op[op_num].group_len = 0;
  308. }
  309. static void op_add_chainlock_ret(const char *filename,
  310. struct op op[], unsigned int op_num,
  311. char *words[])
  312. {
  313. if (!words[2] || !words[3] || !words[4] || words[5]
  314. || !streq(words[3], "="))
  315. fail(filename, op_num+1, "Expected <key> = <ret>");
  316. op[op_num].ret = atoi(words[4]);
  317. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
  318. op[op_num].key = tdb_null;
  319. op[op_num].group_len = 0;
  320. total_keys++;
  321. }
  322. static int op_find_start(struct op op[], unsigned int op_num, enum op_type type)
  323. {
  324. unsigned int i;
  325. for (i = op_num-1; i > 0; i--) {
  326. if (op[i].type == type && !op[i].group_len)
  327. return i;
  328. }
  329. return 0;
  330. }
  331. static void op_analyze_transaction(const char *filename,
  332. struct op op[], unsigned int op_num,
  333. char *words[])
  334. {
  335. unsigned int start, i;
  336. op[op_num].key = tdb_null;
  337. if (words[2])
  338. fail(filename, op_num+1, "Expect no arguments");
  339. start = op_find_start(op, op_num, OP_TDB_TRANSACTION_START);
  340. if (!start)
  341. fail(filename, op_num+1, "no transaction start found");
  342. op[start].group_len = op_num - start;
  343. /* This rolls in nested transactions. I think that's right. */
  344. for (i = start; i <= op_num; i++)
  345. op[i].group_start = start;
  346. }
  347. /* We treat chainlocks a lot like transactions, even though that's overkill */
  348. static void op_analyze_chainlock(const char *filename,
  349. struct op op[], unsigned int op_num,
  350. char *words[])
  351. {
  352. unsigned int i, start;
  353. if (words[2] == NULL || words[3])
  354. fail(filename, op_num+1, "Expected just a key");
  355. op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
  356. op[op_num].key = tdb_null;
  357. total_keys++;
  358. start = op_find_start(op, op_num, OP_TDB_CHAINLOCK);
  359. if (!start)
  360. start = op_find_start(op, op_num, OP_TDB_CHAINLOCK_READ);
  361. if (!start)
  362. fail(filename, op_num+1, "no initial chainlock found");
  363. /* FIXME: We'd have to do something clever to make this work
  364. * vs. deadlock. */
  365. if (!key_eq(op[start].data, op[op_num].data))
  366. fail(filename, op_num+1, "nested chainlock calls?");
  367. op[start].group_len = op_num - start;
  368. for (i = start; i <= op_num; i++)
  369. op[i].group_start = start;
  370. }
  371. static void op_analyze_traverse(const char *filename,
  372. struct op op[], unsigned int op_num,
  373. char *words[])
  374. {
  375. int i, start;
  376. op[op_num].key = tdb_null;
  377. /* = %u means traverse function terminated. */
  378. if (words[2]) {
  379. if (!streq(words[2], "=") || !words[3] || words[4])
  380. fail(filename, op_num+1, "expect = <num>");
  381. op[op_num].ret = atoi(words[3]);
  382. } else
  383. op[op_num].ret = 0;
  384. start = op_find_start(op, op_num, OP_TDB_TRAVERSE_START);
  385. if (!start)
  386. start = op_find_start(op, op_num, OP_TDB_TRAVERSE_READ_START);
  387. if (!start)
  388. fail(filename, op_num+1, "no traversal start found");
  389. op[start].group_len = op_num - start;
  390. /* Don't roll in nested traverse/chainlock */
  391. for (i = start; i <= op_num; i++)
  392. if (!op[i].group_start)
  393. op[i].group_start = start;
  394. }
  395. /* Keep -Wmissing-declarations happy: */
  396. const struct op_table *
  397. find_keyword (register const char *str, register unsigned int len);
  398. #include "keywords.c"
  399. struct depend {
  400. /* We can have more than one */
  401. struct list_node pre_list;
  402. struct list_node post_list;
  403. struct op_desc needs;
  404. struct op_desc prereq;
  405. };
  406. static void check_deps(const char *filename, struct op op[], unsigned int num)
  407. {
  408. #ifdef DEBUG_DEPS
  409. unsigned int i;
  410. for (i = 1; i < num; i++)
  411. if (!list_empty(&op[i].pre))
  412. fail(filename, i+1, "Still has dependencies");
  413. #endif
  414. }
  415. static void dump_pre(char *filename[], struct op *op[],
  416. unsigned int file, unsigned int i)
  417. {
  418. struct depend *dep;
  419. printf("%s:%u (%u) still waiting for:\n", filename[file], i+1,
  420. op[file][i].seqnum);
  421. list_for_each(&op[file][i].pre, dep, pre_list)
  422. printf(" %s:%u (%u)\n",
  423. filename[dep->prereq.file], dep->prereq.op_num+1,
  424. op[dep->prereq.file][dep->prereq.op_num].seqnum);
  425. check_deps(filename[file], op[file], i);
  426. }
  427. /* We simply read/write pointers, since we all are children. */
  428. static bool do_pre(struct tdb_context *tdb,
  429. char *filename[], struct op *op[],
  430. unsigned int file, int pre_fd, unsigned int i,
  431. bool backoff)
  432. {
  433. while (!list_empty(&op[file][i].pre)) {
  434. struct depend *dep;
  435. #if DEBUG_DEPS
  436. printf("%s:%u:waiting for pre\n", filename[file], i+1);
  437. fflush(stdout);
  438. #endif
  439. if (backoff)
  440. alarm(2);
  441. else
  442. alarm(10);
  443. while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
  444. if (errno == EINTR) {
  445. if (backoff) {
  446. struct op_desc desc = { file,i };
  447. warnx("%s:%u:avoiding deadlock",
  448. filename[file], i+1);
  449. if (write(backoff_fd, &desc,
  450. sizeof(desc)) != sizeof(desc))
  451. err(1, "writing backoff_fd");
  452. return false;
  453. }
  454. dump_pre(filename, op, file, i);
  455. exit(1);
  456. } else
  457. errx(1, "Reading from pipe");
  458. }
  459. alarm(0);
  460. #if DEBUG_DEPS
  461. printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
  462. dep->needs.op_num+1, filename[dep->prereq.file],
  463. dep->prereq.op_num+1);
  464. fflush(stdout);
  465. #endif
  466. /* This could be any op, not just this one. */
  467. talloc_free(dep);
  468. }
  469. return true;
  470. }
  471. static void do_post(char *filename[], struct op *op[],
  472. unsigned int file, unsigned int i)
  473. {
  474. struct depend *dep;
  475. list_for_each(&op[file][i].post, dep, post_list) {
  476. #if DEBUG_DEPS
  477. printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
  478. filename[dep->needs.file], dep->needs.op_num+1);
  479. #endif
  480. if (write(pipes[dep->needs.file].fd[1], &dep, sizeof(dep))
  481. != sizeof(dep))
  482. err(1, "%s:%u failed to tell file %s",
  483. filename[file], i+1, filename[dep->needs.file]);
  484. }
  485. }
  486. static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
  487. {
  488. return data.dsize;
  489. }
  490. static unsigned run_ops(struct tdb_context *tdb,
  491. int pre_fd,
  492. char *filename[],
  493. struct op *op[],
  494. unsigned int file,
  495. unsigned int start, unsigned int stop,
  496. bool backoff);
  497. struct traverse_info {
  498. struct op **op;
  499. char **filename;
  500. unsigned file;
  501. int pre_fd;
  502. unsigned int start;
  503. unsigned int i;
  504. };
  505. /* More complex. Just do whatever's they did at the n'th entry. */
  506. static int nontrivial_traverse(struct tdb_context *tdb,
  507. TDB_DATA key, TDB_DATA data,
  508. void *_tinfo)
  509. {
  510. struct traverse_info *tinfo = _tinfo;
  511. unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
  512. bool avoid_deadlock = false;
  513. if (tinfo->i == tinfo->start + trav_len) {
  514. /* This can happen if traverse expects to be empty. */
  515. if (trav_len == 1)
  516. return 1;
  517. fail(tinfo->filename[tinfo->file], tinfo->start + 1,
  518. "traverse did not terminate");
  519. }
  520. if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
  521. fail(tinfo->filename[tinfo->file], tinfo->start + 1,
  522. "%s:%u:traverse terminated early");
  523. #if TRAVERSALS_TAKE_TRANSACTION_LOCK
  524. avoid_deadlock = true;
  525. #endif
  526. /* Run any normal ops. */
  527. tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
  528. tinfo->file, tinfo->i+1, tinfo->start + trav_len,
  529. avoid_deadlock);
  530. /* We backed off, or we hit OP_TDB_TRAVERSE_END/EARLY. */
  531. if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
  532. return 1;
  533. return 0;
  534. }
  535. static unsigned op_traverse(struct tdb_context *tdb,
  536. int pre_fd,
  537. char *filename[],
  538. unsigned int file,
  539. int (*traversefn)(struct tdb_context *,
  540. tdb_traverse_func, void *),
  541. struct op *op[],
  542. unsigned int start)
  543. {
  544. struct traverse_info tinfo = { op, filename, file, pre_fd,
  545. start, start+1 };
  546. traversefn(tdb, nontrivial_traverse, &tinfo);
  547. /* Traversing in wrong order can have strange effects: eg. if
  548. * original traverse went A (delete A), B, we might do B
  549. * (delete A). So if we have ops left over, we do it now. */
  550. while (tinfo.i != start + op[file][start].group_len) {
  551. if (op[file][tinfo.i].type == OP_TDB_TRAVERSE
  552. || op[file][tinfo.i].type == OP_TDB_TRAVERSE_END_EARLY)
  553. tinfo.i++;
  554. else
  555. tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
  556. tinfo.i,
  557. start + op[file][start].group_len,
  558. false);
  559. }
  560. return tinfo.i;
  561. }
  562. static void break_out(int sig)
  563. {
  564. }
  565. static __attribute__((noinline))
  566. unsigned run_ops(struct tdb_context *tdb,
  567. int pre_fd,
  568. char *filename[],
  569. struct op *op[],
  570. unsigned int file,
  571. unsigned int start, unsigned int stop,
  572. bool backoff)
  573. {
  574. unsigned int i;
  575. struct sigaction sa;
  576. sa.sa_handler = break_out;
  577. sa.sa_flags = 0;
  578. sigaction(SIGALRM, &sa, NULL);
  579. for (i = start; i < stop; i++) {
  580. if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
  581. return i;
  582. switch (op[file][i].type) {
  583. case OP_TDB_LOCKALL:
  584. try(tdb_lockall(tdb), op[file][i].ret);
  585. break;
  586. case OP_TDB_LOCKALL_MARK:
  587. try(tdb_lockall_mark(tdb), op[file][i].ret);
  588. break;
  589. case OP_TDB_LOCKALL_UNMARK:
  590. try(tdb_lockall_unmark(tdb), op[file][i].ret);
  591. break;
  592. case OP_TDB_LOCKALL_NONBLOCK:
  593. unreliable(tdb_lockall_nonblock(tdb), op[file][i].ret,
  594. tdb_lockall(tdb), tdb_unlockall(tdb));
  595. break;
  596. case OP_TDB_UNLOCKALL:
  597. try(tdb_unlockall(tdb), op[file][i].ret);
  598. break;
  599. case OP_TDB_LOCKALL_READ:
  600. try(tdb_lockall_read(tdb), op[file][i].ret);
  601. break;
  602. case OP_TDB_LOCKALL_READ_NONBLOCK:
  603. unreliable(tdb_lockall_read_nonblock(tdb),
  604. op[file][i].ret,
  605. tdb_lockall_read(tdb),
  606. tdb_unlockall_read(tdb));
  607. break;
  608. case OP_TDB_UNLOCKALL_READ:
  609. try(tdb_unlockall_read(tdb), op[file][i].ret);
  610. break;
  611. case OP_TDB_CHAINLOCK:
  612. try(tdb_chainlock(tdb, op[file][i].key),
  613. op[file][i].ret);
  614. break;
  615. case OP_TDB_CHAINLOCK_NONBLOCK:
  616. unreliable(tdb_chainlock_nonblock(tdb, op[file][i].key),
  617. op[file][i].ret,
  618. tdb_chainlock(tdb, op[file][i].key),
  619. tdb_chainunlock(tdb, op[file][i].key));
  620. break;
  621. case OP_TDB_CHAINLOCK_MARK:
  622. try(tdb_chainlock_mark(tdb, op[file][i].key),
  623. op[file][i].ret);
  624. break;
  625. case OP_TDB_CHAINLOCK_UNMARK:
  626. try(tdb_chainlock_unmark(tdb, op[file][i].key),
  627. op[file][i].ret);
  628. break;
  629. case OP_TDB_CHAINUNLOCK:
  630. try(tdb_chainunlock(tdb, op[file][i].key),
  631. op[file][i].ret);
  632. break;
  633. case OP_TDB_CHAINLOCK_READ:
  634. try(tdb_chainlock_read(tdb, op[file][i].key),
  635. op[file][i].ret);
  636. break;
  637. case OP_TDB_CHAINUNLOCK_READ:
  638. try(tdb_chainunlock_read(tdb, op[file][i].key),
  639. op[file][i].ret);
  640. break;
  641. case OP_TDB_PARSE_RECORD:
  642. try(tdb_parse_record(tdb, op[file][i].key, get_len,
  643. NULL),
  644. op[file][i].ret);
  645. break;
  646. case OP_TDB_EXISTS:
  647. try(tdb_exists(tdb, op[file][i].key), op[file][i].ret);
  648. break;
  649. case OP_TDB_STORE:
  650. try(tdb_store(tdb, op[file][i].key, op[file][i].data,
  651. op[file][i].flag),
  652. op[file][i].ret);
  653. break;
  654. case OP_TDB_APPEND:
  655. try(tdb_append(tdb, op[file][i].key, op[file][i].data),
  656. op[file][i].ret);
  657. break;
  658. case OP_TDB_GET_SEQNUM:
  659. try(tdb_get_seqnum(tdb), op[file][i].ret);
  660. break;
  661. case OP_TDB_WIPE_ALL:
  662. try(tdb_wipe_all(tdb), op[file][i].ret);
  663. break;
  664. case OP_TDB_TRANSACTION_START:
  665. try(tdb_transaction_start(tdb), op[file][i].ret);
  666. break;
  667. case OP_TDB_TRANSACTION_CANCEL:
  668. try(tdb_transaction_cancel(tdb), op[file][i].ret);
  669. break;
  670. case OP_TDB_TRANSACTION_PREPARE_COMMIT:
  671. try(tdb_transaction_prepare_commit(tdb),
  672. op[file][i].ret);
  673. break;
  674. case OP_TDB_TRANSACTION_COMMIT:
  675. try(tdb_transaction_commit(tdb), op[file][i].ret);
  676. break;
  677. case OP_TDB_TRAVERSE_READ_START:
  678. i = op_traverse(tdb, pre_fd, filename, file,
  679. tdb_traverse_read, op, i);
  680. break;
  681. case OP_TDB_TRAVERSE_START:
  682. i = op_traverse(tdb, pre_fd, filename, file,
  683. tdb_traverse, op, i);
  684. break;
  685. case OP_TDB_TRAVERSE:
  686. case OP_TDB_TRAVERSE_END_EARLY:
  687. /* Terminate: we're in a traverse, and we've
  688. * done our ops. */
  689. return i;
  690. case OP_TDB_TRAVERSE_END:
  691. fail(filename[file], i+1, "unexpected end traverse");
  692. /* FIXME: These must be treated like traverse. */
  693. case OP_TDB_FIRSTKEY:
  694. if (!key_eq(tdb_firstkey(tdb), op[file][i].data))
  695. fail(filename[file], i+1, "bad firstkey");
  696. break;
  697. case OP_TDB_NEXTKEY:
  698. if (!key_eq(tdb_nextkey(tdb, op[file][i].key),
  699. op[file][i].data))
  700. fail(filename[file], i+1, "bad nextkey");
  701. break;
  702. case OP_TDB_FETCH: {
  703. TDB_DATA f = tdb_fetch(tdb, op[file][i].key);
  704. if (!key_eq(f, op[file][i].data))
  705. fail(filename[file], i+1, "bad fetch %u",
  706. f.dsize);
  707. break;
  708. }
  709. case OP_TDB_DELETE:
  710. try(tdb_delete(tdb, op[file][i].key), op[file][i].ret);
  711. break;
  712. }
  713. do_post(filename, op, file, i);
  714. }
  715. return i;
  716. }
  717. /* tdbtorture, in particular, can do a tdb_close with a transaction in
  718. * progress. */
  719. static struct op *maybe_cancel_transaction(const char *filename,
  720. struct op *op, unsigned int *num)
  721. {
  722. unsigned int start = op_find_start(op, *num, OP_TDB_TRANSACTION_START);
  723. if (start) {
  724. char *words[] = { "<unknown>", "tdb_close", NULL };
  725. add_op(filename, &op, *num, op[start].seqnum,
  726. OP_TDB_TRANSACTION_CANCEL);
  727. op_analyze_transaction(filename, op, *num, words);
  728. (*num)++;
  729. }
  730. return op;
  731. }
  732. static struct op *load_tracefile(const char *filename, unsigned int *num,
  733. unsigned int *hashsize,
  734. unsigned int *tdb_flags,
  735. unsigned int *open_flags)
  736. {
  737. unsigned int i;
  738. struct op *op = talloc_array(NULL, struct op, 1);
  739. char **words;
  740. char **lines;
  741. char *file;
  742. file = grab_file(NULL, filename, NULL);
  743. if (!file)
  744. err(1, "Reading %s", filename);
  745. lines = strsplit(file, file, "\n", NULL);
  746. if (!lines[0])
  747. errx(1, "%s is empty", filename);
  748. words = strsplit(lines, lines[0], " ", NULL);
  749. if (!streq(words[1], "tdb_open"))
  750. fail(filename, 1, "does not start with tdb_open");
  751. *hashsize = atoi(words[2]);
  752. *tdb_flags = strtoul(words[3], NULL, 0);
  753. *open_flags = strtoul(words[4], NULL, 0);
  754. for (i = 1; lines[i]; i++) {
  755. const struct op_table *opt;
  756. words = strsplit(lines, lines[i], " ", NULL);
  757. if (!words[0] || !words[1])
  758. fail(filename, i+1, "Expected seqnum number and op");
  759. opt = find_keyword(words[1], strlen(words[1]));
  760. if (!opt) {
  761. if (streq(words[1], "tdb_close")) {
  762. if (lines[i+1])
  763. fail(filename, i+2,
  764. "lines after tdb_close");
  765. *num = i;
  766. talloc_free(lines);
  767. return maybe_cancel_transaction(filename,
  768. op, num);
  769. }
  770. fail(filename, i+1, "Unknown operation '%s'", words[1]);
  771. }
  772. add_op(filename, &op, i, atoi(words[0]), opt->type);
  773. opt->enhance_op(filename, op, i, words);
  774. }
  775. fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
  776. filename, i);
  777. talloc_free(lines);
  778. *num = i - 1;
  779. return maybe_cancel_transaction(filename, op, num);
  780. }
  781. /* We remember all the keys we've ever seen, and who has them. */
  782. struct keyinfo {
  783. TDB_DATA key;
  784. unsigned int num_users;
  785. struct op_desc *user;
  786. };
  787. static const TDB_DATA must_not_exist;
  788. static const TDB_DATA must_exist;
  789. static const TDB_DATA not_exists_or_empty;
  790. /* NULL means doesn't care if it exists or not, &must_exist means
  791. * it must exist but we don't care what, &must_not_exist means it must
  792. * not exist, otherwise the data it needs. */
  793. static const TDB_DATA *needs(const struct op *op)
  794. {
  795. switch (op->type) {
  796. /* FIXME: Pull forward deps, since we can deadlock */
  797. case OP_TDB_CHAINLOCK:
  798. case OP_TDB_CHAINLOCK_NONBLOCK:
  799. case OP_TDB_CHAINLOCK_MARK:
  800. case OP_TDB_CHAINLOCK_UNMARK:
  801. case OP_TDB_CHAINUNLOCK:
  802. case OP_TDB_CHAINLOCK_READ:
  803. case OP_TDB_CHAINUNLOCK_READ:
  804. return NULL;
  805. case OP_TDB_APPEND:
  806. if (op->append.pre.dsize == 0)
  807. return &not_exists_or_empty;
  808. return &op->append.pre;
  809. case OP_TDB_STORE:
  810. if (op->flag == TDB_INSERT) {
  811. if (op->ret < 0)
  812. return &must_exist;
  813. else
  814. return &must_not_exist;
  815. } else if (op->flag == TDB_MODIFY) {
  816. if (op->ret < 0)
  817. return &must_not_exist;
  818. else
  819. return &must_exist;
  820. }
  821. /* No flags? Don't care */
  822. return NULL;
  823. case OP_TDB_EXISTS:
  824. if (op->ret == 1)
  825. return &must_exist;
  826. else
  827. return &must_not_exist;
  828. case OP_TDB_PARSE_RECORD:
  829. if (op->ret < 0)
  830. return &must_not_exist;
  831. return &must_exist;
  832. /* FIXME: handle these. */
  833. case OP_TDB_WIPE_ALL:
  834. case OP_TDB_FIRSTKEY:
  835. case OP_TDB_NEXTKEY:
  836. case OP_TDB_GET_SEQNUM:
  837. case OP_TDB_TRAVERSE:
  838. case OP_TDB_TRANSACTION_COMMIT:
  839. case OP_TDB_TRANSACTION_CANCEL:
  840. case OP_TDB_TRANSACTION_START:
  841. return NULL;
  842. case OP_TDB_FETCH:
  843. if (!op->data.dptr)
  844. return &must_not_exist;
  845. return &op->data;
  846. case OP_TDB_DELETE:
  847. if (op->ret < 0)
  848. return &must_not_exist;
  849. return &must_exist;
  850. default:
  851. errx(1, "Unexpected op type %i", op->type);
  852. }
  853. }
  854. static bool starts_transaction(const struct op *op)
  855. {
  856. return op->type == OP_TDB_TRANSACTION_START;
  857. }
  858. static bool in_transaction(const struct op op[], unsigned int i)
  859. {
  860. return op[i].group_start && starts_transaction(&op[op[i].group_start]);
  861. }
  862. static bool successful_transaction(const struct op *op)
  863. {
  864. return starts_transaction(op)
  865. && op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT;
  866. }
  867. static bool starts_traverse(const struct op *op)
  868. {
  869. return op->type == OP_TDB_TRAVERSE_START
  870. || op->type == OP_TDB_TRAVERSE_READ_START;
  871. }
  872. static bool in_traverse(const struct op op[], unsigned int i)
  873. {
  874. return op[i].group_start && starts_traverse(&op[op[i].group_start]);
  875. }
  876. static bool starts_chainlock(const struct op *op)
  877. {
  878. return op->type == OP_TDB_CHAINLOCK_READ
  879. || op->type == OP_TDB_CHAINLOCK;
  880. }
  881. static bool in_chainlock(const struct op op[], unsigned int i)
  882. {
  883. return op[i].group_start && starts_chainlock(&op[op[i].group_start]);
  884. }
  885. /* What's the data after this op? pre if nothing changed. */
  886. static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
  887. const struct op *op)
  888. {
  889. if (starts_transaction(op) || starts_chainlock(op)) {
  890. unsigned int i;
  891. /* Cancelled transactions don't change anything. */
  892. if (op[op->group_len].type == OP_TDB_TRANSACTION_CANCEL)
  893. return pre;
  894. assert(op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT
  895. || op[op->group_len].type == OP_TDB_CHAINUNLOCK_READ
  896. || op[op->group_len].type == OP_TDB_CHAINUNLOCK);
  897. for (i = 1; i < op->group_len; i++) {
  898. /* This skips nested transactions, too */
  899. if (key_eq(op[i].key, *key))
  900. pre = gives(key, pre, &op[i]);
  901. }
  902. return pre;
  903. }
  904. /* Failed ops don't change state of db. */
  905. if (op->ret < 0)
  906. return pre;
  907. if (op->type == OP_TDB_DELETE || op->type == OP_TDB_WIPE_ALL)
  908. return &tdb_null;
  909. if (op->type == OP_TDB_APPEND)
  910. return &op->append.post;
  911. if (op->type == OP_TDB_STORE)
  912. return &op->data;
  913. return pre;
  914. }
  915. static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
  916. unsigned int num)
  917. {
  918. unsigned int i, j, h;
  919. struct keyinfo *hash;
  920. hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
  921. for (i = 0; i < num; i++) {
  922. for (j = 1; j < num_ops[i]; j++) {
  923. /* We can't do this on allocation, due to realloc. */
  924. list_head_init(&op[i][j].post);
  925. list_head_init(&op[i][j].pre);
  926. if (!op[i][j].key.dptr)
  927. continue;
  928. h = hash_key(&op[i][j].key) % (total_keys * 2);
  929. while (!key_eq(hash[h].key, op[i][j].key)) {
  930. if (!hash[h].key.dptr) {
  931. hash[h].key = op[i][j].key;
  932. break;
  933. }
  934. h = (h + 1) % (total_keys * 2);
  935. }
  936. /* Might as well save some memory if we can. */
  937. if (op[i][j].key.dptr != hash[h].key.dptr) {
  938. talloc_free(op[i][j].key.dptr);
  939. op[i][j].key.dptr = hash[h].key.dptr;
  940. }
  941. hash[h].user = talloc_realloc(hash, hash[h].user,
  942. struct op_desc,
  943. hash[h].num_users+1);
  944. /* If it's in a transaction, it's the transaction which
  945. * matters from an analysis POV. */
  946. if (in_transaction(op[i], j)
  947. || in_chainlock(op[i], j)) {
  948. unsigned start = op[i][j].group_start;
  949. /* Don't include twice. */
  950. if (hash[h].num_users
  951. && hash[h].user[hash[h].num_users-1].file
  952. == i
  953. && hash[h].user[hash[h].num_users-1].op_num
  954. == start)
  955. continue;
  956. hash[h].user[hash[h].num_users].op_num = start;
  957. } else
  958. hash[h].user[hash[h].num_users].op_num = j;
  959. hash[h].user[hash[h].num_users].file = i;
  960. hash[h].num_users++;
  961. }
  962. }
  963. return hash;
  964. }
  965. static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
  966. const struct op *op)
  967. {
  968. const TDB_DATA *need = NULL;
  969. if (starts_transaction(op) || starts_chainlock(op)) {
  970. unsigned int i;
  971. /* Look through for an op in this transaction which
  972. * needs this key. */
  973. for (i = 1; i < op->group_len; i++) {
  974. if (key_eq(op[i].key, *key)) {
  975. need = needs(&op[i]);
  976. /* tdb_exists() is special: there might be
  977. * something in the transaction with more
  978. * specific requirements. Other ops don't have
  979. * specific requirements (eg. store or delete),
  980. * but they change the value so we can't get
  981. * more information from future ops. */
  982. if (op[i].type != OP_TDB_EXISTS)
  983. break;
  984. }
  985. }
  986. } else
  987. need = needs(op);
  988. /* Don't need anything? Cool. */
  989. if (!need)
  990. return true;
  991. /* This should be tdb_null or a real value. */
  992. assert(data != &must_exist);
  993. assert(data != &must_not_exist);
  994. assert(data != &not_exists_or_empty);
  995. /* Must not exist? data must not exist. */
  996. if (need == &must_not_exist)
  997. return data == &tdb_null;
  998. /* Must exist? */
  999. if (need == &must_exist)
  1000. return data != &tdb_null;
  1001. /* Either noexist or empty. */
  1002. if (need == &not_exists_or_empty)
  1003. return data->dsize == 0;
  1004. /* Needs something specific. */
  1005. return key_eq(*data, *need);
  1006. }
  1007. static void move_to_front(struct op_desc res[], unsigned off, unsigned elem)
  1008. {
  1009. if (elem != off) {
  1010. struct op_desc tmp = res[elem];
  1011. memmove(res + off + 1, res + off, (elem - off)*sizeof(res[0]));
  1012. res[off] = tmp;
  1013. }
  1014. }
  1015. static void restore_to_pos(struct op_desc res[], unsigned off, unsigned elem)
  1016. {
  1017. if (elem != off) {
  1018. struct op_desc tmp = res[off];
  1019. memmove(res + off, res + off + 1, (elem - off)*sizeof(res[0]));
  1020. res[elem] = tmp;
  1021. }
  1022. }
  1023. static bool sort_deps(char *filename[], struct op *op[],
  1024. struct op_desc res[],
  1025. unsigned off, unsigned num,
  1026. const TDB_DATA *key, const TDB_DATA *data,
  1027. unsigned num_files, unsigned fuzz)
  1028. {
  1029. unsigned int i, files_done;
  1030. struct op *this_op;
  1031. bool done[num_files];
  1032. /* None left? We're sorted. */
  1033. if (off == num)
  1034. return true;
  1035. /* Does this make sequence number go backwards? Allow a little fuzz. */
  1036. if (off > 0) {
  1037. int seqnum1 = op[res[off-1].file][res[off-1].op_num].seqnum;
  1038. int seqnum2 = op[res[off].file][res[off].op_num].seqnum;
  1039. if (seqnum1 - seqnum2 > (int)fuzz) {
  1040. #if DEBUG_DEPS
  1041. printf("Seqnum jump too far (%u -> %u)\n",
  1042. seqnum1, seqnum2);
  1043. #endif
  1044. return false;
  1045. }
  1046. }
  1047. memset(done, 0, sizeof(done));
  1048. /* Since ops within a trace file are ordered, we just need to figure
  1049. * out which file to try next. Since we don't take into account
  1050. * inter-key relationships (which exist by virtue of trace file order),
  1051. * we minimize the chance of harm by trying to keep in seqnum order. */
  1052. for (files_done = 0, i = off; i < num && files_done < num_files; i++) {
  1053. if (done[res[i].file])
  1054. continue;
  1055. this_op = &op[res[i].file][res[i].op_num];
  1056. /* Is what we have good enough for this op? */
  1057. if (satisfies(key, data, this_op)) {
  1058. move_to_front(res, off, i);
  1059. if (sort_deps(filename, op, res, off+1, num,
  1060. key, gives(key, data, this_op),
  1061. num_files, fuzz))
  1062. return true;
  1063. restore_to_pos(res, off, i);
  1064. }
  1065. done[res[i].file] = true;
  1066. files_done++;
  1067. }
  1068. /* No combination worked. */
  1069. return false;
  1070. }
  1071. static void check_dep_sorting(struct op_desc user[], unsigned num_users,
  1072. unsigned num_files)
  1073. {
  1074. #if DEBUG_DEPS
  1075. unsigned int i;
  1076. unsigned minima[num_files];
  1077. memset(minima, 0, sizeof(minima));
  1078. for (i = 0; i < num_users; i++) {
  1079. assert(minima[user[i].file] < user[i].op_num);
  1080. minima[user[i].file] = user[i].op_num;
  1081. }
  1082. #endif
  1083. }
  1084. /* All these ops happen on the same key. Which comes first?
  1085. *
  1086. * This can happen both because read ops or failed write ops don't
  1087. * change sequence number, and also due to race since we access the
  1088. * number unlocked (the race can cause less detectable ordering problems,
  1089. * in which case we'll deadlock and report: fix manually in that case).
  1090. */
  1091. static void figure_deps(char *filename[], struct op *op[],
  1092. const TDB_DATA *key, struct op_desc user[],
  1093. unsigned num_users, unsigned num_files)
  1094. {
  1095. /* We assume database starts empty. */
  1096. const struct TDB_DATA *data = &tdb_null;
  1097. unsigned int fuzz;
  1098. /* We prefer to keep strict seqnum order if possible: it's the
  1099. * most likely. We get more lax if that fails. */
  1100. for (fuzz = 0; fuzz < 100; fuzz = (fuzz + 1)*2) {
  1101. if (sort_deps(filename, op, user, 0, num_users, key, data,
  1102. num_files, fuzz))
  1103. break;
  1104. }
  1105. if (fuzz >= 100)
  1106. fail(filename[user[0].file], user[0].op_num+1,
  1107. "Could not resolve inter-dependencies");
  1108. check_dep_sorting(user, num_users, num_files);
  1109. }
  1110. static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
  1111. unsigned int num)
  1112. {
  1113. unsigned int h;
  1114. /* Gcc nexted function extension. How cool is this? */
  1115. int compare_seqnum(const void *_a, const void *_b)
  1116. {
  1117. const struct op_desc *a = _a, *b = _b;
  1118. /* First, maintain order within any trace file. */
  1119. if (a->file == b->file)
  1120. return a->op_num - b->op_num;
  1121. /* Otherwise, arrange by seqnum order. */
  1122. if (op[a->file][a->op_num].seqnum !=
  1123. op[b->file][b->op_num].seqnum)
  1124. return op[a->file][a->op_num].seqnum
  1125. - op[b->file][b->op_num].seqnum;
  1126. /* Cancelled transactions are assumed to happen first. */
  1127. if (starts_transaction(&op[a->file][a->op_num])
  1128. && !successful_transaction(&op[a->file][a->op_num]))
  1129. return -1;
  1130. if (starts_transaction(&op[b->file][b->op_num])
  1131. && !successful_transaction(&op[b->file][b->op_num]))
  1132. return 1;
  1133. /* No idea. */
  1134. return 0;
  1135. }
  1136. /* Now sort into seqnum order. */
  1137. for (h = 0; h < total_keys * 2; h++) {
  1138. struct op_desc *user = hash[h].user;
  1139. qsort(user, hash[h].num_users, sizeof(user[0]), compare_seqnum);
  1140. figure_deps(filename, op, &hash[h].key, user, hash[h].num_users,
  1141. num);
  1142. }
  1143. }
  1144. static int destroy_depend(struct depend *dep)
  1145. {
  1146. list_del(&dep->pre_list);
  1147. list_del(&dep->post_list);
  1148. return 0;
  1149. }
  1150. static void add_dependency(void *ctx,
  1151. struct op *op[],
  1152. char *filename[],
  1153. const struct op_desc *needs,
  1154. const struct op_desc *prereq)
  1155. {
  1156. struct depend *dep;
  1157. /* We don't depend on ourselves. */
  1158. if (needs->file == prereq->file) {
  1159. assert(prereq->op_num < needs->op_num);
  1160. return;
  1161. }
  1162. #if DEBUG_DEPS
  1163. printf("%s:%u: depends on %s:%u\n",
  1164. filename[needs->file], needs->op_num+1,
  1165. filename[prereq->file], prereq->op_num+1);
  1166. #endif
  1167. dep = talloc(ctx, struct depend);
  1168. dep->needs = *needs;
  1169. dep->prereq = *prereq;
  1170. #if TRAVERSALS_TAKE_TRANSACTION_LOCK
  1171. /* If something in a traverse depends on something in another
  1172. * traverse/transaction, it creates a dependency between the
  1173. * two groups. */
  1174. if ((in_traverse(op[prereq->file], prereq->op_num)
  1175. && (starts_transaction(&op[needs->file][needs->op_num])
  1176. || starts_traverse(&op[needs->file][needs->op_num])))
  1177. || (in_traverse(op[needs->file], needs->op_num)
  1178. && (starts_transaction(&op[prereq->file][prereq->op_num])
  1179. || starts_traverse(&op[prereq->file][prereq->op_num])))) {
  1180. unsigned int start;
  1181. /* We are satisfied by end of group. */
  1182. start = op[prereq->file][prereq->op_num].group_start;
  1183. dep->prereq.op_num = start + op[prereq->file][start].group_len;
  1184. /* And we need that done by start of our group. */
  1185. dep->needs.op_num = op[needs->file][needs->op_num].group_start;
  1186. }
  1187. /* There is also this case:
  1188. * <traverse> <read foo> ...
  1189. * <transaction> ... </transaction> <create foo>
  1190. * Where if we start the traverse then wait, we could block
  1191. * the transaction and deadlock.
  1192. *
  1193. * We try to address this by ensuring that where seqnum indicates it's
  1194. * possible, we wait for <create foo> before *starting* traverse.
  1195. */
  1196. else if (in_traverse(op[needs->file], needs->op_num)) {
  1197. struct op *need = &op[needs->file][needs->op_num];
  1198. if (op[needs->file][need->group_start].seqnum >
  1199. op[prereq->file][prereq->op_num].seqnum) {
  1200. dep->needs.op_num = need->group_start;
  1201. }
  1202. }
  1203. #endif
  1204. /* If you depend on a transaction or chainlock, you actually
  1205. * depend on it ending. */
  1206. if (starts_transaction(&op[prereq->file][dep->prereq.op_num])
  1207. || starts_chainlock(&op[prereq->file][dep->prereq.op_num])) {
  1208. dep->prereq.op_num
  1209. += op[dep->prereq.file][dep->prereq.op_num].group_len;
  1210. #if DEBUG_DEPS
  1211. printf("-> Actually end of transaction %s:%u\n",
  1212. filename[dep->prereq->file], dep->prereq->op_num+1);
  1213. #endif
  1214. } else
  1215. /* We should never create a dependency from middle of
  1216. * a transaction. */
  1217. assert(!in_transaction(op[prereq->file], dep->prereq.op_num)
  1218. || op[prereq->file][dep->prereq.op_num].type
  1219. == OP_TDB_TRANSACTION_COMMIT
  1220. || op[prereq->file][dep->prereq.op_num].type
  1221. == OP_TDB_TRANSACTION_CANCEL);
  1222. list_add(&op[dep->prereq.file][dep->prereq.op_num].post,
  1223. &dep->post_list);
  1224. list_add(&op[dep->needs.file][dep->needs.op_num].pre,
  1225. &dep->pre_list);
  1226. talloc_set_destructor(dep, destroy_depend);
  1227. }
  1228. static bool changes_db(const TDB_DATA *key, const struct op *op)
  1229. {
  1230. return gives(key, NULL, op) != NULL;
  1231. }
  1232. static void depend_on_previous(struct op *op[],
  1233. char *filename[],
  1234. unsigned int num,
  1235. struct op_desc user[],
  1236. unsigned int i,
  1237. int prev)
  1238. {
  1239. bool deps[num];
  1240. int j;
  1241. if (i == 0)
  1242. return;
  1243. if (prev == i - 1) {
  1244. /* Just depend on previous. */
  1245. add_dependency(NULL, op, filename, &user[i], &user[prev]);
  1246. return;
  1247. }
  1248. /* We have to wait for the readers. Find last one in *each* file. */
  1249. memset(deps, 0, sizeof(deps));
  1250. deps[user[i].file] = true;
  1251. for (j = i - 1; j > prev; j--) {
  1252. if (!deps[user[j].file]) {
  1253. add_dependency(NULL, op, filename, &user[i], &user[j]);
  1254. deps[user[j].file] = true;
  1255. }
  1256. }
  1257. }
  1258. /* This is simple, but not complete. We don't take into account
  1259. * indirect dependencies. */
  1260. static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
  1261. unsigned int num)
  1262. {
  1263. unsigned int i, j;
  1264. /* There can only be one real dependency on each file */
  1265. for (i = 0; i < num; i++) {
  1266. for (j = 1; j < num_ops[i]; j++) {
  1267. struct depend *dep, *next;
  1268. struct depend *prev[num];
  1269. memset(prev, 0, sizeof(prev));
  1270. list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
  1271. if (!prev[dep->prereq.file]) {
  1272. prev[dep->prereq.file] = dep;
  1273. continue;
  1274. }
  1275. if (prev[dep->prereq.file]->prereq.op_num
  1276. < dep->prereq.op_num) {
  1277. talloc_free(prev[dep->prereq.file]);
  1278. prev[dep->prereq.file] = dep;
  1279. } else
  1280. talloc_free(dep);
  1281. }
  1282. }
  1283. }
  1284. for (i = 0; i < num; i++) {
  1285. int deps[num];
  1286. for (j = 0; j < num; j++)
  1287. deps[j] = -1;
  1288. for (j = 1; j < num_ops[i]; j++) {
  1289. struct depend *dep, *next;
  1290. list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
  1291. if (deps[dep->prereq.file]
  1292. >= (int)dep->prereq.op_num)
  1293. talloc_free(dep);
  1294. else
  1295. deps[dep->prereq.file]
  1296. = dep->prereq.op_num;
  1297. }
  1298. }
  1299. }
  1300. }
  1301. #if TRAVERSALS_TAKE_TRANSACTION_LOCK
  1302. /* Force an order among the traversals, so they don't deadlock (as much) */
  1303. static void make_traverse_depends(char *filename[],
  1304. struct op *op[], unsigned int num_ops[],
  1305. unsigned int num)
  1306. {
  1307. unsigned int i, num_traversals = 0;
  1308. int j;
  1309. struct op_desc *desc;
  1310. /* Sort by which one runs first. */
  1311. int compare_traverse_desc(const void *_a, const void *_b)
  1312. {
  1313. const struct op_desc *da = _a, *db = _b;
  1314. const struct op *a = &op[da->file][da->op_num],
  1315. *b = &op[db->file][db->op_num];
  1316. if (a->seqnum != b->seqnum)
  1317. return a->seqnum - b->seqnum;
  1318. /* If they have same seqnum, it means one didn't make any
  1319. * changes. Thus sort by end in that case. */
  1320. return a[a->group_len].seqnum - b[b->group_len].seqnum;
  1321. }
  1322. desc = talloc_array(NULL, struct op_desc, 1);
  1323. /* Count them. */
  1324. for (i = 0; i < num; i++) {
  1325. for (j = 1; j < num_ops[i]; j++) {
  1326. /* Traverse start (ignore those in
  1327. * transactions; they're already covered by
  1328. * transaction dependencies). */
  1329. if (starts_traverse(&op[i][j])
  1330. && !in_transaction(op[i], j)) {
  1331. desc = talloc_realloc(NULL, desc,
  1332. struct op_desc,
  1333. num_traversals+1);
  1334. desc[num_traversals].file = i;
  1335. desc[num_traversals].op_num = j;
  1336. num_traversals++;
  1337. }
  1338. }
  1339. }
  1340. qsort(desc, num_traversals, sizeof(desc[0]), compare_traverse_desc);
  1341. for (i = 1; i < num_traversals; i++) {
  1342. const struct op *prev = &op[desc[i-1].file][desc[i-1].op_num];
  1343. const struct op *curr = &op[desc[i].file][desc[i].op_num];
  1344. /* Read traverses don't depend on each other (read lock). */
  1345. if (prev->type == OP_TDB_TRAVERSE_READ_START
  1346. && curr->type == OP_TDB_TRAVERSE_READ_START)
  1347. continue;
  1348. /* Only make dependency if it's clear. */
  1349. if (compare_traverse_desc(&desc[i], &desc[i-1])) {
  1350. /* i depends on end of traverse i-1. */
  1351. struct op_desc end = desc[i-1];
  1352. end.op_num += prev->group_len;
  1353. add_dependency(NULL, op, filename, &desc[i], &end);
  1354. }
  1355. }
  1356. talloc_free(desc);
  1357. }
  1358. static void set_nonblock(int fd)
  1359. {
  1360. if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL)|O_NONBLOCK) != 0)
  1361. err(1, "Setting pipe nonblocking");
  1362. }
  1363. static bool handle_backoff(struct op *op[], int fd)
  1364. {
  1365. struct op_desc desc;
  1366. bool handled = false;
  1367. /* Sloppy coding: we assume PIPEBUF never fills. */
  1368. while (read(fd, &desc, sizeof(desc)) != -1) {
  1369. unsigned int i;
  1370. handled = true;
  1371. for (i = desc.op_num; i > 0; i--) {
  1372. if (op[desc.file][i].type == OP_TDB_TRAVERSE) {
  1373. /* We insert a fake end here. */
  1374. op[desc.file][i].type
  1375. = OP_TDB_TRAVERSE_END_EARLY;
  1376. break;
  1377. } else if (starts_traverse(&op[desc.file][i])) {
  1378. unsigned int start = i;
  1379. struct op tmp = op[desc.file][i];
  1380. /* Move the ops outside traverse. */
  1381. memmove(&op[desc.file][i],
  1382. &op[desc.file][i+1],
  1383. (desc.op_num-i-1) * sizeof(op[0][0]));
  1384. op[desc.file][desc.op_num] = tmp;
  1385. while (op[desc.file][i].group_start == start) {
  1386. op[desc.file][i++].group_start
  1387. = desc.op_num;
  1388. }
  1389. break;
  1390. }
  1391. }
  1392. }
  1393. return handled;
  1394. }
  1395. #else /* !TRAVERSALS_TAKE_TRANSACTION_LOCK */
  1396. static bool handle_backoff(struct op *op[], int fd)
  1397. {
  1398. return false;
  1399. }
  1400. #endif
  1401. static void derive_dependencies(char *filename[],
  1402. struct op *op[], unsigned int num_ops[],
  1403. unsigned int num)
  1404. {
  1405. struct keyinfo *hash;
  1406. unsigned int h, i;
  1407. /* Create hash table for faster key lookup. */
  1408. hash = hash_ops(op, num_ops, num);
  1409. /* Sort them by sequence number. */
  1410. sort_ops(hash, filename, op, num);
  1411. /* Create dependencies back to the last change, rather than
  1412. * creating false dependencies by naively making each one
  1413. * depend on the previous. This has two purposes: it makes
  1414. * later optimization simpler, and it also avoids deadlock with
  1415. * same sequence number ops inside traversals (if one
  1416. * traversal doesn't write anything, two ops can have the same
  1417. * sequence number yet we can create a traversal dependency
  1418. * the other way). */
  1419. for (h = 0; h < total_keys * 2; h++) {
  1420. int prev = -1;
  1421. if (hash[h].num_users < 2)
  1422. continue;
  1423. for (i = 0; i < hash[h].num_users; i++) {
  1424. if (changes_db(&hash[h].key, &op[hash[h].user[i].file]
  1425. [hash[h].user[i].op_num])) {
  1426. depend_on_previous(op, filename, num,
  1427. hash[h].user, i, prev);
  1428. prev = i;
  1429. } else if (prev >= 0)
  1430. add_dependency(hash, op, filename,
  1431. &hash[h].user[i],
  1432. &hash[h].user[prev]);
  1433. }
  1434. }
  1435. #if TRAVERSALS_TAKE_TRANSACTION_LOCK
  1436. make_traverse_depends(filename, op, num_ops, num);
  1437. #endif
  1438. optimize_dependencies(op, num_ops, num);
  1439. }
  1440. static struct timeval run_test(char *argv[],
  1441. unsigned int num_ops[],
  1442. unsigned int hashsize[],
  1443. unsigned int tdb_flags[],
  1444. unsigned int open_flags[],
  1445. struct op *op[],
  1446. int fds[2])
  1447. {
  1448. unsigned int i;
  1449. struct timeval start, end, diff;
  1450. bool ok = true;
  1451. for (i = 0; argv[i+2]; i++) {
  1452. struct tdb_context *tdb;
  1453. char c;
  1454. switch (fork()) {
  1455. case -1:
  1456. err(1, "fork failed");
  1457. case 0:
  1458. close(fds[1]);
  1459. tdb = tdb_open_ex(argv[1], hashsize[i],
  1460. tdb_flags[i]|TDB_NOSYNC,
  1461. open_flags[i], 0600, NULL, hash_key);
  1462. if (!tdb)
  1463. err(1, "Opening tdb %s", argv[1]);
  1464. /* This catches parent exiting. */
  1465. if (read(fds[0], &c, 1) != 1)
  1466. exit(1);
  1467. run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
  1468. num_ops[i], false);
  1469. check_deps(argv[2+i], op[i], num_ops[i]);
  1470. exit(0);
  1471. default:
  1472. break;
  1473. }
  1474. }
  1475. /* Let everything settle. */
  1476. sleep(1);
  1477. printf("Starting run...");
  1478. fflush(stdout);
  1479. gettimeofday(&start, NULL);
  1480. /* Tell them all to go! Any write of sufficient length will do. */
  1481. if (write(fds[1], hashsize, i) != i)
  1482. err(1, "Writing to wakeup pipe");
  1483. for (i = 0; argv[i + 2]; i++) {
  1484. int status;
  1485. wait(&status);
  1486. if (!WIFEXITED(status)) {
  1487. warnx("Child died with signal %i", WTERMSIG(status));
  1488. ok = false;
  1489. } else if (WEXITSTATUS(status) != 0)
  1490. /* Assume child spat out error. */
  1491. ok = false;
  1492. }
  1493. if (!ok)
  1494. exit(1);
  1495. gettimeofday(&end, NULL);
  1496. printf("done\n");
  1497. if (end.tv_usec < start.tv_usec) {
  1498. end.tv_usec += 1000000;
  1499. end.tv_sec--;
  1500. }
  1501. diff.tv_sec = end.tv_sec - start.tv_sec;
  1502. diff.tv_usec = end.tv_usec - start.tv_usec;
  1503. return diff;
  1504. }
  1505. int main(int argc, char *argv[])
  1506. {
  1507. struct timeval diff;
  1508. unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
  1509. struct op *op[argc];
  1510. int fds[2];
  1511. if (argc < 3)
  1512. errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
  1513. pipes = talloc_array(NULL, struct pipe, argc - 1);
  1514. for (i = 0; i < argc - 2; i++) {
  1515. printf("Loading tracefile %s...", argv[2+i]);
  1516. fflush(stdout);
  1517. op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
  1518. &tdb_flags[i], &open_flags[i]);
  1519. if (pipe(pipes[i].fd) != 0)
  1520. err(1, "creating pipe");
  1521. printf("done\n");
  1522. }
  1523. printf("Calculating inter-dependencies...");
  1524. fflush(stdout);
  1525. derive_dependencies(argv+2, op, num_ops, i);
  1526. printf("done\n");
  1527. /* Don't fork for single arg case: simple debugging. */
  1528. if (argc == 3) {
  1529. struct tdb_context *tdb;
  1530. tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0]|TDB_NOSYNC,
  1531. open_flags[0], 0600, NULL, hash_key);
  1532. printf("Single threaded run...");
  1533. fflush(stdout);
  1534. run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
  1535. false);
  1536. check_deps(argv[2], op[0], num_ops[0]);
  1537. printf("done\n");
  1538. exit(0);
  1539. }
  1540. if (pipe(fds) != 0)
  1541. err(1, "creating pipe");
  1542. #if TRAVERSALS_TAKE_TRANSACTION_LOCK
  1543. if (pipe(pipes[argc-2].fd) != 0)
  1544. err(1, "creating pipe");
  1545. backoff_fd = pipes[argc-2].fd[1];
  1546. set_nonblock(pipes[argc-2].fd[1]);
  1547. set_nonblock(pipes[argc-2].fd[0]);
  1548. #endif
  1549. do {
  1550. diff = run_test(argv, num_ops, hashsize, tdb_flags, open_flags,
  1551. op, fds);
  1552. } while (handle_backoff(op, pipes[argc-2].fd[0]));
  1553. printf("Time replaying: %lu usec\n",
  1554. diff.tv_sec * 1000000UL + diff.tv_usec);
  1555. exit(0);
  1556. }