free.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972
  1. /*
  2. Trivial Database 2: free list/block handling
  3. Copyright (C) Rusty Russell 2010
  4. This library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 3 of the License, or (at your option) any later version.
  8. This library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with this library; if not, see <http://www.gnu.org/licenses/>.
  14. */
  15. #include "private.h"
  16. #include <ccan/likely/likely.h>
  17. #include <ccan/ilog/ilog.h>
  18. #include <time.h>
  19. #include <limits.h>
/* "Find last set": position of the highest set bit in val.
 * Thin wrapper so this file can use the traditional name for
 * ccan's ilog64() (which returns 0 for val == 0). */
static unsigned fls64(uint64_t val)
{
	return ilog64(val);
}
  24. /* In which bucket would we find a particular record size? (ignoring header) */
  25. unsigned int size_to_bucket(ntdb_len_t data_len)
  26. {
  27. unsigned int bucket;
  28. /* We can't have records smaller than this. */
  29. assert(data_len >= NTDB_MIN_DATA_LEN);
  30. /* Ignoring the header... */
  31. if (data_len - NTDB_MIN_DATA_LEN <= 64) {
  32. /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
  33. bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
  34. } else {
  35. /* After that we go power of 2. */
  36. bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
  37. }
  38. if (unlikely(bucket >= NTDB_FREE_BUCKETS))
  39. bucket = NTDB_FREE_BUCKETS - 1;
  40. return bucket;
  41. }
/* Offset of the first free table in the chain, read from the header.
 * Returns an NTDB_OFF_IS_ERR() value on read failure. */
ntdb_off_t first_ftable(struct ntdb_context *ntdb)
{
	return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
}
/* Offset of the free table following @ftable (0 at end of chain).
 * Returns an NTDB_OFF_IS_ERR() value on read failure. */
ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
{
	return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable,next));
}
  50. enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
  51. {
  52. /* Use reservoir sampling algorithm to select a free list at random. */
  53. unsigned int rnd, max = 0, count = 0;
  54. ntdb_off_t off;
  55. ntdb->ftable_off = off = first_ftable(ntdb);
  56. ntdb->ftable = 0;
  57. while (off) {
  58. if (NTDB_OFF_IS_ERR(off)) {
  59. return NTDB_OFF_TO_ERR(off);
  60. }
  61. rnd = random();
  62. if (rnd >= max) {
  63. ntdb->ftable_off = off;
  64. ntdb->ftable = count;
  65. max = rnd;
  66. }
  67. off = next_ftable(ntdb, off);
  68. count++;
  69. }
  70. return NTDB_SUCCESS;
  71. }
/* Offset of a given bucket within the free table at ftable_off:
 * the buckets array member, indexed by bucket number. */
ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
{
	return ftable_off + offsetof(struct ntdb_freetable, buckets)
		+ bucket * sizeof(ntdb_off_t);
}
/* Returns free_buckets + 1, or list number to search, or -ve error.
 * Scans the bucket heads from @bucket upward for the first non-empty
 * one; the read is speculative (no lock held), so callers re-check. */
static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
				 ntdb_off_t ftable_off,
				 ntdb_off_t bucket)
{
	/* Speculatively search for a non-zero bucket. */
	return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
				     bucket, NTDB_FREE_BUCKETS);
}
/* Debug-only sanity check of the free list at bucket b_off: every
 * record must carry the free magic, each record's prev must point at
 * its predecessor, and the head's prev must point at the tail (the
 * prev direction is circular).  Aborts on any violation; compiles to
 * nothing unless CCAN_NTDB_DEBUG is defined. */
static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
{
#ifdef CCAN_NTDB_DEBUG
	ntdb_off_t off, prev = 0, first;
	struct ntdb_free_record r;

	first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
	while (off != 0) {
		ntdb_read_convert(ntdb, off, &r, sizeof(r));
		if (frec_magic(&r) != NTDB_FREE_MAGIC)
			abort();
		if (prev && frec_prev(&r) != prev)
			abort();
		prev = off;
		off = r.next;
	}

	if (first) {
		/* Head's prev must point at the last record we saw. */
		ntdb_read_convert(ntdb, first, &r, sizeof(r));
		if (frec_prev(&r) != prev)
			abort();
	}
#endif
}
/* Remove from free bucket.
 *
 * Unlinks record @r (located at @r_off) from the free list headed at
 * @b_off.  The list is NULL-terminated in the next direction but
 * circular in the prev direction (head's prev points at the tail, as
 * check_list() verifies).  The bucket head word carries counter bits
 * above NTDB_OFF_MASK which must be preserved when rewritten.
 * Caller must hold the bucket lock. */
static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
					ntdb_off_t b_off, ntdb_off_t r_off,
					const struct ntdb_free_record *r)
{
	ntdb_off_t off, prev_next, head;
	enum NTDB_ERROR ecode;

	/* Is this only element in list? Zero out bucket, and we're done. */
	if (frec_prev(r) == r_off)
		return ntdb_write_off(ntdb, b_off, 0);

	/* off = &r->prev->next */
	off = frec_prev(r) + offsetof(struct ntdb_free_record, next);

	/* Get prev->next */
	prev_next = ntdb_read_off(ntdb, off);
	if (NTDB_OFF_IS_ERR(prev_next))
		return NTDB_OFF_TO_ERR(prev_next);

	/* If prev->next == 0, we were head: update bucket to point to next.
	 * (Our "prev" was actually the tail, since prev is circular.) */
	if (prev_next == 0) {
		/* We must preserve upper bits. */
		head = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(head))
			return NTDB_OFF_TO_ERR(head);

		if ((head & NTDB_OFF_MASK) != r_off) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "remove_from_list:"
					   " %llu head %llu on list %llu",
					   (long long)r_off,
					   (long long)head,
					   (long long)b_off);
		}
		head = ((head & ~NTDB_OFF_MASK) | r->next);
		ecode = ntdb_write_off(ntdb, b_off, head);
		if (ecode != NTDB_SUCCESS)
			return ecode;
	} else {
		/* r->prev->next = r->next */
		ecode = ntdb_write_off(ntdb, off, r->next);
		if (ecode != NTDB_SUCCESS)
			return ecode;
	}

	/* Now fix up the prev pointer of our successor (or of the head,
	 * if we were the tail: prev is circular). */
	if (r->next == 0) {
		/* If we were the tail, off = &head->prev. */
		head = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(head))
			return NTDB_OFF_TO_ERR(head);
		head &= NTDB_OFF_MASK;
		off = head + offsetof(struct ntdb_free_record, magic_and_prev);
	} else {
		/* off = &r->next->prev */
		off = r->next + offsetof(struct ntdb_free_record,
					 magic_and_prev);
	}

#ifdef CCAN_NTDB_DEBUG
	/* *off == r */
	if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "remove_from_list:"
				   " %llu bad prev in list %llu",
				   (long long)r_off, (long long)b_off);
	}
#endif
	/* r->next->prev = r->prev (magic bits come along for free). */
	return ntdb_write_off(ntdb, off, r->magic_and_prev);
}
/* Enqueue in this free bucket: sets *coalesce if we've added 128
 * entries to it (i.e. the enqueue counter in the bucket head's upper
 * bits wrapped to zero).
 *
 * Pushes the record at @off (usable length @len) onto the head of the
 * free list at @b_off.  Writes the new record last, after all the
 * links pointing at it are in place.  Caller must hold the bucket
 * lock.  *coalesce is both input (whether counting is wanted at all)
 * and output (whether the caller should now coalesce the list). */
static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
				       ntdb_off_t b_off,
				       ntdb_off_t off,
				       ntdb_len_t len,
				       bool *coalesce)
{
	struct ntdb_free_record new;
	enum NTDB_ERROR ecode;
	ntdb_off_t prev, head;
	uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));

	head = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(head))
		return NTDB_OFF_TO_ERR(head);

	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
	new.ftable_and_len = ((uint64_t)ntdb->ftable
			      << (64 - NTDB_OFF_UPPER_STEAL))
		| len;

	/* new->next = head. */
	new.next = (head & NTDB_OFF_MASK);

	if (!new.next) {
		/* First element? Prev points to ourselves (prev is circular). */
		new.magic_and_prev = (magic | off);
	} else {
		/* new->prev = next->prev (i.e. inherit the old head's
		 * pointer to the tail). */
		prev = ntdb_read_off(ntdb,
				     new.next + offsetof(struct ntdb_free_record,
							 magic_and_prev));
		new.magic_and_prev = prev;
		if (frec_magic(&new) != NTDB_FREE_MAGIC) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "enqueue_in_free: %llu bad head"
					   " prev %llu",
					   (long long)new.next,
					   (long long)prev);
		}
		/* next->prev = new. */
		ecode = ntdb_write_off(ntdb, new.next
				       + offsetof(struct ntdb_free_record,
						  magic_and_prev),
				       off | magic);
		if (ecode != NTDB_SUCCESS) {
			return ecode;
		}

#ifdef CCAN_NTDB_DEBUG
		/* The old head must have had a zero next pointer in its
		 * tail (prev) record... just paranoia. */
		prev = ntdb_read_off(ntdb, frec_prev(&new)
				     + offsetof(struct ntdb_free_record, next));
		if (prev != 0) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "enqueue_in_free:"
					   " %llu bad tail next ptr %llu",
					   (long long)frec_prev(&new)
					   + offsetof(struct ntdb_free_record,
						      next),
					   (long long)prev);
		}
#endif
	}

	/* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
	if (*coalesce)
		head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
	head &= ~(NTDB_OFF_MASK | (1ULL << 63));
	head |= off;

	ecode = ntdb_write_off(ntdb, b_off, head);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* It's time to coalesce if counter wrapped. */
	if (*coalesce)
		*coalesce = ((head & ~NTDB_OFF_MASK) == 0);

	/* Finally write out the record itself. */
	return ntdb_write_convert(ntdb, off, &new, sizeof(new));
}
  246. static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
  247. {
  248. ntdb_off_t off;
  249. unsigned int i;
  250. if (likely(ntdb->ftable == ftable))
  251. return ntdb->ftable_off;
  252. off = first_ftable(ntdb);
  253. for (i = 0; i < ftable; i++) {
  254. if (NTDB_OFF_IS_ERR(off)) {
  255. break;
  256. }
  257. off = next_ftable(ntdb, off);
  258. }
  259. return off;
  260. }
/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
 * need to blatt the *protect record (which is set to an error).
 *
 * Merges the free record at @off (data length @data_len, on the list
 * locked at @b_off) with any free records physically following it in
 * the file.  Returns 0 if nothing was merged (bucket still locked),
 * the new usable length on success, or a -ve error.  *protect is an
 * offset the caller cares about; if we consume or move it, *protect
 * is overwritten with an error code so the caller knows to stop. */
static ntdb_len_t coalesce(struct ntdb_context *ntdb,
			   ntdb_off_t off, ntdb_off_t b_off,
			   ntdb_len_t data_len,
			   ntdb_off_t *protect)
{
	ntdb_off_t end;
	struct ntdb_free_record rec;
	enum NTDB_ERROR ecode;

	ntdb->stats.alloc_coalesce_tried++;
	/* end = first byte after our record. */
	end = off + sizeof(struct ntdb_used_record) + data_len;

	/* Walk forward over physically-adjacent free records. */
	while (end < ntdb->file->map_size) {
		const struct ntdb_free_record *r;
		ntdb_off_t nb_off;
		unsigned ftable, bucket;

		/* Speculative peek at the next record (no lock yet). */
		r = ntdb_access_read(ntdb, end, sizeof(*r), true);
		if (NTDB_PTR_IS_ERR(r)) {
			ecode = NTDB_PTR_ERR(r);
			goto err;
		}

		if (frec_magic(r) != NTDB_FREE_MAGIC
		    || frec_ftable(r) == NTDB_FTABLE_NONE) {
			ntdb_access_release(ntdb, r);
			break;
		}

		/* Work out which bucket (on which ftable) it lives in. */
		ftable = frec_ftable(r);
		bucket = size_to_bucket(frec_len(r));
		nb_off = ftable_offset(ntdb, ftable);
		if (NTDB_OFF_IS_ERR(nb_off)) {
			ntdb_access_release(ntdb, r);
			ecode = NTDB_OFF_TO_ERR(nb_off);
			goto err;
		}
		nb_off = bucket_off(nb_off, bucket);
		ntdb_access_release(ntdb, r);

		/* We may be violating lock order here, so best effort. */
		if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
		    != NTDB_SUCCESS) {
			ntdb->stats.alloc_coalesce_lockfail++;
			break;
		}

		/* Now we have lock, re-check. */
		ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS) {
			ntdb_unlock_free_bucket(ntdb, nb_off);
			goto err;
		}

		if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
			/* Someone allocated it while we weren't looking. */
			ntdb->stats.alloc_coalesce_race++;
			ntdb_unlock_free_bucket(ntdb, nb_off);
			break;
		}

		if (unlikely(frec_ftable(&rec) != ftable)
		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
			/* It moved buckets: we locked the wrong list. */
			ntdb->stats.alloc_coalesce_race++;
			ntdb_unlock_free_bucket(ntdb, nb_off);
			break;
		}

		/* Did we just mess up a record you were hoping to use? */
		if (end == *protect) {
			ntdb->stats.alloc_coalesce_iterate_clash++;
			*protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
		}

		ecode = remove_from_list(ntdb, nb_off, end, &rec);
		check_list(ntdb, nb_off);
		if (ecode != NTDB_SUCCESS) {
			ntdb_unlock_free_bucket(ntdb, nb_off);
			goto err;
		}

		/* Absorb it and move on to whatever follows. */
		end += sizeof(struct ntdb_used_record) + frec_len(&rec);
		ntdb_unlock_free_bucket(ntdb, nb_off);
		ntdb->stats.alloc_coalesce_num_merged++;
	}

	/* Didn't find any adjacent free? */
	if (end == off + sizeof(struct ntdb_used_record) + data_len)
		return 0;

	/* Before we expand, check this isn't one you wanted protected? */
	if (off == *protect) {
		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
		ntdb->stats.alloc_coalesce_iterate_clash++;
	}

	/* OK, expand initial record */
	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
	if (ecode != NTDB_SUCCESS) {
		goto err;
	}

	if (frec_len(&rec) != data_len) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				    "coalesce: expected data len %zu not %zu",
				    (size_t)data_len, (size_t)frec_len(&rec));
		goto err;
	}

	ecode = remove_from_list(ntdb, b_off, off, &rec);
	check_list(ntdb, b_off);
	if (ecode != NTDB_SUCCESS) {
		goto err;
	}

	/* Try locking violation first. We don't allow coalesce recursion! */
	ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
	if (ecode != NTDB_SUCCESS) {
		/* Need to drop lock. Can't rely on anything stable. */
		ntdb->stats.alloc_coalesce_lockfail++;
		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);

		/* We have to drop this to avoid deadlocks, so make sure record
		 * doesn't get coalesced by someone else! */
		rec.ftable_and_len = (NTDB_FTABLE_NONE
				      << (64 - NTDB_OFF_UPPER_STEAL))
			| (end - off - sizeof(struct ntdb_used_record));
		ecode = ntdb_write_off(ntdb,
				       off + offsetof(struct ntdb_free_record,
						      ftable_and_len),
				       rec.ftable_and_len);
		if (ecode != NTDB_SUCCESS) {
			goto err;
		}

		ntdb_unlock_free_bucket(ntdb, b_off);

		/* Now we can wait for the lock without deadlocking. */
		ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
					false);
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}
	} else if (NTDB_OFF_IS_ERR(*protect)) {
		/* For simplicity, we always drop lock if they can't continue */
		ntdb_unlock_free_bucket(ntdb, b_off);
	}
	ntdb->stats.alloc_coalesce_succeeded++;

	/* Return usable length. */
	return end - off - sizeof(struct ntdb_used_record);

err:
	/* To unify error paths, we *always* unlock bucket on error. */
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_ERR_TO_OFF(ecode);
}
/* List is locked: we unlock it.
 *
 * Walk up to @limit entries of the free list at @b_off, trying to
 * coalesce each with its physical neighbours.  Afterwards, rotate the
 * list so the entries we just processed end up at the tail, giving
 * the next caller fresh entries to work on.  Returns NTDB_SUCCESS or
 * an error; the bucket lock is released on every path. */
static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
				     ntdb_off_t ftable_off,
				     ntdb_off_t b_off,
				     unsigned int limit)
{
	enum NTDB_ERROR ecode;
	ntdb_off_t off;

	off = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(off)) {
		ecode = NTDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	/* A little bit of paranoia: counter should be 0. */
	off &= NTDB_OFF_MASK;

	while (off && limit--) {
		struct ntdb_free_record rec;
		ntdb_len_t coal;
		ntdb_off_t next;

		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		next = rec.next;
		/* coalesce() protects "next" for us: if it becomes an
		 * error value, coalescing consumed/moved it. */
		coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
		if (NTDB_OFF_IS_ERR(coal)) {
			/* This has already unlocked on error. */
			return NTDB_OFF_TO_ERR(coal);
		}
		if (NTDB_OFF_IS_ERR(next)) {
			/* Coalescing had to unlock, so stop. */
			return NTDB_SUCCESS;
		}
		/* Keep going if we're doing well... */
		limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
		off = next;
	}

	/* Now, move those elements to the tail of the list so we get something
	 * else next time. */
	if (off) {
		struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
		ntdb_off_t oldhoff, oldtoff, newtoff;

		/* The record we were up to is the new head. */
		ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the new tail (prev is circular, so head's prev
		 * is the current tail). */
		newtoff = frec_prev(&newhrec);
		ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
					  sizeof(newtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the old head. */
		oldhoff = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(oldhoff)) {
			ecode = NTDB_OFF_TO_ERR(oldhoff);
			goto unlock_err;
		}

		/* This could happen if they all coalesced away. */
		if (oldhoff == off)
			goto out;

		ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
					  sizeof(oldhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the old tail. */
		oldtoff = frec_prev(&oldhrec);
		ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
					  sizeof(oldtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Old tail's next points to old head. */
		oldtrec.next = oldhoff;

		/* Old head's prev points to old tail. */
		oldhrec.magic_and_prev
			= (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
			| oldtoff;

		/* New tail's next is 0. */
		newtrec.next = 0;

		/* Write out the modified versions. */
		ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
					   sizeof(oldtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
					   sizeof(oldhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
					   sizeof(newtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* And finally link in new head. */
		ecode = ntdb_write_off(ntdb, b_off, off);
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;
	}
out:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_SUCCESS;

unlock_err:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return ecode;
}
  498. /* List must not be locked if coalesce_ok is set. */
  499. enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
  500. ntdb_off_t off, ntdb_len_t len_with_header,
  501. enum ntdb_lock_flags waitflag,
  502. bool coalesce_ok)
  503. {
  504. ntdb_off_t b_off;
  505. ntdb_len_t len;
  506. enum NTDB_ERROR ecode;
  507. assert(len_with_header >= sizeof(struct ntdb_free_record));
  508. len = len_with_header - sizeof(struct ntdb_used_record);
  509. b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
  510. ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
  511. if (ecode != NTDB_SUCCESS) {
  512. return ecode;
  513. }
  514. ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);
  515. check_list(ntdb, b_off);
  516. /* Coalescing unlocks free list. */
  517. if (!ecode && coalesce_ok)
  518. ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
  519. else
  520. ntdb_unlock_free_bucket(ntdb, b_off);
  521. return ecode;
  522. }
  523. static size_t adjust_size(size_t keylen, size_t datalen)
  524. {
  525. size_t size = keylen + datalen;
  526. if (size < NTDB_MIN_DATA_LEN)
  527. size = NTDB_MIN_DATA_LEN;
  528. /* Round to next uint64_t boundary. */
  529. return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
  530. }
  531. /* If we have enough left over to be useful, split that off. */
  532. static size_t record_leftover(size_t keylen, size_t datalen,
  533. bool want_extra, size_t total_len)
  534. {
  535. ssize_t leftover;
  536. if (want_extra)
  537. datalen += datalen / 2;
  538. leftover = total_len - adjust_size(keylen, datalen);
  539. if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
  540. return 0;
  541. return leftover;
  542. }
/* We need size bytes to put our key and data in.
 *
 * Lock the given bucket and walk its free list looking for a best-fit
 * record (getting progressively less fussy as the walk lengthens).
 * On success the record is marked used (header written, padding
 * zero-terminated) and any worthwhile leftover is split off onto a
 * free list.  Returns the record offset, 0 if nothing fit, or a -ve
 * error. */
static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
				 ntdb_off_t ftable_off,
				 ntdb_off_t bucket,
				 size_t keylen, size_t datalen,
				 bool want_extra,
				 unsigned magic)
{
	ntdb_off_t off, b_off,best_off;
	struct ntdb_free_record best = { 0 };
	double multiplier;
	size_t size = adjust_size(keylen, datalen);
	enum NTDB_ERROR ecode;

	ntdb->stats.allocs++;
	b_off = bucket_off(ftable_off, bucket);

	/* FIXME: Try non-blocking wait first, to measure contention. */
	/* Lock this bucket. */
	ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
	if (ecode != NTDB_SUCCESS) {
		return NTDB_ERR_TO_OFF(ecode);
	}

	/* Start with a "best" that any real record beats. */
	best.ftable_and_len = -1ULL;
	best_off = 0;

	/* Get slack if we're after extra. */
	if (want_extra)
		multiplier = 1.5;
	else
		multiplier = 1.0;

	/* Walk the list to see if any are large enough, getting less fussy
	 * as we go. */
	off = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(off)) {
		ecode = NTDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	off &= NTDB_OFF_MASK;

	while (off) {
		const struct ntdb_free_record *r;
		ntdb_off_t next;

		r = ntdb_access_read(ntdb, off, sizeof(*r), true);
		if (NTDB_PTR_IS_ERR(r)) {
			ecode = NTDB_PTR_ERR(r);
			goto unlock_err;
		}

		if (frec_magic(r) != NTDB_FREE_MAGIC) {
			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					    "lock_and_alloc:"
					    " %llu non-free 0x%llx",
					    (long long)off,
					    (long long)r->magic_and_prev);
			ntdb_access_release(ntdb, r);
			goto unlock_err;
		}

		/* Remember the tightest fit seen so far. */
		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
			best_off = off;
			best = *r;
		}

		/* Good enough (within multiplier of what we need)?  Stop. */
		if (frec_len(&best) <= size * multiplier && best_off) {
			ntdb_access_release(ntdb, r);
			break;
		}

		/* Relax our standards the longer we search. */
		multiplier *= 1.01;

		next = r->next;
		ntdb_access_release(ntdb, r);
		off = next;
	}

	/* If we found anything at all, use it. */
	if (best_off) {
		struct ntdb_used_record rec;
		size_t leftover;

		/* We're happy with this size: take it. */
		ecode = remove_from_list(ntdb, b_off, best_off, &best);
		check_list(ntdb, b_off);
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		leftover = record_leftover(keylen, datalen, want_extra,
					   frec_len(&best));

		assert(keylen + datalen + leftover <= frec_len(&best));
		/* We need to mark non-free before we drop lock, otherwise
		 * coalesce() could try to merge it! */
		ecode = set_header(ntdb, &rec, magic, keylen, datalen,
				   frec_len(&best) - leftover);
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		/* For futureproofing, we put a 0 in any unused space. */
		if (rec_extra_padding(&rec)) {
			ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
						 + keylen + datalen, "", 1);
			if (ecode != NTDB_SUCCESS) {
				goto unlock_err;
			}
		}

		/* Bucket of leftover will be <= current bucket, so nested
		 * locking is allowed. */
		if (leftover) {
			ntdb->stats.alloc_leftover++;
			ecode = add_free_record(ntdb,
						best_off + sizeof(rec)
						+ frec_len(&best) - leftover,
						leftover, NTDB_LOCK_WAIT, false);
			if (ecode != NTDB_SUCCESS) {
				best_off = NTDB_ERR_TO_OFF(ecode);
			}
		}
		ntdb_unlock_free_bucket(ntdb, b_off);

		return best_off;
	}

	ntdb_unlock_free_bucket(ntdb, b_off);
	return 0;

unlock_err:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_ERR_TO_OFF(ecode);
}
/* Get a free block from current free list, or 0 if none, -ve on error.
 *
 * Searches buckets of at least the needed size in the current free
 * table, then walks the remaining tables (wrapping back to the first)
 * until we return to where we started.  On success, sticks with the
 * table that worked for future allocations. */
static ntdb_off_t get_free(struct ntdb_context *ntdb,
			   size_t keylen, size_t datalen, bool want_extra,
			   unsigned magic)
{
	ntdb_off_t off, ftable_off;
	ntdb_off_t start_b, b, ftable;
	bool wrapped = false;

	/* If they are growing, add 50% to get to higher bucket. */
	if (want_extra)
		start_b = size_to_bucket(adjust_size(keylen,
						     datalen + datalen / 2));
	else
		start_b = size_to_bucket(adjust_size(keylen, datalen));

	ftable_off = ntdb->ftable_off;
	ftable = ntdb->ftable;
	while (!wrapped || ftable_off != ntdb->ftable_off) {
		/* Start at exact size bucket, and search up... */
		for (b = find_free_head(ntdb, ftable_off, start_b);
		     b < NTDB_FREE_BUCKETS;
		     b = find_free_head(ntdb, ftable_off, b + 1)) {
			/* Try getting one from list. */
			off = lock_and_alloc(ntdb, ftable_off,
					     b, keylen, datalen, want_extra,
					     magic);
			if (NTDB_OFF_IS_ERR(off))
				return off;
			if (off != 0) {
				if (b == start_b)
					ntdb->stats.alloc_bucket_exact++;
				if (b == NTDB_FREE_BUCKETS - 1)
					ntdb->stats.alloc_bucket_max++;
				/* Worked? Stay using this list. */
				ntdb->ftable_off = ftable_off;
				ntdb->ftable = ftable;
				return off;
			}
			/* Didn't work. Try next bucket. */
		}

		if (NTDB_OFF_IS_ERR(b)) {
			return b;
		}

		/* Hmm, try next table. */
		ftable_off = next_ftable(ntdb, ftable_off);
		if (NTDB_OFF_IS_ERR(ftable_off)) {
			return ftable_off;
		}
		ftable++;

		if (ftable_off == 0) {
			/* End of chain: wrap to the first table. */
			wrapped = true;
			ftable_off = first_ftable(ntdb);
			if (NTDB_OFF_IS_ERR(ftable_off)) {
				return ftable_off;
			}
			ftable = 0;
		}
	}

	return 0;
}
/* Fill in a used-record header: packs magic, padding, key-length-bits
 * and key/data lengths into the two 64-bit header words.  @actuallen
 * is the total usable space (>= keylen + datalen); the excess is
 * encoded as padding.  Fails with NTDB_ERR_IO if the values are too
 * big for the encoding. */
enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
			   struct ntdb_used_record *rec,
			   unsigned magic, uint64_t keylen, uint64_t datalen,
			   uint64_t actuallen)
{
	/* Number of bit-pairs needed to encode the key length. */
	uint64_t keybits = (fls64(keylen) + 1) / 2;

	/* Layout: padding in bits 11+, keybits field at bit 43,
	 * magic at bit 48; key and data lengths share the second word,
	 * with data shifted past the key's 2*keybits bits. */
	rec->magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
		| (keybits << 43)
		| ((uint64_t)magic << 48);
	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));

	/* Encoding can fail on big values. */
	if (rec_key_length(rec) != keylen
	    || rec_data_length(rec) != datalen
	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				   "Could not encode k=%llu,d=%llu,a=%llu",
				   (long long)keylen, (long long)datalen,
				   (long long)actuallen);
	}
	return NTDB_SUCCESS;
}
  742. /* You need 'size', this tells you how much you should expand by. */
  743. ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
  744. {
  745. ntdb_off_t new_size, top_size;
  746. /* limit size in order to avoid using up huge amounts of memory for
  747. * in memory tdbs if an oddball huge record creeps in */
  748. if (size > 100 * 1024) {
  749. top_size = map_size + size * 2;
  750. } else {
  751. top_size = map_size + size * 100;
  752. }
  753. /* always make room for at least top_size more records, and at
  754. least 25% more space. if the DB is smaller than 100MiB,
  755. otherwise grow it by 10% only. */
  756. if (map_size > 100 * 1024 * 1024) {
  757. new_size = map_size * 1.10;
  758. } else {
  759. new_size = map_size * 1.25;
  760. }
  761. if (new_size < top_size)
  762. new_size = top_size;
  763. /* We always make the file a multiple of transaction page
  764. * size. This guarantees that the transaction recovery area
  765. * is always aligned, otherwise the transaction code can overwrite
  766. * itself. */
  767. new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
  768. return new_size - map_size;
  769. }
/* Expand the database.
 *
 * Grows the file by at least @size usable bytes (plus header, with
 * overallocation per ntdb_expand_adjust()) and puts the new space on
 * a free list.  Caller must hold a hash lock (or the allrecord lock);
 * the expand lock serializes concurrent growers, and a re-check after
 * taking it detects someone else having already expanded. */
static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
{
	uint64_t old_size;
	ntdb_len_t wanted;
	enum NTDB_ERROR ecode;

	/* Need to hold a hash lock to expand DB: transactions rely on it. */
	if (!(ntdb->flags & NTDB_NOLOCK)
	    && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				   "ntdb_expand: must hold lock during expand");
	}

	/* Only one person can expand file at a time. */
	ecode = ntdb_lock_expand(ntdb, F_WRLCK);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* Someone else may have expanded the file, so retry. */
	old_size = ntdb->file->map_size;
	ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
	if (ntdb->file->map_size != old_size) {
		/* It grew under us: caller will retry the alloc. */
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return NTDB_SUCCESS;
	}

	/* We need room for the record header too. */
	size = adjust_size(0, sizeof(struct ntdb_used_record) + size);
	/* Overallocate. */
	wanted = ntdb_expand_adjust(old_size, size);

	ecode = ntdb->io->expand_file(ntdb, wanted);
	if (ecode != NTDB_SUCCESS) {
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return ecode;
	}

	/* We need to drop this lock before adding free record. */
	ntdb_unlock_expand(ntdb, F_WRLCK);

	ntdb->stats.expands++;
	/* New space (starting at the old end of file) becomes free. */
	return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
}
  808. /* This won't fail: it will expand the database if it has to. */
  809. ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
  810. unsigned magic, bool growing)
  811. {
  812. ntdb_off_t off;
  813. for (;;) {
  814. enum NTDB_ERROR ecode;
  815. off = get_free(ntdb, keylen, datalen, growing, magic);
  816. if (likely(off != 0))
  817. break;
  818. ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
  819. if (ecode != NTDB_SUCCESS) {
  820. return NTDB_ERR_TO_OFF(ecode);
  821. }
  822. }
  823. return off;
  824. }