free.c 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  1. /*
  2. Trivial Database 2: free list/block handling
  3. Copyright (C) Rusty Russell 2010
  4. This library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 3 of the License, or (at your option) any later version.
  8. This library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with this library; if not, see <http://www.gnu.org/licenses/>.
  14. */
  15. #include "private.h"
  16. #include <ccan/likely/likely.h>
  17. #include <time.h>
  18. #include <assert.h>
  19. #include <limits.h>
  20. /* We have to be able to fit a free record here. */
  21. #define MIN_DATA_LEN \
  22. (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
  23. /* We have a series of free lists, each one covering a "zone" of the file.
  24. *
  25. * For each zone we have a series of per-size buckets, and a final bucket for
  26. * "too big".
  27. *
  28. * It's possible to move the free_list_head, but *only* under the allrecord
  29. * lock. */
  30. static tdb_off_t free_list_off(struct tdb_context *tdb, unsigned int list)
  31. {
  32. return tdb->header.v.free_off + list * sizeof(tdb_off_t);
  33. }
  34. /* We're a library: playing with srandom() is unfriendly. srandom_r
  35. * probably lacks portability. We don't need very random here. */
  36. static unsigned int quick_random(struct tdb_context *tdb)
  37. {
  38. return getpid() + time(NULL) + (unsigned long)tdb;
  39. }
  40. /* Start by using a random zone to spread the load. */
  41. void tdb_zone_init(struct tdb_context *tdb)
  42. {
  43. /*
  44. * We read num_zones without a proper lock, so we could have
  45. * gotten a partial read. Since zone_bits is 1 byte long, we
  46. * can trust that; even if it's increased, the number of zones
  47. * cannot have decreased. And using the map size means we
  48. * will not start with a zone which hasn't been filled yet.
  49. */
  50. tdb->last_zone = quick_random(tdb)
  51. % ((tdb->map_size >> tdb->header.v.zone_bits) + 1);
  52. }
  53. static unsigned fls64(uint64_t val)
  54. {
  55. #if HAVE_BUILTIN_CLZL
  56. if (val <= ULONG_MAX) {
  57. /* This is significantly faster! */
  58. return val ? sizeof(long) * CHAR_BIT - __builtin_clzl(val) : 0;
  59. } else {
  60. #endif
  61. uint64_t r = 64;
  62. if (!val)
  63. return 0;
  64. if (!(val & 0xffffffff00000000ull)) {
  65. val <<= 32;
  66. r -= 32;
  67. }
  68. if (!(val & 0xffff000000000000ull)) {
  69. val <<= 16;
  70. r -= 16;
  71. }
  72. if (!(val & 0xff00000000000000ull)) {
  73. val <<= 8;
  74. r -= 8;
  75. }
  76. if (!(val & 0xf000000000000000ull)) {
  77. val <<= 4;
  78. r -= 4;
  79. }
  80. if (!(val & 0xc000000000000000ull)) {
  81. val <<= 2;
  82. r -= 2;
  83. }
  84. if (!(val & 0x8000000000000000ull)) {
  85. val <<= 1;
  86. r -= 1;
  87. }
  88. return r;
  89. #if HAVE_BUILTIN_CLZL
  90. }
  91. #endif
  92. }
  93. /* In which bucket would we find a particular record size? (ignoring header) */
  94. unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len)
  95. {
  96. unsigned int bucket;
  97. /* We can't have records smaller than this. */
  98. assert(data_len >= MIN_DATA_LEN);
  99. /* Ignoring the header... */
  100. if (data_len - MIN_DATA_LEN <= 64) {
  101. /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 6. */
  102. bucket = (data_len - MIN_DATA_LEN) / 8;
  103. } else {
  104. /* After that we go power of 2. */
  105. bucket = fls64(data_len - MIN_DATA_LEN) + 2;
  106. }
  107. if (unlikely(bucket > tdb->header.v.free_buckets))
  108. bucket = tdb->header.v.free_buckets;
  109. return bucket;
  110. }
  111. /* What zone does a block belong in? */
  112. tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
  113. {
  114. assert(tdb->header_uptodate);
  115. return off >> tdb->header.v.zone_bits;
  116. }
  117. /* Returns free_buckets + 1, or list number to search. */
  118. static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
  119. {
  120. tdb_off_t first, off;
  121. /* Speculatively search for a non-zero bucket. */
  122. first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
  123. off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first),
  124. tdb->header.v.free_buckets + 1 - bucket);
  125. return bucket + off;
  126. }
  127. static int remove_from_list(struct tdb_context *tdb,
  128. tdb_off_t list, struct tdb_free_record *r)
  129. {
  130. tdb_off_t off;
  131. /* Front of list? */
  132. if (r->prev == 0) {
  133. off = free_list_off(tdb, list);
  134. } else {
  135. off = r->prev + offsetof(struct tdb_free_record, next);
  136. }
  137. /* r->prev->next = r->next */
  138. if (tdb_write_off(tdb, off, r->next)) {
  139. return -1;
  140. }
  141. if (r->next != 0) {
  142. off = r->next + offsetof(struct tdb_free_record, prev);
  143. /* r->next->prev = r->prev */
  144. if (tdb_write_off(tdb, off, r->prev)) {
  145. return -1;
  146. }
  147. }
  148. return 0;
  149. }
  150. /* Enqueue in this free list. */
  151. static int enqueue_in_free(struct tdb_context *tdb,
  152. tdb_off_t list,
  153. tdb_off_t off,
  154. struct tdb_free_record *new)
  155. {
  156. new->prev = 0;
  157. /* new->next = head. */
  158. new->next = tdb_read_off(tdb, free_list_off(tdb, list));
  159. if (new->next == TDB_OFF_ERR)
  160. return -1;
  161. if (new->next) {
  162. /* next->prev = new. */
  163. if (tdb_write_off(tdb, new->next
  164. + offsetof(struct tdb_free_record, prev),
  165. off) != 0)
  166. return -1;
  167. }
  168. /* head = new */
  169. if (tdb_write_off(tdb, free_list_off(tdb, list), off) != 0)
  170. return -1;
  171. return tdb_write_convert(tdb, off, new, sizeof(*new));
  172. }
  173. /* List isn't locked. */
  174. int add_free_record(struct tdb_context *tdb,
  175. tdb_off_t off, tdb_len_t len_with_header)
  176. {
  177. struct tdb_free_record new;
  178. tdb_off_t list;
  179. int ret;
  180. assert(len_with_header >= sizeof(new));
  181. new.magic = TDB_FREE_MAGIC;
  182. new.data_len = len_with_header - sizeof(struct tdb_used_record);
  183. tdb->last_zone = zone_of(tdb, off);
  184. list = tdb->last_zone * (tdb->header.v.free_buckets+1)
  185. + size_to_bucket(tdb, new.data_len);
  186. if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0)
  187. return -1;
  188. ret = enqueue_in_free(tdb, list, off, &new);
  189. tdb_unlock_free_list(tdb, list);
  190. return ret;
  191. }
  192. /* If we have enough left over to be useful, split that off. */
  193. static int to_used_record(struct tdb_context *tdb,
  194. tdb_off_t off,
  195. tdb_len_t needed,
  196. tdb_len_t total_len,
  197. tdb_len_t *actual)
  198. {
  199. struct tdb_used_record used;
  200. tdb_len_t leftover;
  201. leftover = total_len - needed;
  202. if (leftover < sizeof(struct tdb_free_record))
  203. leftover = 0;
  204. *actual = total_len - leftover;
  205. if (leftover) {
  206. if (add_free_record(tdb, off + sizeof(used) + *actual,
  207. total_len - needed))
  208. return -1;
  209. }
  210. return 0;
  211. }
  212. /* Note: we unlock the current list if we coalesce or fail. */
  213. static int coalesce(struct tdb_context *tdb, tdb_off_t off,
  214. tdb_off_t list, tdb_len_t data_len)
  215. {
  216. struct tdb_free_record pad, *r;
  217. tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
  218. while (!tdb->methods->oob(tdb, end + sizeof(*r), 1)) {
  219. tdb_off_t nlist;
  220. r = tdb_get(tdb, end, &pad, sizeof(pad));
  221. if (!r)
  222. goto err;
  223. if (r->magic != TDB_FREE_MAGIC)
  224. break;
  225. nlist = zone_of(tdb, end) * (tdb->header.v.free_buckets+1)
  226. + size_to_bucket(tdb, r->data_len);
  227. /* We may be violating lock order here, so best effort. */
  228. if (tdb_lock_free_list(tdb, nlist, TDB_LOCK_NOWAIT) == -1)
  229. break;
  230. /* Now we have lock, re-check. */
  231. r = tdb_get(tdb, end, &pad, sizeof(pad));
  232. if (!r) {
  233. tdb_unlock_free_list(tdb, nlist);
  234. goto err;
  235. }
  236. if (unlikely(r->magic != TDB_FREE_MAGIC)) {
  237. tdb_unlock_free_list(tdb, nlist);
  238. break;
  239. }
  240. if (remove_from_list(tdb, nlist, r) == -1) {
  241. tdb_unlock_free_list(tdb, nlist);
  242. goto err;
  243. }
  244. end += sizeof(struct tdb_used_record) + r->data_len;
  245. tdb_unlock_free_list(tdb, nlist);
  246. }
  247. /* Didn't find any adjacent free? */
  248. if (end == off + sizeof(struct tdb_used_record) + data_len)
  249. return 0;
  250. /* OK, expand record */
  251. r = tdb_get(tdb, off, &pad, sizeof(pad));
  252. if (!r)
  253. goto err;
  254. if (r->data_len != data_len) {
  255. tdb->ecode = TDB_ERR_CORRUPT;
  256. tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
  257. "coalesce: expected data len %llu not %llu\n",
  258. (long long)data_len, (long long)r->data_len);
  259. goto err;
  260. }
  261. if (remove_from_list(tdb, list, r) == -1)
  262. goto err;
  263. /* We have to drop this to avoid deadlocks. */
  264. tdb_unlock_free_list(tdb, list);
  265. if (add_free_record(tdb, off, end - off) == -1)
  266. return -1;
  267. return 1;
  268. err:
  269. /* To unify error paths, we *always* unlock list. */
  270. tdb_unlock_free_list(tdb, list);
  271. return -1;
  272. }
  273. /* We need size bytes to put our key and data in. */
  274. static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
  275. tdb_off_t bucket, size_t size,
  276. tdb_len_t *actual)
  277. {
  278. tdb_off_t list;
  279. tdb_off_t off, best_off;
  280. struct tdb_free_record pad, best = { 0 }, *r;
  281. double multiplier;
  282. again:
  283. list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
  284. /* Lock this list. */
  285. if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) == -1) {
  286. return TDB_OFF_ERR;
  287. }
  288. best.data_len = -1ULL;
  289. best_off = 0;
  290. multiplier = 1.0;
  291. /* Walk the list to see if any are large enough, getting less fussy
  292. * as we go. */
  293. off = tdb_read_off(tdb, free_list_off(tdb, list));
  294. if (unlikely(off == TDB_OFF_ERR))
  295. goto unlock_err;
  296. while (off) {
  297. r = tdb_get(tdb, off, &pad, sizeof(*r));
  298. if (!r)
  299. goto unlock_err;
  300. if (r->magic != TDB_FREE_MAGIC) {
  301. tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
  302. "lock_and_alloc: %llu non-free 0x%llx\n",
  303. (long long)off, (long long)r->magic);
  304. goto unlock_err;
  305. }
  306. if (r->data_len >= size && r->data_len < best.data_len) {
  307. best_off = off;
  308. best = *r;
  309. }
  310. if (best.data_len < size * multiplier && best_off)
  311. goto use_best;
  312. multiplier *= 1.01;
  313. /* Since we're going slow anyway, try coalescing here. */
  314. switch (coalesce(tdb, off, list, r->data_len)) {
  315. case -1:
  316. /* This has already unlocked on error. */
  317. return -1;
  318. case 1:
  319. /* This has unlocked list, restart. */
  320. goto again;
  321. }
  322. off = r->next;
  323. }
  324. /* If we found anything at all, use it. */
  325. if (best_off) {
  326. use_best:
  327. /* We're happy with this size: take it. */
  328. if (remove_from_list(tdb, list, &best) != 0)
  329. goto unlock_err;
  330. tdb_unlock_free_list(tdb, list);
  331. if (to_used_record(tdb, best_off, size, best.data_len,
  332. actual)) {
  333. return -1;
  334. }
  335. return best_off;
  336. }
  337. tdb_unlock_free_list(tdb, list);
  338. return 0;
  339. unlock_err:
  340. tdb_unlock_free_list(tdb, list);
  341. return TDB_OFF_ERR;
  342. }
  343. /* We want a really big chunk. Look through every zone's oversize bucket */
  344. static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
  345. tdb_len_t *actual)
  346. {
  347. tdb_off_t i, off;
  348. for (i = 0; i < tdb->header.v.num_zones; i++) {
  349. /* Try getting one from list. */
  350. off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
  351. size, actual);
  352. if (off == TDB_OFF_ERR)
  353. return TDB_OFF_ERR;
  354. if (off != 0)
  355. return off;
  356. /* FIXME: Coalesce! */
  357. }
  358. return 0;
  359. }
  360. static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
  361. tdb_len_t *actual)
  362. {
  363. tdb_off_t off, bucket;
  364. unsigned int num_empty, step = 0;
  365. bucket = size_to_bucket(tdb, size);
  366. /* If we're after something bigger than a single zone, handle
  367. * specially. */
  368. if (unlikely(sizeof(struct tdb_used_record) + size
  369. >= (1ULL << tdb->header.v.zone_bits))) {
  370. return huge_alloc(tdb, size, actual);
  371. }
  372. /* Number of zones we search is proportional to the log of them. */
  373. for (num_empty = 0; num_empty < fls64(tdb->header.v.num_zones);
  374. num_empty++) {
  375. tdb_off_t b;
  376. /* Start at exact size bucket, and search up... */
  377. for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
  378. b = find_free_head(tdb, b);
  379. /* Non-empty list? Try getting block. */
  380. if (b <= tdb->header.v.free_buckets) {
  381. /* Try getting one from list. */
  382. off = lock_and_alloc(tdb, b, size, actual);
  383. if (off == TDB_OFF_ERR)
  384. return TDB_OFF_ERR;
  385. if (off != 0)
  386. return off;
  387. /* Didn't work. Try next bucket. */
  388. }
  389. }
  390. /* Try another zone, at pseudo random. Avoid duplicates by
  391. using an odd step. */
  392. if (step == 0)
  393. step = ((quick_random(tdb)) % 65536) * 2 + 1;
  394. tdb->last_zone = (tdb->last_zone + step)
  395. % tdb->header.v.num_zones;
  396. }
  397. return 0;
  398. }
  399. int set_header(struct tdb_context *tdb,
  400. struct tdb_used_record *rec,
  401. uint64_t keylen, uint64_t datalen,
  402. uint64_t actuallen, uint64_t hash)
  403. {
  404. uint64_t keybits = (fls64(keylen) + 1) / 2;
  405. /* Use top bits of hash, so it's independent of hash table size. */
  406. rec->magic_and_meta
  407. = (actuallen - (keylen + datalen))
  408. | ((hash >> 53) << 32)
  409. | (keybits << 43)
  410. | (TDB_MAGIC << 48);
  411. rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
  412. /* Encoding can fail on big values. */
  413. if (rec_key_length(rec) != keylen
  414. || rec_data_length(rec) != datalen
  415. || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
  416. tdb->ecode = TDB_ERR_IO;
  417. tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
  418. "Could not encode k=%llu,d=%llu,a=%llu\n",
  419. (long long)keylen, (long long)datalen,
  420. (long long)actuallen);
  421. return -1;
  422. }
  423. return 0;
  424. }
  425. static tdb_len_t adjust_size(size_t keylen, size_t datalen, bool growing)
  426. {
  427. tdb_len_t size = keylen + datalen;
  428. if (size < MIN_DATA_LEN)
  429. size = MIN_DATA_LEN;
  430. /* Overallocate if this is coming from an enlarging store. */
  431. if (growing)
  432. size += datalen / 2;
  433. /* Round to next uint64_t boundary. */
  434. return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
  435. }
  436. /* If this fails, try tdb_expand. */
  437. tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
  438. uint64_t hash, bool growing)
  439. {
  440. tdb_off_t off;
  441. tdb_len_t size, actual;
  442. struct tdb_used_record rec;
  443. /* We don't want header to change during this! */
  444. assert(tdb->header_uptodate);
  445. size = adjust_size(keylen, datalen, growing);
  446. off = get_free(tdb, size, &actual);
  447. if (unlikely(off == TDB_OFF_ERR || off == 0))
  448. return off;
  449. /* Some supergiant values can't be encoded. */
  450. if (set_header(tdb, &rec, keylen, datalen, actual, hash) != 0) {
  451. add_free_record(tdb, off, sizeof(rec) + actual);
  452. return TDB_OFF_ERR;
  453. }
  454. if (tdb_write_convert(tdb, off, &rec, sizeof(rec)) != 0)
  455. return TDB_OFF_ERR;
  456. return off;
  457. }
  458. static bool larger_buckets_might_help(struct tdb_context *tdb)
  459. {
  460. /* If our buckets are already covering 1/8 of a zone, don't
  461. * bother (note: might become an 1/16 of a zone if we double
  462. * zone size). */
  463. tdb_len_t size = (1ULL << tdb->header.v.zone_bits) / 8;
  464. if (size >= MIN_DATA_LEN
  465. && size_to_bucket(tdb, size) < tdb->header.v.free_buckets) {
  466. return false;
  467. }
  468. /* FIXME: Put stats in tdb_context or examine db itself! */
  469. /* It's fairly cheap to do as we expand database. */
  470. return true;
  471. }
  472. static bool zones_happy(struct tdb_context *tdb)
  473. {
  474. /* FIXME: look at distribution of zones. */
  475. return true;
  476. }
  477. /* Returns how much extra room we get, or TDB_OFF_ERR. */
  478. static tdb_len_t expand_to_fill_zones(struct tdb_context *tdb)
  479. {
  480. tdb_len_t add;
  481. /* We can enlarge zones without enlarging file to match. */
  482. add = (tdb->header.v.num_zones<<tdb->header.v.zone_bits)
  483. - tdb->map_size;
  484. if (add <= sizeof(struct tdb_free_record))
  485. return 0;
  486. /* Updates tdb->map_size. */
  487. if (tdb->methods->expand_file(tdb, add) == -1)
  488. return TDB_OFF_ERR;
  489. if (add_free_record(tdb, tdb->map_size - add, add) == -1)
  490. return TDB_OFF_ERR;
  491. return add;
  492. }
  493. static int update_zones(struct tdb_context *tdb,
  494. uint64_t new_num_zones,
  495. uint64_t new_zone_bits,
  496. uint64_t new_num_buckets,
  497. tdb_len_t add)
  498. {
  499. tdb_len_t freebucket_size;
  500. const tdb_off_t *oldf;
  501. tdb_off_t i, off, old_num_total, old_free_off;
  502. struct tdb_used_record fhdr;
  503. /* Updates tdb->map_size. */
  504. if (tdb->methods->expand_file(tdb, add) == -1)
  505. return -1;
  506. /* Use first part as new free bucket array. */
  507. off = tdb->map_size - add;
  508. freebucket_size = new_num_zones
  509. * (new_num_buckets + 1) * sizeof(tdb_off_t);
  510. /* Write header. */
  511. if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
  512. return -1;
  513. if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
  514. return -1;
  515. /* Adjust off to point to start of buckets, add to be remainder. */
  516. add -= freebucket_size + sizeof(fhdr);
  517. off += sizeof(fhdr);
  518. /* Access the old zones. */
  519. old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
  520. old_free_off = tdb->header.v.free_off;
  521. oldf = tdb_access_read(tdb, old_free_off,
  522. old_num_total * sizeof(tdb_off_t), true);
  523. if (!oldf)
  524. return -1;
  525. /* Switch to using our new zone. */
  526. if (zero_out(tdb, off, freebucket_size) == -1)
  527. goto fail_release;
  528. tdb->header.v.free_off = off;
  529. tdb->header.v.num_zones = new_num_zones;
  530. tdb->header.v.zone_bits = new_zone_bits;
  531. tdb->header.v.free_buckets = new_num_buckets;
  532. /* FIXME: If zone size hasn't changed, can simply copy pointers. */
  533. /* FIXME: Coalesce? */
  534. for (i = 0; i < old_num_total; i++) {
  535. tdb_off_t next;
  536. struct tdb_free_record rec;
  537. tdb_off_t list;
  538. for (off = oldf[i]; off; off = next) {
  539. if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
  540. goto fail_release;
  541. list = zone_of(tdb, off)
  542. * (tdb->header.v.free_buckets+1)
  543. + size_to_bucket(tdb, rec.data_len);
  544. next = rec.next;
  545. if (enqueue_in_free(tdb, list, off, &rec) == -1)
  546. goto fail_release;
  547. }
  548. }
  549. /* Free up the old free buckets. */
  550. old_free_off -= sizeof(fhdr);
  551. if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
  552. goto fail_release;
  553. if (add_free_record(tdb, old_free_off,
  554. sizeof(fhdr)
  555. + rec_data_length(&fhdr)
  556. + rec_extra_padding(&fhdr)))
  557. goto fail_release;
  558. /* Add the rest as a new free record. */
  559. if (add_free_record(tdb, tdb->map_size - add, add) == -1)
  560. goto fail_release;
  561. /* Start allocating from where the new space is. */
  562. tdb->last_zone = zone_of(tdb, tdb->map_size - add);
  563. tdb_access_release(tdb, oldf);
  564. return write_header(tdb);
  565. fail_release:
  566. tdb_access_release(tdb, oldf);
  567. return -1;
  568. }
  569. /* Expand the database. */
  570. int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
  571. bool growing)
  572. {
  573. uint64_t new_num_buckets, new_num_zones, new_zone_bits;
  574. uint64_t old_num_zones, old_size, old_zone_bits;
  575. tdb_len_t add, needed;
  576. /* We need room for the record header too. */
  577. needed = sizeof(struct tdb_used_record)
  578. + adjust_size(klen, dlen, growing);
  579. /* tdb_allrecord_lock will update header; did zones change? */
  580. old_zone_bits = tdb->header.v.zone_bits;
  581. old_num_zones = tdb->header.v.num_zones;
  582. /* FIXME: this is overkill. An expand lock? */
  583. if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
  584. return -1;
  585. /* Someone may have expanded for us. */
  586. if (old_zone_bits != tdb->header.v.zone_bits
  587. || old_num_zones != tdb->header.v.num_zones)
  588. goto success;
  589. /* They may have also expanded the underlying size (otherwise we'd
  590. * have expanded our mmap to look at those offsets already). */
  591. old_size = tdb->map_size;
  592. tdb->methods->oob(tdb, tdb->map_size + 1, true);
  593. if (tdb->map_size != old_size)
  594. goto success;
  595. add = expand_to_fill_zones(tdb);
  596. if (add == TDB_OFF_ERR)
  597. goto fail;
  598. if (add >= needed) {
  599. /* Allocate from this zone. */
  600. tdb->last_zone = zone_of(tdb, tdb->map_size - add);
  601. goto success;
  602. }
  603. /* Slow path. Should we increase the number of buckets? */
  604. new_num_buckets = tdb->header.v.free_buckets;
  605. if (larger_buckets_might_help(tdb))
  606. new_num_buckets++;
  607. /* Now we'll need room for the new free buckets, too. Assume
  608. * worst case (zones expand). */
  609. needed += sizeof(struct tdb_used_record)
  610. + ((tdb->header.v.num_zones+1)
  611. * (new_num_buckets+1) * sizeof(tdb_off_t));
  612. /* If we need less that one zone, and they're working well, just add
  613. * another one. */
  614. if (needed < (1UL<<tdb->header.v.zone_bits) && zones_happy(tdb)) {
  615. new_num_zones = tdb->header.v.num_zones+1;
  616. new_zone_bits = tdb->header.v.zone_bits;
  617. add = 1ULL << tdb->header.v.zone_bits;
  618. } else {
  619. /* Increase the zone size. */
  620. new_num_zones = tdb->header.v.num_zones;
  621. new_zone_bits = tdb->header.v.zone_bits+1;
  622. while ((new_num_zones << new_zone_bits)
  623. < tdb->map_size + needed) {
  624. new_zone_bits++;
  625. }
  626. /* We expand by enough full zones to meet the need. */
  627. add = ((tdb->map_size + needed + (1ULL << new_zone_bits)-1)
  628. & ~((1ULL << new_zone_bits)-1))
  629. - tdb->map_size;
  630. }
  631. if (update_zones(tdb, new_num_zones, new_zone_bits, new_num_buckets,
  632. add) == -1)
  633. goto fail;
  634. success:
  635. tdb_allrecord_unlock(tdb, F_WRLCK);
  636. return 0;
  637. fail:
  638. tdb_allrecord_unlock(tdb, F_WRLCK);
  639. return -1;
  640. }