free.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730
  1. /*
  2. Trivial Database 2: free list/block handling
  3. Copyright (C) Rusty Russell 2010
  4. This library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 3 of the License, or (at your option) any later version.
  8. This library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with this library; if not, see <http://www.gnu.org/licenses/>.
  14. */
  15. #include "private.h"
  16. #include <ccan/likely/likely.h>
  17. #include <time.h>
  18. #include <assert.h>
  19. #include <limits.h>
  /* Smallest data length any record may have: the record must be able to
   * hold a free record once it is released. */
  #define MIN_DATA_LEN \
      (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
  23. /* We have a series of free lists, each one covering a "zone" of the file.
  24. *
  25. * For each zone we have a series of per-size buckets, and a final bucket for
  26. * "too big".
  27. *
  28. * It's possible to move the free_list_head, but *only* under the allrecord
  29. * lock. */
  30. static tdb_off_t free_list_off(struct tdb_context *tdb, unsigned int list)
  31. {
  32. return tdb->header.v.free_off + list * sizeof(tdb_off_t);
  33. }
  /* Cheap pseudo-random value.  As a library we must not disturb the
   * caller's srandom() state, and srandom_r is probably not portable.
   * Low-quality randomness is all we need here. */
  static unsigned int quick_random(struct tdb_context *tdb)
  {
      unsigned long addr = (unsigned long)tdb;

      return getpid() + time(NULL) + addr;
  }
  40. /* Start by using a random zone to spread the load. */
  41. void tdb_zone_init(struct tdb_context *tdb)
  42. {
  43. /*
  44. * We read num_zones without a proper lock, so we could have
  45. * gotten a partial read. Since zone_bits is 1 byte long, we
  46. * can trust that; even if it's increased, the number of zones
  47. * cannot have decreased. And using the map size means we
  48. * will not start with a zone which hasn't been filled yet.
  49. */
  50. tdb->last_zone = quick_random(tdb)
  51. % ((tdb->map_size >> tdb->header.v.zone_bits) + 1);
  52. }
  /* Find last set: returns the 1-based index of the most significant set
   * bit of @val (fls64(1) == 1, fls64(1ULL << 63) == 64), or 0 when @val
   * is 0. */
  static unsigned fls64(uint64_t val)
  {
      unsigned int bits = 64;
      unsigned int shift;

  #if HAVE_BUILTIN_CLZL
      /* The builtin is significantly faster, but it is only usable when
       * the value fits in an unsigned long. */
      if (val <= ULONG_MAX) {
          return val ? (unsigned)(sizeof(long) * CHAR_BIT)
                       - (unsigned)__builtin_clzl(val)
                     : 0;
      }
  #endif
      if (!val)
          return 0;

      /* Binary search for the top bit: whenever the upper @shift bits are
       * all clear, shift them out and lower the answer by the same amount. */
      for (shift = 32; shift != 0; shift /= 2) {
          if ((val >> (64 - shift)) == 0) {
              val <<= shift;
              bits -= shift;
          }
      }
      return bits;
  }
  93. /* In which bucket would we find a particular record size? (ignoring header) */
  94. unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len)
  95. {
  96. unsigned int bucket;
  97. /* We can't have records smaller than this. */
  98. assert(data_len >= MIN_DATA_LEN);
  99. /* Ignoring the header... */
  100. if (data_len - MIN_DATA_LEN <= 64) {
  101. /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 6. */
  102. bucket = (data_len - MIN_DATA_LEN) / 8;
  103. } else {
  104. /* After that we go power of 2. */
  105. bucket = fls64(data_len - MIN_DATA_LEN) + 2;
  106. }
  107. if (unlikely(bucket > tdb->header.v.free_buckets))
  108. bucket = tdb->header.v.free_buckets;
  109. return bucket;
  110. }
  111. /* What zone does a block belong in? */
  112. tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
  113. {
  114. assert(tdb->header_uptodate);
  115. return off >> tdb->header.v.zone_bits;
  116. }
  117. /* Returns free_buckets + 1, or list number to search. */
  118. static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
  119. {
  120. tdb_off_t first, off;
  121. /* Speculatively search for a non-zero bucket. */
  122. first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
  123. off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first),
  124. tdb->header.v.free_buckets + 1 - bucket);
  125. return bucket + off;
  126. }
  127. static int remove_from_list(struct tdb_context *tdb,
  128. tdb_off_t list, struct tdb_free_record *r)
  129. {
  130. tdb_off_t off;
  131. /* Front of list? */
  132. if (r->prev == 0) {
  133. off = free_list_off(tdb, list);
  134. } else {
  135. off = r->prev + offsetof(struct tdb_free_record, next);
  136. }
  137. /* r->prev->next = r->next */
  138. if (tdb_write_off(tdb, off, r->next)) {
  139. return -1;
  140. }
  141. if (r->next != 0) {
  142. off = r->next + offsetof(struct tdb_free_record, prev);
  143. /* r->next->prev = r->prev */
  144. if (tdb_write_off(tdb, off, r->prev)) {
  145. return -1;
  146. }
  147. }
  148. return 0;
  149. }
  150. /* Enqueue in this free list. */
  151. static int enqueue_in_free(struct tdb_context *tdb,
  152. tdb_off_t list,
  153. tdb_off_t off,
  154. struct tdb_free_record *new)
  155. {
  156. new->prev = 0;
  157. /* new->next = head. */
  158. new->next = tdb_read_off(tdb, free_list_off(tdb, list));
  159. if (new->next == TDB_OFF_ERR)
  160. return -1;
  161. if (new->next) {
  162. /* next->prev = new. */
  163. if (tdb_write_off(tdb, new->next
  164. + offsetof(struct tdb_free_record, prev),
  165. off) != 0)
  166. return -1;
  167. }
  168. /* head = new */
  169. if (tdb_write_off(tdb, free_list_off(tdb, list), off) != 0)
  170. return -1;
  171. return tdb_write_convert(tdb, off, new, sizeof(*new));
  172. }
  173. /* List isn't locked. */
  174. int add_free_record(struct tdb_context *tdb,
  175. tdb_off_t off, tdb_len_t len_with_header)
  176. {
  177. struct tdb_free_record new;
  178. tdb_off_t list;
  179. int ret;
  180. assert(len_with_header >= sizeof(new));
  181. new.magic = TDB_FREE_MAGIC;
  182. new.data_len = len_with_header - sizeof(struct tdb_used_record);
  183. tdb->last_zone = zone_of(tdb, off);
  184. list = tdb->last_zone * (tdb->header.v.free_buckets+1)
  185. + size_to_bucket(tdb, new.data_len);
  186. if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0)
  187. return -1;
  188. ret = enqueue_in_free(tdb, list, off, &new);
  189. tdb_unlock_free_list(tdb, list);
  190. return ret;
  191. }
  192. /* If we have enough left over to be useful, split that off. */
  193. static int to_used_record(struct tdb_context *tdb,
  194. tdb_off_t off,
  195. tdb_len_t needed,
  196. tdb_len_t total_len,
  197. tdb_len_t *actual)
  198. {
  199. struct tdb_used_record used;
  200. tdb_len_t leftover;
  201. leftover = total_len - needed;
  202. if (leftover < sizeof(struct tdb_free_record))
  203. leftover = 0;
  204. *actual = total_len - leftover;
  205. if (leftover) {
  206. if (add_free_record(tdb, off + sizeof(used) + *actual,
  207. total_len - needed))
  208. return -1;
  209. }
  210. return 0;
  211. }
  212. /* Note: we unlock the current list if we coalesce or fail. */
  213. static int coalesce(struct tdb_context *tdb, tdb_off_t off,
  214. tdb_off_t list, tdb_len_t data_len)
  215. {
  216. struct tdb_free_record pad, *r;
  217. tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
  218. while (!tdb->methods->oob(tdb, end + sizeof(*r), 1)) {
  219. tdb_off_t nlist;
  220. r = tdb_get(tdb, end, &pad, sizeof(pad));
  221. if (!r)
  222. goto err;
  223. if (r->magic != TDB_FREE_MAGIC)
  224. break;
  225. nlist = zone_of(tdb, end) * (tdb->header.v.free_buckets+1)
  226. + size_to_bucket(tdb, r->data_len);
  227. /* We may be violating lock order here, so best effort. */
  228. if (tdb_lock_free_list(tdb, nlist, TDB_LOCK_NOWAIT) == -1)
  229. break;
  230. /* Now we have lock, re-check. */
  231. r = tdb_get(tdb, end, &pad, sizeof(pad));
  232. if (!r) {
  233. tdb_unlock_free_list(tdb, nlist);
  234. goto err;
  235. }
  236. if (unlikely(r->magic != TDB_FREE_MAGIC)) {
  237. tdb_unlock_free_list(tdb, nlist);
  238. break;
  239. }
  240. if (remove_from_list(tdb, list, r) == -1) {
  241. tdb_unlock_free_list(tdb, nlist);
  242. goto err;
  243. }
  244. end += sizeof(struct tdb_used_record) + r->data_len;
  245. tdb_unlock_free_list(tdb, nlist);
  246. }
  247. /* Didn't find any adjacent free? */
  248. if (end == off + sizeof(struct tdb_used_record) + data_len)
  249. return 0;
  250. /* OK, expand record */
  251. r = tdb_get(tdb, off, &pad, sizeof(pad));
  252. if (!r)
  253. goto err;
  254. if (remove_from_list(tdb, list, r) == -1)
  255. goto err;
  256. /* We have to drop this to avoid deadlocks. */
  257. tdb_unlock_free_list(tdb, list);
  258. if (add_free_record(tdb, off, end - off) == -1)
  259. return -1;
  260. return 1;
  261. err:
  262. /* To unify error paths, we *always* unlock list. */
  263. tdb_unlock_free_list(tdb, list);
  264. return -1;
  265. }
  266. /* We need size bytes to put our key and data in. */
  267. static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
  268. tdb_off_t bucket, size_t size,
  269. tdb_len_t *actual)
  270. {
  271. tdb_off_t list;
  272. tdb_off_t off, best_off;
  273. struct tdb_free_record pad, best = { 0 }, *r;
  274. double multiplier;
  275. again:
  276. list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
  277. /* Lock this list. */
  278. if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) == -1) {
  279. return TDB_OFF_ERR;
  280. }
  281. best.data_len = -1ULL;
  282. best_off = 0;
  283. multiplier = 1.0;
  284. /* Walk the list to see if any are large enough, getting less fussy
  285. * as we go. */
  286. off = tdb_read_off(tdb, free_list_off(tdb, list));
  287. if (unlikely(off == TDB_OFF_ERR))
  288. goto unlock_err;
  289. while (off) {
  290. r = tdb_get(tdb, off, &pad, sizeof(*r));
  291. if (!r)
  292. goto unlock_err;
  293. if (r->magic != TDB_FREE_MAGIC) {
  294. tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
  295. "lock_and_alloc: %llu non-free 0x%llx\n",
  296. (long long)off, (long long)r->magic);
  297. goto unlock_err;
  298. }
  299. if (r->data_len >= size && r->data_len < best.data_len) {
  300. best_off = off;
  301. best = *r;
  302. }
  303. if (best.data_len < size * multiplier && best_off)
  304. goto use_best;
  305. multiplier *= 1.01;
  306. /* Since we're going slow anyway, try coalescing here. */
  307. switch (coalesce(tdb, off, list, r->data_len)) {
  308. case -1:
  309. /* This has already unlocked on error. */
  310. return -1;
  311. case 1:
  312. /* This has unlocked list, restart. */
  313. goto again;
  314. }
  315. off = r->next;
  316. }
  317. /* If we found anything at all, use it. */
  318. if (best_off) {
  319. use_best:
  320. /* We're happy with this size: take it. */
  321. if (remove_from_list(tdb, list, &best) != 0)
  322. goto unlock_err;
  323. tdb_unlock_free_list(tdb, list);
  324. if (to_used_record(tdb, best_off, size, best.data_len,
  325. actual)) {
  326. return -1;
  327. }
  328. return best_off;
  329. }
  330. tdb_unlock_free_list(tdb, list);
  331. return 0;
  332. unlock_err:
  333. tdb_unlock_free_list(tdb, list);
  334. return TDB_OFF_ERR;
  335. }
  336. /* We want a really big chunk. Look through every zone's oversize bucket */
  337. static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
  338. tdb_len_t *actual)
  339. {
  340. tdb_off_t i, off;
  341. for (i = 0; i < tdb->header.v.num_zones; i++) {
  342. /* Try getting one from list. */
  343. off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
  344. size, actual);
  345. if (off == TDB_OFF_ERR)
  346. return TDB_OFF_ERR;
  347. if (off != 0)
  348. return off;
  349. /* FIXME: Coalesce! */
  350. }
  351. return 0;
  352. }
  353. static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
  354. tdb_len_t *actual)
  355. {
  356. tdb_off_t off, bucket;
  357. unsigned int num_empty, step = 0;
  358. bucket = size_to_bucket(tdb, size);
  359. /* If we're after something bigger than a single zone, handle
  360. * specially. */
  361. if (unlikely(sizeof(struct tdb_used_record) + size
  362. >= (1ULL << tdb->header.v.zone_bits))) {
  363. return huge_alloc(tdb, size, actual);
  364. }
  365. /* Number of zones we search is proportional to the log of them. */
  366. for (num_empty = 0; num_empty < fls64(tdb->header.v.num_zones);
  367. num_empty++) {
  368. tdb_off_t b;
  369. /* Start at exact size bucket, and search up... */
  370. for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
  371. b = find_free_head(tdb, b);
  372. /* Non-empty list? Try getting block. */
  373. if (b <= tdb->header.v.free_buckets) {
  374. /* Try getting one from list. */
  375. off = lock_and_alloc(tdb, b, size, actual);
  376. if (off == TDB_OFF_ERR)
  377. return TDB_OFF_ERR;
  378. if (off != 0)
  379. return off;
  380. /* Didn't work. Try next bucket. */
  381. }
  382. }
  383. /* Try another zone, at pseudo random. Avoid duplicates by
  384. using an odd step. */
  385. if (step == 0)
  386. step = ((quick_random(tdb)) % 65536) * 2 + 1;
  387. tdb->last_zone = (tdb->last_zone + step)
  388. % tdb->header.v.num_zones;
  389. }
  390. return 0;
  391. }
  392. int set_header(struct tdb_context *tdb,
  393. struct tdb_used_record *rec,
  394. uint64_t keylen, uint64_t datalen,
  395. uint64_t actuallen, uint64_t hash)
  396. {
  397. uint64_t keybits = (fls64(keylen) + 1) / 2;
  398. /* Use top bits of hash, so it's independent of hash table size. */
  399. rec->magic_and_meta
  400. = (actuallen - (keylen + datalen))
  401. | ((hash >> 53) << 32)
  402. | (keybits << 43)
  403. | (TDB_MAGIC << 48);
  404. rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
  405. /* Encoding can fail on big values. */
  406. if (rec_key_length(rec) != keylen
  407. || rec_data_length(rec) != datalen
  408. || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
  409. tdb->ecode = TDB_ERR_IO;
  410. tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
  411. "Could not encode k=%llu,d=%llu,a=%llu\n",
  412. (long long)keylen, (long long)datalen,
  413. (long long)actuallen);
  414. return -1;
  415. }
  416. return 0;
  417. }
  418. static tdb_len_t adjust_size(size_t keylen, size_t datalen, bool growing)
  419. {
  420. tdb_len_t size = keylen + datalen;
  421. if (size < MIN_DATA_LEN)
  422. size = MIN_DATA_LEN;
  423. /* Overallocate if this is coming from an enlarging store. */
  424. if (growing)
  425. size += datalen / 2;
  426. /* Round to next uint64_t boundary. */
  427. return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
  428. }
  429. /* If this fails, try tdb_expand. */
  430. tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
  431. uint64_t hash, bool growing)
  432. {
  433. tdb_off_t off;
  434. tdb_len_t size, actual;
  435. struct tdb_used_record rec;
  436. /* We don't want header to change during this! */
  437. assert(tdb->header_uptodate);
  438. size = adjust_size(keylen, datalen, growing);
  439. off = get_free(tdb, size, &actual);
  440. if (unlikely(off == TDB_OFF_ERR || off == 0))
  441. return off;
  442. /* Some supergiant values can't be encoded. */
  443. if (set_header(tdb, &rec, keylen, datalen, actual, hash) != 0) {
  444. add_free_record(tdb, off, sizeof(rec) + actual);
  445. return TDB_OFF_ERR;
  446. }
  447. if (tdb_write_convert(tdb, off, &rec, sizeof(rec)) != 0)
  448. return TDB_OFF_ERR;
  449. return off;
  450. }
  451. static bool larger_buckets_might_help(struct tdb_context *tdb)
  452. {
  453. /* If our buckets are already covering 1/8 of a zone, don't
  454. * bother (note: might become an 1/16 of a zone if we double
  455. * zone size). */
  456. tdb_len_t size = (1ULL << tdb->header.v.zone_bits) / 8;
  457. if (size >= MIN_DATA_LEN
  458. && size_to_bucket(tdb, size) < tdb->header.v.free_buckets) {
  459. return false;
  460. }
  461. /* FIXME: Put stats in tdb_context or examine db itself! */
  462. /* It's fairly cheap to do as we expand database. */
  463. return true;
  464. }
  /* Placeholder: always reports true.  tdb_expand() uses this to decide
   * whether adding another zone is acceptable. */
  static bool zones_happy(struct tdb_context *tdb)
  {
      /* FIXME: look at distribution of zones. */
      (void)tdb;

      return true;
  }
  470. /* Expand the database. */
  471. int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
  472. bool growing)
  473. {
  474. uint64_t new_num_buckets, new_num_zones, new_zone_bits;
  475. uint64_t i, old_num_total, old_num_zones, old_size, old_zone_bits;
  476. tdb_len_t add, freebucket_size, needed;
  477. tdb_off_t off, old_free_off;
  478. const tdb_off_t *oldf;
  479. struct tdb_used_record fhdr;
  480. /* We need room for the record header too. */
  481. needed = sizeof(struct tdb_used_record)
  482. + adjust_size(klen, dlen, growing);
  483. /* tdb_allrecord_lock will update header; did zones change? */
  484. old_zone_bits = tdb->header.v.zone_bits;
  485. old_num_zones = tdb->header.v.num_zones;
  486. /* FIXME: this is overkill. An expand lock? */
  487. if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
  488. return -1;
  489. /* Someone may have expanded for us. */
  490. if (old_zone_bits != tdb->header.v.zone_bits
  491. || old_num_zones != tdb->header.v.num_zones)
  492. goto success;
  493. /* They may have also expanded the underlying size (otherwise we'd
  494. * have expanded our mmap to look at those offsets already). */
  495. old_size = tdb->map_size;
  496. tdb->methods->oob(tdb, tdb->map_size + 1, true);
  497. if (tdb->map_size != old_size)
  498. goto success;
  499. /* Did we enlarge zones without enlarging file? */
  500. if (tdb->map_size < tdb->header.v.num_zones<<tdb->header.v.zone_bits) {
  501. add = (tdb->header.v.num_zones<<tdb->header.v.zone_bits)
  502. - tdb->map_size;
  503. /* Updates tdb->map_size. */
  504. if (tdb->methods->expand_file(tdb, add) == -1)
  505. goto fail;
  506. if (add_free_record(tdb, tdb->map_size - add, add) == -1)
  507. goto fail;
  508. if (add >= needed) {
  509. /* Allocate from this zone. */
  510. tdb->last_zone = zone_of(tdb, tdb->map_size - add);
  511. goto success;
  512. }
  513. }
  514. /* Slow path. Should we increase the number of buckets? */
  515. new_num_buckets = tdb->header.v.free_buckets;
  516. if (larger_buckets_might_help(tdb))
  517. new_num_buckets++;
  518. /* Now we'll need room for the new free buckets, too. Assume
  519. * worst case (zones expand). */
  520. needed += sizeof(fhdr)
  521. + ((tdb->header.v.num_zones+1)
  522. * (new_num_buckets+1) * sizeof(tdb_off_t));
  523. /* If we need less that one zone, and they're working well, just add
  524. * another one. */
  525. if (needed < (1UL<<tdb->header.v.zone_bits) && zones_happy(tdb)) {
  526. new_num_zones = tdb->header.v.num_zones+1;
  527. new_zone_bits = tdb->header.v.zone_bits;
  528. add = 1ULL << tdb->header.v.zone_bits;
  529. } else {
  530. /* Increase the zone size. */
  531. new_num_zones = tdb->header.v.num_zones;
  532. new_zone_bits = tdb->header.v.zone_bits+1;
  533. while ((new_num_zones << new_zone_bits)
  534. < tdb->map_size + needed) {
  535. new_zone_bits++;
  536. }
  537. /* We expand by enough full zones to meet the need. */
  538. add = ((tdb->map_size + needed + (1ULL << new_zone_bits)-1)
  539. & ~((1ULL << new_zone_bits)-1))
  540. - tdb->map_size;
  541. }
  542. /* Updates tdb->map_size. */
  543. if (tdb->methods->expand_file(tdb, add) == -1)
  544. goto fail;
  545. /* Use first part as new free bucket array. */
  546. off = tdb->map_size - add;
  547. freebucket_size = new_num_zones
  548. * (new_num_buckets + 1) * sizeof(tdb_off_t);
  549. /* Write header. */
  550. if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
  551. goto fail;
  552. if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
  553. goto fail;
  554. /* Adjust off to point to start of buckets, add to be remainder. */
  555. add -= freebucket_size + sizeof(fhdr);
  556. off += sizeof(fhdr);
  557. /* Access the old zones. */
  558. old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
  559. old_free_off = tdb->header.v.free_off;
  560. oldf = tdb_access_read(tdb, old_free_off,
  561. old_num_total * sizeof(tdb_off_t), true);
  562. if (!oldf)
  563. goto fail;
  564. /* Switch to using our new zone. */
  565. if (zero_out(tdb, off, freebucket_size) == -1)
  566. goto fail_release;
  567. tdb->header.v.free_off = off;
  568. tdb->header.v.num_zones = new_num_zones;
  569. tdb->header.v.zone_bits = new_zone_bits;
  570. tdb->header.v.free_buckets = new_num_buckets;
  571. /* FIXME: If zone size hasn't changed, can simply copy pointers. */
  572. /* FIXME: Coalesce? */
  573. for (i = 0; i < old_num_total; i++) {
  574. tdb_off_t next;
  575. struct tdb_free_record rec;
  576. tdb_off_t list;
  577. for (off = oldf[i]; off; off = next) {
  578. if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
  579. goto fail_release;
  580. list = zone_of(tdb, off)
  581. * (tdb->header.v.free_buckets+1)
  582. + size_to_bucket(tdb, rec.data_len);
  583. next = rec.next;
  584. if (enqueue_in_free(tdb, list, off, &rec) == -1)
  585. goto fail_release;
  586. }
  587. }
  588. /* Free up the old free buckets. */
  589. old_free_off -= sizeof(fhdr);
  590. if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
  591. goto fail_release;
  592. if (add_free_record(tdb, old_free_off,
  593. sizeof(fhdr)
  594. + rec_data_length(&fhdr)
  595. + rec_extra_padding(&fhdr)))
  596. goto fail_release;
  597. /* Add the rest as a new free record. */
  598. if (add_free_record(tdb, tdb->map_size - add, add) == -1)
  599. goto fail_release;
  600. /* Start allocating from where the new space is. */
  601. tdb->last_zone = zone_of(tdb, tdb->map_size - add);
  602. tdb_access_release(tdb, oldf);
  603. if (write_header(tdb) == -1)
  604. goto fail;
  605. success:
  606. tdb_allrecord_unlock(tdb, F_WRLCK);
  607. return 0;
  608. fail_release:
  609. tdb_access_release(tdb, oldf);
  610. fail:
  611. tdb_allrecord_unlock(tdb, F_WRLCK);
  612. return -1;
  613. }