@@ -380,122 +380,150 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	return NULL;
 }
 
-static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
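+/* Turn a hash bucket number into the file offset of its entry in the
+ * hash table; the mask wraps the bucket number modulo the table size. */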
+static tdb_off_t hash_off(struct tdb_context *tdb, uint64_t list)
 {
-	return memcmp(data.dptr, key.dptr, data.dsize) == 0;
+	return tdb->header.v.hash_off
+		+ ((list & ((1ULL << tdb->header.v.hash_bits) - 1))
+		   * sizeof(tdb_off_t));
 }
 
-static void unlock_lists(struct tdb_context *tdb,
-			 uint64_t start, uint64_t end, int ltype)
+/* Returns 0 if the entry is a zero (definitely not a match).
+ * Returns a valid entry offset if it's a match.  Fills in rec.
+ * Otherwise returns TDB_OFF_ERR: keep searching. */
+static tdb_off_t entry_matches(struct tdb_context *tdb,
+			       uint64_t list,
+			       uint64_t hash,
+			       const struct tdb_data *key,
+			       struct tdb_used_record *rec)
 {
-	for (;;) {
-		tdb_unlock_list(tdb, start, ltype);
-		if (start == end)
-			return;
-		start = (start + 1) & ((1ULL << tdb->header.v.hash_bits) - 1);
-	}
-}
-
-/* FIXME: Return header copy? */
-/* Returns -1 or offset of entry (0 if not found).
- * Locks hash entried from *start to *end (where the entry was found). */
-static tdb_off_t find_bucket_and_lock(struct tdb_context *tdb,
-				      const struct tdb_data *key,
-				      uint64_t hash,
-				      uint64_t *start,
-				      uint64_t *end,
-				      uint64_t *room,
-				      int ltype)
-{
-	uint64_t hextra;
 	tdb_off_t off;
+	uint64_t keylen;
+	const unsigned char *rkey;
 
-	/* FIXME: can we avoid locks for some fast paths? */
-	*start = tdb_lock_list(tdb, hash, ltype, TDB_LOCK_WAIT);
-	if (*start == TDB_OFF_ERR)
+	off = tdb_read_off(tdb, tdb->header.v.hash_off
+			   + list * sizeof(tdb_off_t));
+	if (off == 0 || off == TDB_OFF_ERR)
+		return off;
+
+#if 0 /* FIXME: Check other bits. */
+	unsigned int bits, bitmask, hoffextra;
+	/* Bottom three bits show how many extra hash bits. */
+	bits = (off & ((1 << TDB_EXTRA_HASHBITS_NUM) - 1)) + 1;
+	bitmask = (1 << bits)-1;
+	hoffextra = ((off >> TDB_EXTRA_HASHBITS_NUM) & bitmask);
+	uint64_t hextra = hash >> tdb->header.v.hash_bits;
+	if ((hextra & bitmask) != hoffextra)
 		return TDB_OFF_ERR;
+	off &= ~...;
+#endif
 
-	*end = *start;
-	hextra = hash >> tdb->header.v.hash_bits;
+	if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
+		return TDB_OFF_ERR;
 
-	while ((off = tdb_read_off(tdb, tdb->header.v.hash_off
-				   + *end * sizeof(tdb_off_t)))
-	       != TDB_OFF_ERR) {
-		struct tdb_used_record pad, *r;
-		uint64_t keylen;
+	/* FIXME: check extra bits in header! */
+	keylen = rec_key_length(rec);
+	if (keylen != key->dsize)
+		return TDB_OFF_ERR;
 
-		/* Didn't find it? */
-		if (!off)
-			return 0;
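+	/* Lengths match: now compare the stored key bytes in place. */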
+	rkey = tdb_access_read(tdb, off + sizeof(*rec), keylen);
+	if (!rkey)
+		return TDB_OFF_ERR;
+	if (memcmp(rkey, key->dptr, keylen) != 0)
+		off = TDB_OFF_ERR;
+	tdb_access_release(tdb, rkey);
+	return off;
+}
 
-#if 0 /* FIXME: Check other bits. */
-		unsigned int bits, bitmask, hoffextra;
-		/* Bottom three bits show how many extra hash bits. */
-		bits = (off & ((1 << TDB_EXTRA_HASHBITS_NUM) - 1)) + 1;
-		bitmask = (1 << bits)-1;
-		hoffextra = ((off >> TDB_EXTRA_HASHBITS_NUM) & bitmask);
-		if ((hextra & bitmask) != hoffextra)
-			goto lock_next;
-#endif
+/* FIXME: Optimize? */
+static void unlock_range(struct tdb_context *tdb,
+			 tdb_off_t list, tdb_len_t num,
+			 int ltype)
+{
+	tdb_off_t i;
 
-		r = tdb_get(tdb, off, &pad, sizeof(*r));
-		if (!r)
-			goto unlock_err;
+	for (i = list; i < list + num; i++)
+		tdb_unlock_list(tdb, i, ltype);
+}
 
-		if (rec_magic(r) != TDB_MAGIC) {
-			tdb->ecode = TDB_ERR_CORRUPT;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "find_bucket_and_lock: bad magic 0x%llx"
-				 " at offset %llu!\n",
-				 (long long)rec_magic(r), (long long)off);
-			goto unlock_err;
+/* FIXME: Optimize? */
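+/* Grab each bucket lock in [list, list + num); on failure, drop the
+ * ones we already took. */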
+static int lock_range(struct tdb_context *tdb,
+		      tdb_off_t list, tdb_len_t num,
+		      int ltype)
+{
+	tdb_off_t i;
+
+	for (i = list; i < list + num; i++) {
+		if (tdb_lock_list(tdb, i, ltype, TDB_LOCK_WAIT) != 0) {
+			unlock_range(tdb, list, i - list, ltype);
+			return -1;
 		}
+	}
+	return 0;
+}
 
-		/* FIXME: check extra bits in header! */
-		keylen = rec_key_length(r);
-		if (keylen != key->dsize)
-			goto lock_next;
-
-		switch (tdb_parse_data(tdb, *key, off + sizeof(*r), key->dsize,
-				       tdb_key_compare, NULL)) {
-		case 1:
-			/* Match! */
-			*room = rec_data_length(r) + rec_extra_padding(r);
-			return off >> TDB_EXTRA_HASHBITS_NUM;
-		case 0:
-			break;
-		default:
-			goto unlock_err;
+/* We lock hashes up to the next empty offset.  We already hold the
+ * lock on the start bucket, but we may need to release and re-grab
+ * it.  If we fail, we hold no locks at all! */
+static tdb_len_t relock_hash_to_zero(struct tdb_context *tdb,
+				     tdb_off_t start, int ltype)
+{
+	tdb_len_t num, len, pre_locks;
+
+again:
+	num = 1ULL << tdb->header.v.hash_bits;
+	len = tdb_find_zero_off(tdb, hash_off(tdb, start), num - start);
+	if (unlikely(len == num - start)) {
+		/* We hit the end of the hash range.  Drop lock: we have
+		   to lock start of hash first. */
+		tdb_unlock_list(tdb, start, ltype);
+		/* Grab something, so header is stable. */
+		if (tdb_lock_list(tdb, 0, ltype, TDB_LOCK_WAIT))
+			return TDB_OFF_ERR;
+		len = tdb_find_zero_off(tdb, hash_off(tdb, 0), num);
+		if (lock_range(tdb, 1, len, ltype) == -1) {
+			tdb_unlock_list(tdb, 0, ltype);
+			return TDB_OFF_ERR;
 		}
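+		/* Remember how many buckets we now hold at the front of
+		 * the table; the tail from our start bucket to the end
+		 * of the table comes next. */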
+		pre_locks = len;
+		len = num - start;
+	} else {
+		/* We already have lock on start. */
+		start++;
+		pre_locks = 0;
+	}
+	if (unlikely(lock_range(tdb, start, len, ltype) == -1)) {
+		if (pre_locks)
+			unlock_range(tdb, 0, pre_locks, ltype);
+		else
+			tdb_unlock_list(tdb, start, ltype);
+		return TDB_OFF_ERR;
+	}
 
-	lock_next:
-		/* Lock next bucket. */
-		/* FIXME: We can deadlock if this wraps! */
-		off = tdb_lock_list(tdb, ++hash, ltype, TDB_LOCK_WAIT);
-		if (off == TDB_OFF_ERR)
-			goto unlock_err;
-		*end = off;
+	/* Now, did we lose the race, and it's not zero any more? */
+	if (unlikely(tdb_read_off(tdb, hash_off(tdb, pre_locks + len)) != 0)) {
+		unlock_range(tdb, 0, pre_locks, ltype);
+		/* Leave the start locked, as expected. */
+		unlock_range(tdb, start + 1, len - 1, ltype);
+		goto again;
 	}
 
-unlock_err:
-	TEST_IT(*end < *start);
-	unlock_lists(tdb, *start, *end, ltype);
-	return TDB_OFF_ERR;
+	return pre_locks + len;
 }
 
+/* FIXME: modify, don't rewrite! */
 static int update_rec_hdr(struct tdb_context *tdb,
 			  tdb_off_t off,
 			  tdb_len_t keylen,
 			  tdb_len_t datalen,
-			  tdb_len_t room,
+			  struct tdb_used_record *rec,
 			  uint64_t h)
 {
-	struct tdb_used_record rec;
+	uint64_t room = rec_data_length(rec) + rec_extra_padding(rec);
 
-	if (set_header(tdb, &rec, keylen, datalen, room - datalen, h))
+	if (set_header(tdb, rec, keylen, datalen, room - datalen, h))
 		return -1;
 
-	return tdb_write_convert(tdb, off, &rec, sizeof(rec));
+	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
 }
 
 /* If we fail, others will try after us. */
@@ -569,15 +597,42 @@ unlock:
 int tdb_store(struct tdb_context *tdb,
 	      struct tdb_data key, struct tdb_data dbuf, int flag)
 {
-	tdb_off_t new_off, off, start, end, room;
+	tdb_off_t new_off, off, old_bucket, start, num_locks = 1;
+	struct tdb_used_record rec;
 	uint64_t h;
 	bool growing = false;
 
 	h = tdb_hash(tdb, key.dptr, key.dsize);
-	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
-	if (off == TDB_OFF_ERR)
+
+	/* FIXME: can we avoid locks for some fast paths? */
+	start = tdb_lock_list(tdb, h, F_WRLCK, TDB_LOCK_WAIT);
+	if (start == TDB_OFF_ERR)
 		return -1;
 
+	/* Fast path. */
+	old_bucket = start;
+	off = entry_matches(tdb, start, h, &key, &rec);
+	if (unlikely(off == TDB_OFF_ERR)) {
+		/* Slow path, need to grab more locks and search. */
+		tdb_off_t i;
+
+		/* Warning: this may drop the lock!  Does that on error. */
+		num_locks = relock_hash_to_zero(tdb, start, F_WRLCK);
+		if (num_locks == TDB_OFF_ERR)
+			return -1;
+
+		for (i = start; i < start + num_locks; i++) {
+			off = entry_matches(tdb, i, h, &key, &rec);
+			/* Empty entry or we found it? */
+			if (off == 0 || off != TDB_OFF_ERR) {
+				old_bucket = i;
+				break;
+			}
+		}
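+		/* Scanned the whole locked run without finding it. */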
+		if (i == start + num_locks)
+			off = 0;
+	}
+
 	/* Now we have lock on this hash bucket. */
 	if (flag == TDB_INSERT) {
 		if (off) {
@@ -586,11 +641,12 @@ int tdb_store(struct tdb_context *tdb,
 		}
 	} else {
 		if (off) {
-			if (room >= key.dsize + dbuf.dsize) {
+			if (rec_data_length(&rec) + rec_extra_padding(&rec)
+			    >= dbuf.dsize) {
 				new_off = off;
 				if (update_rec_hdr(tdb, off,
 						   key.dsize, dbuf.dsize,
-						   room, h))
+						   &rec, h))
 					goto fail;
 				goto write;
 			}
@@ -611,7 +667,7 @@ int tdb_store(struct tdb_context *tdb,
 	/* Allocate a new record. */
 	new_off = alloc(tdb, key.dsize, dbuf.dsize, h, growing);
 	if (new_off == 0) {
-		unlock_lists(tdb, start, end, F_WRLCK);
+		unlock_range(tdb, start, num_locks, F_WRLCK);
 		/* Expand, then try again... */
 		if (tdb_expand(tdb, key.dsize, dbuf.dsize, growing) == -1)
 			return -1;
@@ -621,13 +677,14 @@ int tdb_store(struct tdb_context *tdb,
 	/* We didn't like the existing one: remove it. */
 	if (off) {
 		add_free_record(tdb, off, sizeof(struct tdb_used_record)
-				+ key.dsize + room);
+				+ rec_key_length(&rec)
+				+ rec_data_length(&rec)
+				+ rec_extra_padding(&rec));
 	}
 
 write:
-	off = tdb->header.v.hash_off + end * sizeof(tdb_off_t);
 	/* FIXME: Encode extra hash bits! */
-	if (tdb_write_off(tdb, off, new_off) == -1)
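+	/* Point the bucket we settled on at the new record. */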
+	if (tdb_write_off(tdb, hash_off(tdb, old_bucket), new_off) == -1)
 		goto fail;
 
 	off = new_off + sizeof(struct tdb_used_record);
@@ -638,48 +695,65 @@ write:
 		goto fail;
 
 	/* FIXME: tdb_increment_seqnum(tdb); */
-	unlock_lists(tdb, start, end, F_WRLCK);
+	unlock_range(tdb, start, num_locks, F_WRLCK);
 
-	/* By simple trial and error, this roughly approximates a 60%
-	 * full measure. */
-	if (unlikely(end - start > 4 * tdb->header.v.hash_bits - 32))
+	/* FIXME: by simple simulation, this approximated 60% full.
+	 * Check in real case! */
+	if (unlikely(num_locks > 4 * tdb->header.v.hash_bits - 31))
 		enlarge_hash(tdb);
 
 	return 0;
 
 fail:
-	unlock_lists(tdb, start, end, F_WRLCK);
+	unlock_range(tdb, start, num_locks, F_WRLCK);
 	return -1;
 }
 
 struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key)
 {
-	tdb_off_t off, start, end, room;
+	tdb_off_t off, start, num_locks = 1;
+	struct tdb_used_record rec;
 	uint64_t h;
-	struct tdb_used_record pad, *r;
 	struct tdb_data ret;
 
 	h = tdb_hash(tdb, key.dptr, key.dsize);
-	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_RDLCK);
-	if (off == TDB_OFF_ERR)
-		return tdb_null;
-
-	if (!off) {
-		unlock_lists(tdb, start, end, F_RDLCK);
-		tdb->ecode = TDB_SUCCESS;
+	/* FIXME: can we avoid locks for some fast paths? */
+	start = tdb_lock_list(tdb, h, F_RDLCK, TDB_LOCK_WAIT);
+	if (start == TDB_OFF_ERR)
 		return tdb_null;
+
+	/* Fast path. */
+	off = entry_matches(tdb, start, h, &key, &rec);
+	if (unlikely(off == TDB_OFF_ERR)) {
+		/* Slow path, need to grab more locks and search. */
+		tdb_off_t i;
+
+		/* Warning: this may drop the lock!  Does that on error. */
+		num_locks = relock_hash_to_zero(tdb, start, F_RDLCK);
+		if (num_locks == TDB_OFF_ERR)
+			return tdb_null;
+
+		for (i = start; i < start + num_locks; i++) {
+			off = entry_matches(tdb, i, h, &key, &rec);
+			/* Empty entry or we found it? */
+			if (off == 0 || off != TDB_OFF_ERR)
+				break;
+		}
+		if (i == start + num_locks)
+			off = 0;
 	}
 
-	r = tdb_get(tdb, off, &pad, sizeof(*r));
-	if (!r) {
-		unlock_lists(tdb, start, end, F_RDLCK);
+	if (!off) {
+		unlock_range(tdb, start, num_locks, F_RDLCK);
+		tdb->ecode = TDB_ERR_NOEXIST;
 		return tdb_null;
 	}
 
-	ret.dsize = rec_data_length(r);
-	ret.dptr = tdb_alloc_read(tdb, off + sizeof(*r) + key.dsize,
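+	/* The data follows the record header and key; copy it out. */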
+	ret.dsize = rec_data_length(&rec);
+	ret.dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize,
 				  ret.dsize);
-	unlock_lists(tdb, start, end, F_RDLCK);
+	unlock_range(tdb, start, num_locks, F_RDLCK);
 	return ret;
 }
 
@@ -687,157 +761,127 @@ static int hash_add(struct tdb_context *tdb, uint64_t h, tdb_off_t off)
 {
 	tdb_off_t i, hoff, len, num;
 
+	/* Look for next space. */
 	i = (h & ((1ULL << tdb->header.v.hash_bits) - 1));
-	hoff = tdb->header.v.hash_off + i * sizeof(tdb_off_t);
 	len = (1ULL << tdb->header.v.hash_bits) - i;
+	num = tdb_find_zero_off(tdb, hash_off(tdb, i), len);
 
-	/* Look for next space. */
-	num = tdb_find_zero_off(tdb, hoff, len);
 	if (unlikely(num == len)) {
-		hoff = tdb->header.v.hash_off;
+		/* We wrapped.  Look through start of hash table. */
+		hoff = hash_off(tdb, 0);
 		len = (1ULL << tdb->header.v.hash_bits);
 		num = tdb_find_zero_off(tdb, hoff, len);
-		if (i == len)
+		if (i == len) {
+			tdb->ecode = TDB_ERR_CORRUPT;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "hash_add: full hash table!\n");
 			return -1;
+		}
 	}
 	/* FIXME: Encode extra hash bits! */
-	return tdb_write_off(tdb, hoff + num * sizeof(tdb_off_t), off);
+	return tdb_write_off(tdb, hash_off(tdb, i + num), off);
 }
 
-static int unlink_used_record(struct tdb_context *tdb, tdb_off_t chain,
-			      uint64_t *extra_locks)
+int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
 {
-	tdb_off_t num, len, i, hoff;
+	tdb_off_t i, old_bucket, off, start, num_locks = 1;
+	struct tdb_used_record rec;
+	uint64_t h;
 
-	/* FIXME: Maybe lock more in search?  Maybe don't lock if scan
-	 * finds none? */
-again:
-	len = (1ULL << tdb->header.v.hash_bits) - (chain + 1);
-	hoff = tdb->header.v.hash_off + (chain + 1) * sizeof(tdb_off_t);
-	num = tdb_find_zero_off(tdb, hoff, len);
-
-	/* We want to lock the zero entry, too.  In the wrap case,
-	 * this locks one extra.  That's harmless. */
-	num++;
-
-	for (i = chain + 1; i < chain + 1 + num; i++) {
-		if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT)
-		    == TDB_OFF_ERR) {
-			if (i != chain + 1)
-				unlock_lists(tdb, chain + 1, i-1, F_WRLCK);
+	h = tdb_hash(tdb, key.dptr, key.dsize);
+
+	/* FIXME: can we avoid locks for some fast paths? */
+	start = tdb_lock_list(tdb, h, F_WRLCK, TDB_LOCK_WAIT);
+	if (start == TDB_OFF_ERR)
+		return -1;
+
+	/* Fast path. */
+	old_bucket = start;
+	off = entry_matches(tdb, start, h, &key, &rec);
+	if (off && off != TDB_OFF_ERR) {
+		/* We can only really fastpath delete if next bucket
+		 * is 0.  Note that we haven't locked it, but our lock
+		 * on this bucket stops anyone overflowing into it
+		 * while we look. */
+		if (tdb_read_off(tdb, hash_off(tdb, h+1)) == 0)
+			goto delete;
+		/* Slow path. */
+		off = TDB_OFF_ERR;
+	}
+
+	if (unlikely(off == TDB_OFF_ERR)) {
+		/* Slow path, need to grab more locks and search. */
+		tdb_off_t i;
+
+		/* Warning: this may drop the lock!  Does that on error. */
+		num_locks = relock_hash_to_zero(tdb, start, F_WRLCK);
+		if (num_locks == TDB_OFF_ERR)
 			return -1;
-		}
-	}
 
-	/* The wrap case: we need those locks out of order! */
-	if (unlikely(num == len + 1)) {
-		*extra_locks = tdb_find_zero_off(tdb, tdb->header.v.hash_off,
-						 1ULL << tdb->header.v.hash_bits);
-		(*extra_locks)++;
-		for (i = 0; i < *extra_locks; i++) {
-			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_NOWAIT)
-			    == TDB_OFF_ERR) {
-				/* Failed.  Caller must lock in order. */
-				if (i)
-					unlock_lists(tdb, 0, i-1, F_WRLCK);
-				unlock_lists(tdb, chain + 1, chain + num,
-					     F_WRLCK);
-				return 1;
+		for (i = start; i < start + num_locks; i++) {
+			off = entry_matches(tdb, i, h, &key, &rec);
+			/* Empty entry or we found it? */
+			if (off == 0 || off != TDB_OFF_ERR) {
+				old_bucket = i;
+				break;
 			}
 		}
-		num += *extra_locks;
+		if (i == start + num_locks)
+			off = 0;
 	}
 
-	/* Now we have the locks, be certain that offset is still 0! */
-	hoff = tdb->header.v.hash_off
-		+ (((chain + num) * sizeof(tdb_off_t))
-		   & ((1ULL << tdb->header.v.hash_bits) - 1));
-
-	if (unlikely(tdb_read_off(tdb, hoff) != 0)) {
-		unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
-		goto again;
+	if (!off) {
+		unlock_range(tdb, start, num_locks, F_WRLCK);
+		tdb->ecode = TDB_ERR_NOEXIST;
+		return -1;
 	}
 
-	/* OK, all locked.  Unlink first one. */
-	hoff = tdb->header.v.hash_off + chain * sizeof(tdb_off_t);
-	if (tdb_write_off(tdb, hoff, 0) == -1)
+delete:
+	/* This actually unlinks it. */
+	if (tdb_write_off(tdb, hash_off(tdb, old_bucket), 0) == -1)
 		goto unlock_err;
 
-	/* Rehash the rest. */
-	for (i = 1; i < num; i++) {
-		tdb_off_t off;
-		uint64_t h;
+	/* Rehash anything following. */
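+	/* Deleting emptied a bucket in the middle of a probe run: any
+	 * entry between it and the next zero may now be unfindable, so
+	 * re-insert each one. */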
+	for (i = old_bucket+1; i < start + num_locks; i++) {
+		tdb_off_t off2;
+		uint64_t h2;
 
-		hoff = tdb->header.v.hash_off
-			+ (((chain + i) * sizeof(tdb_off_t))
-			   & ((1ULL << tdb->header.v.hash_bits) - 1));
-		off = tdb_read_off(tdb, hoff);
-		if (unlikely(off == TDB_OFF_ERR))
+		off2 = tdb_read_off(tdb, hash_off(tdb, i));
+		if (unlikely(off2 == TDB_OFF_ERR))
 			goto unlock_err;
 
 		/* Maybe use a bit to indicate it is in ideal place? */
-		h = hash_record(tdb, off);
+		h2 = hash_record(tdb, off2);
 		/* Is it happy where it is? */
-		if ((h & ((1ULL << tdb->header.v.hash_bits)-1)) == (chain + i))
+		if ((h2 & ((1ULL << tdb->header.v.hash_bits)-1))
+		    == (i & ((1ULL << tdb->header.v.hash_bits)-1)))
 			continue;
 
 		/* Remove it. */
-		if (tdb_write_off(tdb, hoff, 0) == -1)
+		if (tdb_write_off(tdb, hash_off(tdb, i), 0) == -1)
 			goto unlock_err;
 
 		/* Rehash it. */
-		if (hash_add(tdb, h, off) == -1)
+		if (hash_add(tdb, h2, off2) == -1)
 			goto unlock_err;
 	}
-	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+
+	/* Free the deleted entry. */
+	if (add_free_record(tdb, off,
+			    sizeof(struct tdb_used_record)
+			    + rec_key_length(&rec)
+			    + rec_data_length(&rec)
+			    + rec_extra_padding(&rec)) != 0)
+		goto unlock_err;
+
+	unlock_range(tdb, start, num_locks, F_WRLCK);
 	return 0;
 
 unlock_err:
-	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+	unlock_range(tdb, start, num_locks, F_WRLCK);
 	return -1;
 }
 
-int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
-{
-	tdb_off_t off, start, end, room, extra_locks = 0;
-	uint64_t h;
-	int ret;
-
-	h = tdb_hash(tdb, key.dptr, key.dsize);
-	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
-	if (off == TDB_OFF_ERR)
-		return -1;
-
-	if (off == 0) {
-		unlock_lists(tdb, start, end, F_WRLCK);
-		tdb->ecode = TDB_ERR_NOEXIST;
-		return -1;
-	}
-
-	ret = unlink_used_record(tdb, end, &extra_locks);
-	if (unlikely(ret == 1)) {
-		unsigned int i;
-
-		unlock_lists(tdb, start, end, F_WRLCK);
-
-		/* We need extra locks at the start. */
-		for (i = 0; i < extra_locks; i++) {
-			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT)
-			    == TDB_OFF_ERR) {
-				if (i)
-					unlock_lists(tdb, 0, i-1, F_WRLCK);
-				return -1;
-			}
-		}
-		/* Try again now we're holding more locks. */
-		ret = tdb_delete(tdb, key);
-		unlock_lists(tdb, 0, i, F_WRLCK);
-		return ret;
-	}
-	unlock_lists(tdb, start, end, F_WRLCK);
-	return ret;
-}
-
 int tdb_close(struct tdb_context *tdb)
 {
 	struct tdb_context **i;