invbloom.c 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. /* Licensed under BSD-MIT - see LICENSE file for details */
  2. #include "invbloom.h"
  3. #include <ccan/hash/hash.h>
  4. #include <ccan/endian/endian.h>
  5. #include <assert.h>
  6. /* "We will show that hash_count values of 3 or 4 work well in practice"
  7. From:
  8. Eppstein, David, et al. "What's the difference?: efficient set reconciliation without prior context." ACM SIGCOMM Computer Communication Review. Vol. 41. No. 4. ACM, 2011. http://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p218.pdf
  9. */
  10. #define NUM_HASHES 3
  11. struct invbloom *invbloom_new_(const tal_t *ctx,
  12. size_t id_size,
  13. size_t n_elems,
  14. u32 salt)
  15. {
  16. struct invbloom *ib = tal(ctx, struct invbloom);
  17. if (ib) {
  18. ib->n_elems = n_elems;
  19. ib->id_size = id_size;
  20. ib->salt = salt;
  21. ib->singleton = NULL;
  22. ib->count = tal_arrz(ib, s32, n_elems);
  23. ib->idsum = tal_arrz(ib, u8, id_size * n_elems);
  24. if (!ib->count || !ib->idsum)
  25. ib = tal_free(ib);
  26. }
  27. return ib;
  28. }
  29. void invbloom_singleton_cb_(struct invbloom *ib,
  30. void (*cb)(struct invbloom *,
  31. size_t bucket, bool, void *),
  32. void *data)
  33. {
  34. ib->singleton = cb;
  35. ib->singleton_data = data;
  36. }
  37. static size_t hash_bucket(const struct invbloom *ib, const void *id, size_t i)
  38. {
  39. return hash((const char *)id, ib->id_size, ib->salt+i*7) % ib->n_elems;
  40. }
  41. static u8 *idsum_ptr(const struct invbloom *ib, size_t bucket)
  42. {
  43. return (u8 *)ib->idsum + bucket * ib->id_size;
  44. }
  45. static void check_for_singleton(struct invbloom *ib, size_t bucket, bool before)
  46. {
  47. if (!ib->singleton)
  48. return;
  49. if (ib->count[bucket] != 1 && ib->count[bucket] != -1)
  50. return;
  51. ib->singleton(ib, bucket, before, ib->singleton_data);
  52. }
  53. static void add_to_bucket(struct invbloom *ib, size_t n, const u8 *id)
  54. {
  55. size_t i;
  56. u8 *idsum = idsum_ptr(ib, n);
  57. check_for_singleton(ib, n, true);
  58. ib->count[n]++;
  59. for (i = 0; i < ib->id_size; i++)
  60. idsum[i] ^= id[i];
  61. check_for_singleton(ib, n, false);
  62. }
  63. static void remove_from_bucket(struct invbloom *ib, size_t n, const u8 *id)
  64. {
  65. size_t i;
  66. u8 *idsum = idsum_ptr(ib, n);
  67. check_for_singleton(ib, n, true);
  68. ib->count[n]--;
  69. for (i = 0; i < ib->id_size; i++)
  70. idsum[i] ^= id[i];
  71. check_for_singleton(ib, n, false);
  72. }
  73. void invbloom_insert(struct invbloom *ib, const void *id)
  74. {
  75. unsigned int i;
  76. for (i = 0; i < NUM_HASHES; i++)
  77. add_to_bucket(ib, hash_bucket(ib, id, i), id);
  78. }
  79. void invbloom_delete(struct invbloom *ib, const void *id)
  80. {
  81. unsigned int i;
  82. for (i = 0; i < NUM_HASHES; i++)
  83. remove_from_bucket(ib, hash_bucket(ib, id, i), id);
  84. }
  85. static bool all_zero(const u8 *mem, size_t size)
  86. {
  87. unsigned int i;
  88. for (i = 0; i < size; i++)
  89. if (mem[i])
  90. return false;
  91. return true;
  92. }
  93. bool invbloom_get(const struct invbloom *ib, const void *id)
  94. {
  95. unsigned int i;
  96. for (i = 0; i < NUM_HASHES; i++) {
  97. size_t h = hash_bucket(ib, id, i);
  98. u8 *idsum = idsum_ptr(ib, h);
  99. if (ib->count[h] == 0 && all_zero(idsum, ib->id_size))
  100. return false;
  101. if (ib->count[h] == 1)
  102. return (memcmp(idsum, id, ib->id_size) == 0);
  103. }
  104. return false;
  105. }
  106. static void *extract(const tal_t *ctx, struct invbloom *ib, int count)
  107. {
  108. size_t i;
  109. /* FIXME: this makes full extraction O(n^2). */
  110. for (i = 0; i < ib->n_elems; i++) {
  111. void *id;
  112. if (ib->count[i] != count)
  113. continue;
  114. id = tal_dup_arr(ctx, u8, idsum_ptr(ib, i), ib->id_size, 0);
  115. return id;
  116. }
  117. return NULL;
  118. }
  119. void *invbloom_extract(const tal_t *ctx, struct invbloom *ib)
  120. {
  121. void *id;
  122. id = extract(ctx, ib, 1);
  123. if (id)
  124. invbloom_delete(ib, id);
  125. return id;
  126. }
  127. void *invbloom_extract_negative(const tal_t *ctx, struct invbloom *ib)
  128. {
  129. void *id;
  130. id = extract(ctx, ib, -1);
  131. if (id)
  132. invbloom_insert(ib, id);
  133. return id;
  134. }
  135. void invbloom_subtract(struct invbloom *ib1, const struct invbloom *ib2)
  136. {
  137. size_t i;
  138. assert(ib1->n_elems == ib2->n_elems);
  139. assert(ib1->id_size == ib2->id_size);
  140. assert(ib1->salt == ib2->salt);
  141. for (i = 0; i < ib1->n_elems; i++)
  142. check_for_singleton(ib1, i, true);
  143. for (i = 0; i < ib1->n_elems * ib1->id_size; i++)
  144. ib1->idsum[i] ^= ib2->idsum[i];
  145. for (i = 0; i < ib1->n_elems; i++) {
  146. ib1->count[i] -= ib2->count[i];
  147. check_for_singleton(ib1, i, false);
  148. }
  149. }
  150. bool invbloom_empty(const struct invbloom *ib)
  151. {
  152. size_t i;
  153. for (i = 0; i < ib->n_elems; i++) {
  154. if (ib->count[i])
  155. return false;
  156. if (!all_zero(idsum_ptr(ib, i), ib->id_size))
  157. return false;
  158. }
  159. return true;
  160. }