strset.c 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. /* This code is based on the public domain code at
  2. * http://github.com/agl/critbit writtem by Adam Langley
  3. * <agl@imperialviolet.org>.
  4. *
  5. * Here are the main implementation differences:
  6. * (1) We don't strdup the string on insert; we use the pointer we're given.
  7. * (2) We use a straight bit number rather than a mask; it's simpler.
  8. * (3) We don't use the bottom bit of the pointer, but instead use a leading
  9. * zero to distinguish nodes from strings.
  10. * (4) The empty string (which would look like a node) is handled
  11. * using a special "empty node".
  12. * (5) Delete returns the string, so you can free it if you want to.
  13. * (6) Unions instead of void *, bool instead of int.
  14. */
  15. #include <ccan/strset/strset.h>
  16. #include <ccan/short_types/short_types.h>
  17. #include <ccan/likely/likely.h>
  18. #include <ccan/str/str.h>
  19. #include <ccan/ilog/ilog.h>
  20. #include <assert.h>
  21. #include <stdlib.h>
  22. struct node {
  23. /* To differentiate us from strings. */
  24. char nul_byte;
  25. /* The bit where these children differ. */
  26. u8 bit_num;
  27. /* The byte number where first bit differs (-1 == empty string node). */
  28. size_t byte_num;
  29. /* These point to strings or nodes. */
  30. struct strset child[2];
  31. };
  32. /* Closest member to this in a non-empty set. */
  33. static const char *closest(struct strset n, const char *member)
  34. {
  35. size_t len = strlen(member);
  36. const u8 *bytes = (const u8 *)member;
  37. /* Anything with first byte 0 is a node. */
  38. while (!n.u.s[0]) {
  39. u8 direction = 0;
  40. /* Special node which represents the empty string. */
  41. if (unlikely(n.u.n->byte_num == (size_t)-1)) {
  42. n = n.u.n->child[0];
  43. break;
  44. }
  45. if (n.u.n->byte_num < len) {
  46. u8 c = bytes[n.u.n->byte_num];
  47. direction = (c >> n.u.n->bit_num) & 1;
  48. }
  49. n = n.u.n->child[direction];
  50. }
  51. return n.u.s;
  52. }
  53. char *strset_test(const struct strset *set, const char *member)
  54. {
  55. const char *str;
  56. /* Empty set? */
  57. if (!set->u.n)
  58. return NULL;
  59. str = closest(*set, member);
  60. if (streq(member, str))
  61. return (char *)str;
  62. return NULL;
  63. }
  64. static bool set_string(struct strset *set,
  65. struct strset *n, const char *member)
  66. {
  67. /* Substitute magic empty node if this is the empty string */
  68. if (unlikely(!member[0])) {
  69. n->u.n = malloc(sizeof(*n->u.n));
  70. if (unlikely(!n->u.n))
  71. return false;
  72. n->u.n->nul_byte = '\0';
  73. n->u.n->byte_num = (size_t)-1;
  74. /* Attach the string to child[0] */
  75. n = &n->u.n->child[0];
  76. }
  77. n->u.s = member;
  78. return true;
  79. }
  80. bool strset_set(struct strset *set, const char *member)
  81. {
  82. size_t len = strlen(member);
  83. const u8 *bytes = (const u8 *)member;
  84. struct strset *np;
  85. const char *str;
  86. struct node *newn;
  87. size_t byte_num;
  88. u8 bit_num, new_dir;
  89. /* Empty set? */
  90. if (!set->u.n) {
  91. return set_string(set, set, member);
  92. }
  93. /* Find closest existing member. */
  94. str = closest(*set, member);
  95. /* Find where they differ. */
  96. for (byte_num = 0; str[byte_num] == member[byte_num]; byte_num++) {
  97. if (member[byte_num] == '\0') {
  98. /* All identical! */
  99. return false;
  100. }
  101. }
  102. /* Find which bit differs (if we had ilog8, we'd use it) */
  103. bit_num = ilog32_nz((u8)str[byte_num] ^ bytes[byte_num]) - 1;
  104. assert(bit_num < CHAR_BIT);
  105. /* Which direction do we go at this bit? */
  106. new_dir = ((bytes[byte_num]) >> bit_num) & 1;
  107. /* Allocate new node. */
  108. newn = malloc(sizeof(*newn));
  109. if (!newn) {
  110. /* FIXME */
  111. return false;
  112. }
  113. newn->nul_byte = '\0';
  114. newn->byte_num = byte_num;
  115. newn->bit_num = bit_num;
  116. if (unlikely(!set_string(set, &newn->child[new_dir], member))) {
  117. free(newn);
  118. return false;
  119. }
  120. /* Find where to insert: not closest, but first which differs! */
  121. np = set;
  122. while (!np->u.s[0]) {
  123. u8 direction = 0;
  124. /* Special node which represents the empty string will
  125. * break here too! */
  126. if (np->u.n->byte_num > byte_num)
  127. break;
  128. /* Subtle: bit numbers are "backwards" for comparison */
  129. if (np->u.n->byte_num == byte_num && np->u.n->bit_num < bit_num)
  130. break;
  131. if (np->u.n->byte_num < len) {
  132. u8 c = bytes[np->u.n->byte_num];
  133. direction = (c >> np->u.n->bit_num) & 1;
  134. }
  135. np = &np->u.n->child[direction];
  136. }
  137. newn->child[!new_dir]= *np;
  138. np->u.n = newn;
  139. return true;
  140. }
  141. char *strset_clear(struct strset *set, const char *member)
  142. {
  143. size_t len = strlen(member);
  144. const u8 *bytes = (const u8 *)member;
  145. struct strset *parent = NULL, *n;
  146. const char *ret = NULL;
  147. u8 direction = 0; /* prevent bogus gcc warning. */
  148. /* Empty set? */
  149. if (!set->u.n)
  150. return NULL;
  151. /* Find closest, but keep track of parent. */
  152. n = set;
  153. /* Anything with first byte 0 is a node. */
  154. while (!n->u.s[0]) {
  155. u8 c = 0;
  156. /* Special node which represents the empty string. */
  157. if (unlikely(n->u.n->byte_num == (size_t)-1)) {
  158. const char *empty_str = n->u.n->child[0].u.s;
  159. if (member[0])
  160. return NULL;
  161. /* Sew empty string back so remaining logic works */
  162. free(n->u.n);
  163. n->u.s = empty_str;
  164. break;
  165. }
  166. parent = n;
  167. if (n->u.n->byte_num < len) {
  168. c = bytes[n->u.n->byte_num];
  169. direction = (c >> n->u.n->bit_num) & 1;
  170. } else
  171. direction = 0;
  172. n = &n->u.n->child[direction];
  173. }
  174. /* Did we find it? */
  175. if (!streq(member, n->u.s))
  176. return NULL;
  177. ret = n->u.s;
  178. if (!parent) {
  179. /* We deleted last node. */
  180. set->u.n = NULL;
  181. } else {
  182. struct node *old = parent->u.n;
  183. /* Raise other node to parent. */
  184. *parent = old->child[!direction];
  185. free(old);
  186. }
  187. return (char *)ret;
  188. }
  189. static bool iterate(struct strset n,
  190. bool (*handle)(const char *, void *), void *data)
  191. {
  192. if (n.u.s[0])
  193. return handle(n.u.s, data);
  194. if (unlikely(n.u.n->byte_num == (size_t)-1))
  195. return handle(n.u.n->child[0].u.s, data);
  196. return iterate(n.u.n->child[0], handle, data)
  197. || iterate(n.u.n->child[1], handle, data);
  198. }
  199. void strset_iterate_(const struct strset *set,
  200. bool (*handle)(const char *, void *), void *data)
  201. {
  202. /* Empty set? */
  203. if (!set->u.n)
  204. return;
  205. iterate(*set, handle, data);
  206. }
  207. const struct strset *strset_prefix(const struct strset *set, const char *prefix)
  208. {
  209. const struct strset *n, *top;
  210. size_t len = strlen(prefix);
  211. const u8 *bytes = (const u8 *)prefix;
  212. /* Empty set -> return empty set. */
  213. if (!set->u.n)
  214. return set;
  215. top = n = set;
  216. /* We walk to find the top, but keep going to check prefix matches. */
  217. while (!n->u.s[0]) {
  218. u8 c = 0, direction;
  219. /* Special node which represents the empty string. */
  220. if (unlikely(n->u.n->byte_num == (size_t)-1)) {
  221. n = &n->u.n->child[0];
  222. break;
  223. }
  224. if (n->u.n->byte_num < len)
  225. c = bytes[n->u.n->byte_num];
  226. direction = (c >> n->u.n->bit_num) & 1;
  227. n = &n->u.n->child[direction];
  228. if (c)
  229. top = n;
  230. }
  231. if (!strstarts(n->u.s, prefix)) {
  232. /* Convenient return for prefixes which do not appear in set. */
  233. static const struct strset empty_set;
  234. return &empty_set;
  235. }
  236. return top;
  237. }
  238. static void destroy(struct strset n)
  239. {
  240. if (!n.u.s[0]) {
  241. if (likely(n.u.n->byte_num != (size_t)-1)) {
  242. destroy(n.u.n->child[0]);
  243. destroy(n.u.n->child[1]);
  244. }
  245. free(n.u.n);
  246. }
  247. }
  248. void strset_destroy(struct strset *set)
  249. {
  250. if (set->u.n)
  251. destroy(*set);
  252. set->u.n = NULL;
  253. }