read_cnumber.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. //for strtold
  2. #define _ISOC99_SOURCE
  3. #include <stdlib.h>
  4. #undef _ISOC99_SOURCE
  5. #include "ccan_tokenizer.h"
  /* Fallback for toolchains whose headers do not provide ULLONG_MAX. */
  #if !defined(ULLONG_MAX)
  #define ULLONG_MAX 18446744073709551615ULL
  #endif
  9. static const char *skipnum(const char *s, const char *e, readui_base base) {
  10. for (;s<e;s++) {
  11. unsigned int c = (unsigned char)*s;
  12. if (cdigit(c)) {
  13. if ( c-'0' >= (base & 0xFF) &&
  14. !(base & READUI_ALLOWHIGHERDIGITS) )
  15. break;
  16. } else if (c>='A' && c<='Z') {
  17. if (!(base & READUI_ALLOWCAPLETTERS))
  18. break;
  19. if ( c-'A'+10 >= (base & 0xFF) &&
  20. !(base & READUI_ALLOWHIGHERDIGITS))
  21. break;
  22. } else if (c>='a' && c<='z') {
  23. if (!(base & READUI_ALLOWLCASELETTERS))
  24. break;
  25. if ( c-'a'+10 >= (base & 0xFF) &&
  26. !(base & READUI_ALLOWHIGHERDIGITS))
  27. break;
  28. } else
  29. break;
  30. }
  31. return s;
  32. }
  33. static uint64_t readui_valid(const char *s, const char *e, readui_base base) {
  34. uint64_t ret = 0;
  35. uint64_t multiplier = 1;
  36. uint64_t digit_value;
  37. //64-bit multiplication with overflow checking
  38. #define multiply(dest, src) do { \
  39. uint32_t a0 = (uint64_t)(dest) & 0xFFFFFFFF; \
  40. uint32_t a1 = (uint64_t)(dest) >> 32; \
  41. uint32_t b0 = (uint64_t)(src) & 0xFFFFFFFF; \
  42. uint32_t b1 = (uint64_t)(src) >> 32; \
  43. uint64_t a, b; \
  44. \
  45. if (a1 && b1) \
  46. goto overflowed; \
  47. a = (uint64_t)a1*b0 + (uint64_t)a0*b1; \
  48. if (a > 0xFFFFFFFF) \
  49. goto overflowed; \
  50. a <<= 32; \
  51. b = (uint64_t)a0*b0; \
  52. \
  53. if (a+b < a) \
  54. goto overflowed; \
  55. (dest) = a+b; \
  56. } while(0)
  57. if (s >= e || ((base&0xFF) < 1)) {
  58. errno = EINVAL;
  59. return 0;
  60. }
  61. while (s<e && *s=='0') s++;
  62. if (e > s) {
  63. for (;;) {
  64. char c = *--e;
  65. //this series of if statements takes advantage of the fact that 'a'>'A'>'0'
  66. if (c >= 'a')
  67. c -= 'a'-10;
  68. else if (c >= 'A')
  69. c -= 'A'-10;
  70. else
  71. c -= '0';
  72. digit_value = c;
  73. //TODO: Write/find a testcase where temp *= multiplier does overflow
  74. multiply(digit_value, multiplier);
  75. if (ret+digit_value < ret)
  76. goto overflowed;
  77. ret += digit_value;
  78. if (e <= s)
  79. break;
  80. multiply(multiplier, base & 0xFF);
  81. }
  82. }
  83. errno = 0;
  84. return ret;
  85. overflowed:
  86. errno = ERANGE;
  87. return ULLONG_MAX;
  88. #undef multiply
  89. }
  90. uint64_t readui(const char **sp, const char *e, readui_base base) {
  91. const char *s = *sp;
  92. while (s<e && cwhite(*s)) s++;
  93. e = skipnum(s, e, base);
  94. *sp = e;
  95. return readui_valid(s, e, base);
  96. }
  /* NOTE(review): presumably consumed by the tok_msg_* macros used below as a
   * message-name prefix -- confirm against their definitions. */
  #define MESSAGE_PATH "tokenize/read_cnumber/"

  /*
   * Component boundaries of one scanned number token.
   *
   * Every pointer marks the first character of its component, so a component
   * runs up to the next pointer. Example breakdown for "0x50.1p+1f":
   */
  struct scan_number {
      const char *prefix;   /* "0x"   */
      const char *digits;   /* "50.1" */
      const char *exponent; /* "p+1"  */
      const char *suffix;   /* "f"    */
      const char *end;      /* one past the last character of the token */
      size_t dots_found;    /* 1 ('.' characters seen among the digits) */
  };
  110. /*
  111. * Scans past all the characters in a number token, fills the struct, and
  112. * returns one of TOK_INTEGER or TOK_FLOATING to indicate the type.
  113. *
  114. * First character must be [0-9 '.']
  115. */
  116. static enum token_type scan_number(struct scan_number *sn,
  117. const char *s, const char *e) {
  118. enum token_type type;
  119. sn->dots_found = 0;
  120. sn->prefix = s;
  121. sn->digits = s;
  122. if (s+3<=e && s[0]=='0') {
  123. if (s[1]=='X' || s[1]=='x') {
  124. //hexadecimal
  125. s += 2;
  126. sn->digits = s;
  127. for (;s<e;s++) {
  128. if (*s == '.')
  129. sn->dots_found++;
  130. else if (!chex(*s))
  131. break;
  132. }
  133. goto done_scanning_digits;
  134. } else if (s[1]=='B' || s[1]=='b') {
  135. //binary
  136. s += 2;
  137. if (*s!='0' && *s!='1')
  138. s -= 2;
  139. sn->digits = s;
  140. }
  141. }
  142. //binary, decimal, or octal
  143. for (;s<e;s++) {
  144. if (*s == '.')
  145. sn->dots_found++;
  146. else if (!cdigit(*s))
  147. break;
  148. }
  149. done_scanning_digits:
  150. sn->exponent = s;
  151. if (s<e && (
  152. (sn->prefix==sn->digits && (*s=='E' || *s=='e')) ||
  153. (sn->prefix < sn->digits && (*s=='P' || *s=='p'))
  154. )) {
  155. s++;
  156. if (s<e && (*s=='+' || *s=='-'))
  157. s++;
  158. while (s<e && cdigit(*s)) s++;
  159. }
  160. sn->suffix = s;
  161. while (s<e && (cdigit(*s) || cletter(*s) ||
  162. *s=='.' || *s=='_' || *s=='$')) s++;
  163. sn->end = s;
  164. //Now we're done scanning, but now we want to know what type this is
  165. type = TOK_INTEGER;
  166. if (sn->dots_found)
  167. type = TOK_FLOATING;
  168. if (sn->exponent < sn->suffix)
  169. type = TOK_FLOATING;
  170. //if this is an octal, make the leading 0 a prefix
  171. if (type==TOK_INTEGER && sn->prefix==sn->digits &&
  172. sn->digits < s && sn->digits[0]=='0')
  173. sn->digits++;
  174. return type;
  175. }
  176. static enum tok_suffix read_number_suffix(const char *s, const char *e,
  177. enum token_type type, tok_message_queue *mq) {
  178. const char *orig_s = s;
  179. enum tok_suffix sfx = 0;
  180. //read the suffix in pieces
  181. while (s<e) {
  182. enum tok_suffix sfx_prev = sfx;
  183. char c = *s++;
  184. if (c>='a' && c<='z')
  185. c -= 'a'-'A';
  186. if (c=='L') {
  187. if (s<e && (*s=='L' || *s=='l')) {
  188. s++;
  189. sfx |= TOK_LL;
  190. //TOK_L and TOK_LL are mutually exclusive
  191. if (sfx & TOK_L)
  192. goto invalid;
  193. } else {
  194. sfx |= TOK_L;
  195. }
  196. }
  197. else if (c=='U')
  198. sfx |= TOK_U;
  199. else if (c=='F')
  200. sfx |= TOK_F;
  201. else if (c=='I')
  202. sfx |= TOK_I;
  203. else
  204. goto invalid;
  205. if (sfx == sfx_prev)
  206. goto invalid; //suffix piece was repeated
  207. }
  208. //make sure the suffix is appropriate for this number type
  209. if (type==TOK_INTEGER && (sfx & TOK_F)) {
  210. tok_msg_error(suffix_float_only, orig_s,
  211. "Suffix only valid for floating point numbers");
  212. sfx = TOK_NOSUFFIX;
  213. }
  214. if (type==TOK_FLOATING && (sfx & (TOK_U | TOK_LL))) {
  215. tok_msg_error(suffix_integer_only, orig_s,
  216. "Suffix only valid for integers");
  217. sfx = TOK_NOSUFFIX;
  218. }
  219. return sfx;
  220. invalid:
  221. if (type==TOK_INTEGER)
  222. tok_msg_error(integer_suffix_invalid, orig_s,
  223. "Integer suffix invalid");
  224. else
  225. tok_msg_error(floating_suffix_invalid, orig_s,
  226. "Floating point suffix invalid");
  227. return TOK_NOSUFFIX;
  228. }
  229. static void read_integer(struct tok_integer *out, const struct scan_number *sn,
  230. tok_message_queue *mq) {
  231. /*
  232. Assertions about an integer's struct scan_number:
  233. prefix is empty or [0 0B 0b 0X 0x]
  234. sn->digits is not empty (i.e. sn->digits < sn->exponent)
  235. *unless* the prefix is "0"
  236. has no exponent
  237. suffix is [0-9 A-Z a-z '.']*
  238. dots_found == 0
  239. */
  240. readui_base base = READUI_DEC;
  241. const char *tokstart = sn->prefix;
  242. const char *s = sn->digits, *e = sn->exponent;
  243. if (sn->prefix+1 < sn->digits) {
  244. if (sn->prefix[1]=='X' || sn->prefix[1]=='x')
  245. base = READUI_HEX;
  246. else
  247. base = READUI_OCT;
  248. } else if (sn->prefix < sn->digits) {
  249. base = READUI_OCT;
  250. }
  251. if (s>=e && base==READUI_OCT) {
  252. //octal contains no digits
  253. out->v = 0;
  254. out->base = 8;
  255. goto suffix;
  256. }
  257. out->v = readui(&s, sn->exponent, base);
  258. out->base = base & 0xFF;
  259. if (s != e || errno == EINVAL) {
  260. tok_msg_error(integer_invalid_digits, tokstart,
  261. "Integer constant contains invalid digits");
  262. } else if (errno) {
  263. if (errno == ERANGE) {
  264. tok_msg_error(integer_out_of_range, tokstart,
  265. "Integer constant out of range");
  266. } else {
  267. tok_msg_bug(readui_unknown, tokstart,
  268. "Unknown error returned by readui");
  269. }
  270. }
  271. suffix:
  272. out->suffix =
  273. read_number_suffix(sn->suffix, sn->end, TOK_INTEGER, mq);
  274. return;
  275. }
  276. static void read_floating(struct tok_floating *out, const struct scan_number *sn,
  277. tok_message_queue *mq) {
  278. /*
  279. Assertions about a float's struct scan_number:
  280. prefix is empty or [0B 0b 0X 0x] (note: no octal prefix 0)
  281. sn->digits not empty, ever
  282. exponent may or may not exist
  283. If exponent exists, it is valid and formatted as:
  284. ( [E P e p] ['+' '-']*0..1 [0-9]* )
  285. An exponent starts with E if this is decimal, P if it is hex/binary
  286. suffix is [0-9 A-Z a-z '.']*
  287. dots_found can be anything
  288. */
  289. const char *tokstart = sn->prefix;
  290. const char *s = sn->prefix, *e = sn->suffix;
  291. char borrow = *sn->end;
  292. //long double strtold(const char *nptr, char **endptr);
  293. out->v = 0.0;
  294. out->suffix = TOK_NOSUFFIX;
  295. if (sn->prefix < sn->digits) {
  296. if (sn->prefix[1]=='B' || sn->prefix[1]=='b') {
  297. tok_msg_error(binary_float, tokstart,
  298. "Binary floating point constants not allowed");
  299. return;
  300. }
  301. if (sn->exponent >= sn->suffix) {
  302. tok_msg_error(hex_float_no_exponent, tokstart,
  303. "Hex floating point constant missing exponent");
  304. return;
  305. }
  306. }
  307. /* Stick a null terminator at the end of the input so strtold
  308. * won't read beyond the given input.
  309. *
  310. * This is thread-safe because the input is from
  311. * token_list.txt, which was generated in the
  312. * tokenize function which is still running.
  313. */
  314. *(char*)sn->end = 0;
  315. errno = 0;
  316. out->v = strtold(s, (char**)&s);
  317. //don't forget to set it back
  318. *(char*)sn->end = borrow;
  319. if (errno) {
  320. //for some reason, strtold may errno to EDOM to indicate underrun
  321. //open test/run.c and search "floating_out_of_range" for more details
  322. if (errno == ERANGE || errno == EDOM) {
  323. tok_msg_error(floating_out_of_range, tokstart,
  324. "Floating point constant out of range");
  325. } else {
  326. tok_msg_bug(strtold_unknown, tokstart,
  327. "Unknown error returned by strtold");
  328. }
  329. }
  330. if (s != e) {
  331. tok_msg_error(floating_invalid_digits, tokstart,
  332. "Floating point constant contains invalid digits");
  333. }
  334. out->suffix =
  335. read_number_suffix(sn->suffix, sn->end, TOK_FLOATING, mq);
  336. }
  337. char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq) {
  338. struct scan_number sn;
  339. tok->type = scan_number(&sn, s, e);
  340. if (tok->type == TOK_INTEGER)
  341. read_integer(&tok->integer, &sn, mq);
  342. else
  343. read_floating(&tok->floating, &sn, mq);
  344. return (char*)sn.end;
  345. }
  346. #undef MESSAGE_PATH