/* ccan_tokenizer.h */
  /*
  Copyright (c) 2009 Joseph A. Adams
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
  3. The name of the author may not be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
  #ifndef CCAN_TOKENIZER_H
  #define CCAN_TOKENIZER_H

  #include <ccan/darray/darray.h>
  #include "charflag.h"
  #include "dict.h"
  #include "queue.h"
  #include <errno.h> //for readui
  #include <stddef.h> //for size_t
  #include <stdint.h>
  #include <stdio.h> //for FILE
  #include <string.h> //for strlen, memcmp
  /* Definition of tokens and the token list */

  /*
   * Kind of a lexed token.
   *
   * Enumerator order matters: the token_type_is_*() macros below are range
   * checks, so TOK_KEYWORD..TOK_IDENTIFIER, TOK_CCOMMENT..TOK_CPPCOMMENT and
   * TOK_CCOMMENT..TOK_WHITE must stay contiguous.
   */
  enum token_type {
      TOK_INTEGER,       //integer (e.g. 5, 1000L, 0x5)
      TOK_FLOATING,      //floating point number (e.g. 5.0, 7.0f, etc.)
      TOK_OPERATOR,      //operator (e.g. +, -, (, ), ++, etc.)

      /* identifier range (see token_type_is_identifier) */
      TOK_KEYWORD,       //keyword (e.g. char, _Bool, ifdef)
      TOK_IDENTIFIER,    //identifier or unprocessed keyword (e.g. int, token, pp_conditions)

      TOK_CHAR,          //character literal (e.g. 'a' or even '1234')
      TOK_STRING,        //string literal (e.g. "hello" or "zero\0inside")
      TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include)
      TOK_STRING_IQUOTE, // "config.h"
      TOK_STRING_IANGLE, // <stdio.h>

      /* ignored range (see token_type_is_ignored / token_type_is_comment) */
      TOK_CCOMMENT,      //C comment (e.g. /* comment */)
      TOK_CPPCOMMENT,    //C++ comment (e.g. //comment )
      TOK_WHITE,         //whitespace (span of \t\n\v\f\r and space)

      TOK_STARTLINE,     //beginning of line (txt/txtsize is always empty)
      TOK_STRAY,         //control characters, weird characters, and extended characters where they shouldn't be
  };

  /*
   * Range predicates over enum token_type. Each evaluates `type` twice, so
   * do not pass an expression with side effects.
   */
  //nonzero for TOK_KEYWORD and TOK_IDENTIFIER
  #define token_type_is_identifier(type) ((type)>=TOK_KEYWORD && (type)<=TOK_IDENTIFIER)
  //nonzero for TOK_CCOMMENT, TOK_CPPCOMMENT and TOK_WHITE
  #define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE)
  //nonzero for TOK_CCOMMENT and TOK_CPPCOMMENT
  #define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT)
  /*
   * Suffix flags for numeric literals. The single-letter values are distinct
   * bits; the remaining enumerators are named OR-combinations of them.
   */
  enum tok_suffix {
      TOK_NOSUFFIX = 0,

      TOK_U  = 1 << 0, //unsigned
      TOK_L  = 1 << 1, //long or double-precision float
      TOK_LL = 1 << 2, //long long (note that TOK_L and TOK_LL are mutually exclusive)
      TOK_F  = 1 << 3, //float (single-precision)
      TOK_I  = 1 << 4, //imaginary

      TOK_UL  = TOK_U | TOK_L,  //unsigned long
      TOK_ULL = TOK_U | TOK_LL, //unsigned long long

      //Imaginary combo meals
      TOK_IMAG_U   = TOK_I | TOK_U,
      TOK_IMAG_L   = TOK_I | TOK_L,
      TOK_IMAG_LL  = TOK_I | TOK_LL,
      TOK_IMAG_F   = TOK_I | TOK_F,
      TOK_IMAG_UL  = TOK_I | TOK_UL,
      TOK_IMAG_ULL = TOK_I | TOK_ULL,
  };
  71. struct tok_integer {
  72. uint64_t v;
  73. int base; //one of 2, 8, 10, or 16
  74. enum tok_suffix suffix;
  75. };
  76. struct tok_floating {
  77. long double v;
  78. enum tok_suffix suffix;
  79. };
  //Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar:
  //http://www.lysator.liu.se/c/ANSI-C-grammar-y.html
  /*
   * IDs for multi-character operators and for keywords, numbered from 128.
   *
   * DEFINE..WARNING must stay contiguous: opkw_is_directive_only() below is a
   * range check over them.
   *
   * NOTE(review): _BOOL, _COMPLEX and _IMAGINARY begin with an underscore and
   * an uppercase letter, a form ISO C reserves for the implementation. They
   * are kept because renaming would break existing users.
   */
  enum tok_opkw {
      /* Permute these regularly */
      PTR_OP = 128,
      INC_OP,
      DEC_OP,
      LEFT_OP,
      RIGHT_OP,
      LE_OP,
      GE_OP,
      EQ_OP,
      NE_OP,
      AND_OP,
      OR_OP,
      MUL_ASSIGN,
      DIV_ASSIGN,
      MOD_ASSIGN,
      ADD_ASSIGN,
      SUB_ASSIGN,
      AND_ASSIGN,
      XOR_ASSIGN,
      OR_ASSIGN,
      LEFT_ASSIGN,
      RIGHT_ASSIGN,
      ELLIPSIS,
      DOUBLE_POUND,

      //Keywords
      _BOOL,
      _COMPLEX,
      _IMAGINARY,
      BREAK,
      CASE,
      CHAR,
      CONST,
      CONTINUE,
      DEFAULT,
      DO,
      DOUBLE,
      ELSE,
      ENUM,
      EXTERN,
      FLOAT,
      FOR,
      GOTO,
      IF,
      INLINE,
      INT,
      LONG,
      REGISTER,
      RESTRICT,
      RETURN,
      SHORT,
      SIGNED,
      SIZEOF,
      STATIC,
      STRUCT,
      SWITCH,
      TYPEDEF,
      UNION,
      UNSIGNED,
      VOID,
      VOLATILE,
      WHILE,

      //Preprocessor keywords (except those already defined)
      VA_ARGS,

      /* start of the directive-only range */
      DEFINE,
      ELIF,
      //ELSE, (already listed with the keywords above)
      ENDIF,
      ERROR,
      //IF, (already listed with the keywords above)
      IFDEF,
      IFNDEF,
      INCLUDE,
      LINE,
      PRAGMA,
      UNDEF,
      WARNING, /* gcc extension */
  };

  /*
   * Directive predicates over enum tok_opkw. Both evaluate `opkw` more than
   * once, so do not pass an expression with side effects.
   */
  //nonzero for DEFINE..WARNING, i.e. IDs that only name a directive
  #define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING)
  //as above, plus ELSE and IF, which are also ordinary keywords
  #define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF)
  /* Per-token one-bit flags. */
  struct token_flags {
      unsigned short pp:1;           //is token part of a preprocessor line
      unsigned short pp_directive:1; //does token follow a TOK_LEADING_POUND (e.g. # include)
  };
  152. struct token {
  153. struct token *prev, *next;
  154. struct token_flags flags;
  155. short type; //enum token_type
  156. union {
  157. struct tok_integer integer;
  158. struct tok_floating floating;
  159. int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=))
  160. darray_char *string; //applies to TOK_CHAR and TOK_STRING
  161. char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE
  162. };
  163. //text this token represents (with backslash-broken lines merged)
  164. const char *txt;
  165. size_t txt_size;
  166. //text this token represents (untouched)
  167. const char *orig;
  168. size_t orig_size;
  169. //zero-based line and column number of this token
  170. size_t line, col;
  171. };
  172. //keywords such as int, long, etc. may be defined over, making them identifiers in a sense
  173. static inline int token_is_identifier(const struct token *tok) {
  174. return token_type_is_identifier(tok->type);
  175. }
  176. static inline int token_is_ignored(const struct token *tok) {
  177. return token_type_is_ignored(tok->type);
  178. }
  179. static inline int token_is_op(const struct token *tok, int opkw) {
  180. return tok->type==TOK_OPERATOR && tok->opkw==opkw;
  181. }
  182. static inline int token_is_kw(const struct token *tok, int opkw) {
  183. return tok->type==TOK_KEYWORD && tok->opkw==opkw;
  184. }
  185. static inline int token_txt_is(const struct token *tok, const char *str) {
  186. size_t len = strlen(str);
  187. return tok->txt_size==len && !memcmp(tok->txt, str, len);
  188. }
  /* Token list as returned by tokenize(). */
  struct token_list {
      struct token *first;
      struct token *last;

      //Points to original input as given
      const char *orig;
      size_t orig_size;

      //position of the start of each real line with respect to orig
      const char * const *olines;
      size_t olines_size;

      //Copy of original input without backslash-broken lines
      const char *txt;
      size_t txt_size;

      //position of the start of each real line with respect to txt
      const char * const *tlines;
      size_t tlines_size;

      //Set me so tok_message_print will know what file name to display
      const char *filename;
  };
  206. extern struct dict *tokenizer_dict;
  207. typedef queue(struct tok_message) tok_message_queue;
  208. //the token_list is allocated as a child of tcontext
  209. struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size, tok_message_queue *mq);
  210. size_t token_list_count(const struct token_list *tl);
  211. //used for debugging
  212. int token_list_sanity_check(const struct token_list *tl, FILE *err);
  213. void token_list_dump(const struct token_list *tl, FILE *f);
  /* tok_point_lookup is used to locate a pointer that is within a token list's
     txt or orig fields */
  struct tok_point {
      const char *txt;
      const char *orig;
      size_t line;
      size_t col;
  };

  //returns nonzero if the pointer could be resolved
  int tok_point_lookup(struct tok_point *out, const char *ptr,
                       const struct token_list *tl);
  /* Tokenizer message queue; used to gather and report warnings, errors, etc. */
  enum tok_message_level {
      TM_DEBUG,
      TM_INFO,
      TM_WARN,
      TM_ERROR,
      TM_BUG
  };

  /* One queued message. */
  struct tok_message {
      enum tok_message_level level;

      //Unique slash-delimited name of the message
      //e.g. tokenize/read_cstring/ambiguous_octal
      const char *path;

      //Human-readable description
      //e.g. `Octal \007 followed by digit`
      const char *message;

      //Pointer (typically within the token list's txt or orig) of the error
      const char *location;
  };
  236. #define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
  237. #define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
  238. #define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
  239. #define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
  240. #define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
  241. void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
  242. const char *path, const char *loc, const char *fmt, ...);
  243. void tok_message_print(struct tok_message *m, struct token_list *tl);
  244. void tok_message_dump(struct tok_message *m);
  245. void tok_message_queue_dump(const tok_message_queue *mq);
  246. /* Miscellaneous internal components */
  247. char *read_cstring(darray_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq);
  248. char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq);
  /*
   * Base argument for readui(): a numeric base, optionally OR-ed with the
   * READUI_ALLOW* flag bits (as READUI_HEX does).
   */
  typedef unsigned int readui_base;

  #define READUI_ALLOWHIGHERDIGITS 256
  #define READUI_ALLOWCAPLETTERS 512
  #define READUI_ALLOWLCASELETTERS 1024
  #define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS)

  #define READUI_DEC ((readui_base)(10))
  #define READUI_HEX ((readui_base)(16 | READUI_ALLOWLETTERS))
  #define READUI_OCT ((readui_base)(8))
  #define READUI_BIN ((readui_base)(2))

  uint64_t readui(const char **sp, const char *e, readui_base base);
  #endif /* CCAN_TOKENIZER_H */