/* ccan_tokenizer.c */
  1. /*
  2. Copyright (c) 2009 Joseph A. Adams
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions
  6. are met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in the
  11. documentation and/or other materials provided with the distribution.
  12. 3. The name of the author may not be used to endorse or promote products
  13. derived from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  15. IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "ccan_tokenizer.h"
  26. #include <ccan/talloc/talloc.h>
  27. #include <assert.h>
//Dictionary of every operator, keyword, and preprocessor keyword the
// tokenizer recognizes.  Operators are listed by precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .
static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},

//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *

//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},

//4. Additive
	// + -

//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},

//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},

//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},

//8. Bitwise AND
	// &

//9. Bitwise XOR
	{'^',"^"},

//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},

//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},

//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},

//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
#if 0
/* Dead code: an early talloc-based constructor for a stateful tokenizer
 * object.  Kept for reference only; the current entry point is the
 * tokenize() function below, which builds the shared dictionary lazily. */
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));

	return t;
}
#endif
static int talloc_darray_destructor(void *ptr);

/*
 * darray(T) *talloc_darray(const void *context);
 *
 * Create a new darray anchored in a talloc buffer.
 * When this pointer is freed, the darray will be freed as well.
 */
static void *talloc_darray(const void *context)
{
	void *ret = talloc(context, darray(void));
	darray_init(*(darray(void)*)ret);
	//the destructor releases the darray's malloc'd element buffer,
	// which talloc does not track
	talloc_set_destructor(ret, talloc_darray_destructor);
	return ret;
}
  159. static int talloc_darray_destructor(void *ptr)
  160. {
  161. darray(void) *arr = ptr;
  162. free(arr->item);
  163. return 0;
  164. }
#define MESSAGE_PATH "tokenize/"

/*
 * Undo backslash-newline line splices in the original document.
 *
 * Reads tl->orig / tl->orig_size and fills in:
 *   tl->txt, tl->txt_size       - copy of the text with backslash-broken
 *                                  lines joined (NUL-terminated)
 *   tl->olines, tl->olines_size - start of each line in the original text
 *   tl->tlines, tl->tlines_size - start of the corresponding line in txt
 * Warnings (e.g. trailing spaces after a backslash break) go to mq.
 * All buffers are talloc children of tl.
 */
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	darray_char *txt = talloc_darray(tl);
	darray(const char*) *olines = talloc_darray(tl);
	darray(const char*) *tlines = talloc_darray(tl);

	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt->size;

		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}

		//add the backslash-break-free version of the text
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//line ends in a backslash: append it without the
			// backslash or the newline, splicing it to the next line
			darray_append_items(*txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			darray_append_items(*txt, line_start, s-line_start);

		//add the line starts for this line
		darray_append(*olines, line_start);
		darray_append(*tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			// for now.  Once we're done building txt, we can add the base
			// of it to all the offsets to make them pointers.
	} while (s<e);

	//stick a null terminator at the end of the text
	darray_realloc(*txt, txt->size+1);
	txt->item[txt->size] = 0;

	//convert the line start offsets to pointers
	{
		const char **i;
		darray_foreach(i, *tlines)
			*i = txt->item + (size_t)(*i);
	}

	tl->olines = olines->item;
	tl->olines_size = olines->size;
	tl->txt = txt->item;
	tl->txt_size = txt->size;
	tl->tlines = tlines->item;
	tl->tlines_size = tlines->size;
}
  218. static void normal_keyword(struct token *tok) {
  219. if (tok->type==TOK_KEYWORD &&
  220. (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
  221. tok->type = TOK_IDENTIFIER;
  222. }
  223. static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
  224. while (end>start && token_is_ignored(end-1)) end--;
  225. return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
  226. }
//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;

	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	//TODO: check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);

	//see if this is actually a variadic macro
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	//demote keywords inside the parameter list while scanning for ')'
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;

	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers,
	// leaving __VA_ARGS__ intact
	for(; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}
	//falls through with i==end, so the loop below is a no-op here

not_va_args:
	//non-variadic macro: demote __VA_ARGS__ too
	while (i < end)
		normal_keyword(i++);
}
  255. //fill the flags field of each token and untangle keywords and such
  256. static void finalize_line(struct token *start, struct token *end) {
  257. struct token *i = start, *j;
  258. assert(start<end && start->type==TOK_STARTLINE);
  259. i++;
  260. while (i<end && token_is_ignored(i)) i++;
  261. if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
  262. //preprocessor line
  263. i->type = TOK_LEADING_POUND;
  264. //set pp on all tokens in this line
  265. for (j=start; j<end; j++)
  266. j->flags.pp = 1;
  267. //find the relevant token after the '#'
  268. for (i++; i<end; i++) {
  269. if (!token_is_ignored(i)) {
  270. i->flags.pp_directive = 1;
  271. if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
  272. i->type = TOK_IDENTIFIER;
  273. //TODO: Handle invalid preprocessor directives (e.g. #+ )
  274. if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
  275. for (j=i+1; j<end; j++)
  276. this_is_a_define(i+1, end);
  277. } else {
  278. while (++i < end)
  279. normal_keyword(i);
  280. }
  281. break;
  282. }
  283. }
  284. } else {
  285. //normal line
  286. while (i < end)
  287. normal_keyword(i++);
  288. }
  289. }
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//Requires a non-empty token array [start, end) whose first token is a
// TOK_STARTLINE (guaranteed by tokenize()).
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;

	assert(start < end);
	tl->first = start;
	tl->last = end-1;

	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		if (i >= end) {
			//the last token's orig extends to the end of the document
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}

		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;

		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;

		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		assert(i->line < tl->olines_size);

		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
//Append a token covering [orig, s) to *arr.  Extra designated
// initializers (e.g. .type, .opkw) fill in the remaining fields.
//Relies on the locals `orig`, `s`, and `arr` of tokenize() below.
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		darray_append(*arr, tok); \
	} while (0)

//Characters that should never appear in C source text.
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters that may appear in an identifier.
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier

//Operator/keyword dictionary shared by all tokenize() calls; built
// lazily on first use and released via atexit(free_tokenizer_dict).
struct dict *tokenizer_dict = NULL;
  338. static void free_tokenizer_dict(void) {
  339. talloc_free(tokenizer_dict);
  340. }
/*
 * Tokenize a C document.
 *
 * orig/orig_size is the raw document text; it is referenced (not copied)
 * and must outlive the returned list.  Diagnostics are posted to mq.
 * The returned token list and all its buffers are talloc children of
 * tcontext.
 */
struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(tcontext, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	darray(struct token) *arr = talloc_darray(tl);
	int only_pound_include = 0;

	//build the shared operator/keyword dictionary on first use
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}

	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;

	//lex the backslash-spliced text, not the raw original
	s = tl->txt;
	e = s + tl->txt_size;

	//every line (including the first) begins with a TOK_STARTLINE marker
	darray_appends_t(*arr, struct token, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );

	while (s<e) {
		const char *orig = s;
		char c = *s++;
		int added_something = 1;

		if (cstray(c)) {
			//coalesce a run of stray characters into one TOK_STRAY
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);

			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);

		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;

			add(.type = TOK_WHITE);
			orig = s;

			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);

			only_pound_include = 0;

		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;

		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//numeric constant (including ".5"-style floats)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			darray_append(*arr, tok);

		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				add(.type = type,
					{.include = include});

			} else if (c=='\'' || c=='\"') { //character or string literal
				darray_char *string = talloc_darray(tl);
				s = read_cstring(string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
					{.string = string});

				if (c=='\'' && string->size==0) {
					tok_msg_error(empty_char_constant, orig,
						"Empty character constant");
				}

			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							//ran out of text before "*/"
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;

			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;

				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//a dictionary hit only counts as a keyword if it
					// spans the entire identifier
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//scan back: ignored*, '#', ignored*, TOK_STARTLINE
							struct token *ts = arr->item;
							struct token *tp = ts+arr->size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									//skip the reset below so the
									// include-string case stays armed
									continue;
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
						{.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		if (added_something)
			only_pound_include = 0;
	}

	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/

	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}

	finalize(tl, arr->item, arr->item+arr->size);

	return tl;
}
  503. size_t token_list_count(const struct token_list *tl) {
  504. size_t ret = 0;
  505. const struct token *i;
  506. for (i=tl->first; i; i=i->next)
  507. ret++;
  508. return ret;
  509. }
  510. static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
  511. const char * const *orig = lines;
  512. const char * const *orig_e = lines+line_count;
  513. while (line_count > 1) {
  514. size_t middle = line_count>>1;
  515. if (ptr < lines[middle])
  516. line_count = middle;
  517. else {
  518. lines += middle;
  519. line_count -= middle;
  520. }
  521. }
  522. //select the *last* of equivalent lines
  523. while (lines+1 < orig_e && lines[0]==lines[1])
  524. lines++;
  525. // (don't) select the *first* of equivalent lines
  526. //while (lines>orig && lines<orig_e && lines[-1]==lines[0])
  527. // lines--;
  528. return lines - orig;
  529. }
  530. int tok_point_lookup(struct tok_point *out, const char *ptr,
  531. const struct token_list *tl) {
  532. size_t line_count = tl->olines_size;
  533. memset(out, 0, sizeof(*out));
  534. if (!tl)
  535. return 0;
  536. if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
  537. out->txt = ptr;
  538. out->line = find_line(ptr, tl->tlines, line_count);
  539. if (out->line < line_count) {
  540. out->col = ptr - tl->tlines[out->line];
  541. out->orig = tl->olines[out->line] + out->col;
  542. } else {
  543. out->col = 0;
  544. out->orig = tl->orig + tl->orig_size;
  545. }
  546. return 1;
  547. } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
  548. out->orig = ptr;
  549. out->line = find_line(ptr, tl->olines, line_count);
  550. if (out->line < line_count) {
  551. const char *tline_start = tl->tlines[out->line];
  552. const char *tline_end = out->line+1 < line_count ?
  553. tl->tlines[out->line+1] :
  554. tl->txt + tl->txt_size;
  555. out->col = ptr - tl->olines[out->line];
  556. out->txt = tline_start + out->col;
  557. if (out->txt > tline_end)
  558. out->txt = tline_end;
  559. } else {
  560. out->col = 0;
  561. out->txt = tl->txt + tl->txt_size;
  562. }
  563. return 1;
  564. } else {
  565. return 0;
  566. }
  567. }
  568. static char *escape_string(darray_char *buf, const char *str, size_t size) {
  569. const char *s = str, *e = s+size;
  570. darray_from_lit(*buf, "");
  571. for (;s<e;s++) {
  572. char buffer[8];
  573. const char *esc = buffer;
  574. unsigned char c = (unsigned char)*s;
  575. if (ccontrol(c))
  576. sprintf(buffer, "\\x%02X", c);
  577. else switch(c) {
  578. case '\t': esc = "\\t"; break;
  579. case '\n': esc = "\\n"; break;
  580. case '\v': esc = "\\v"; break;
  581. case '\f': esc = "\\f"; break;
  582. case '\r': esc = "\\r"; break;
  583. case '"': esc = "\\\""; break;
  584. case '\\': esc = "\\\\"; break;
  585. default:
  586. buffer[0] = c;
  587. buffer[1] = 0;
  588. }
  589. darray_append_string(*buf, esc);
  590. }
  591. return buf->item;
  592. }
//Verify that txt and orig contain identical text except that orig may
// additionally contain backslash-newline line splices (optionally with
// trailing spaces before the newline).  Returns 1 on match, 0 otherwise.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;

	do {
		const char *ob = os; //start of next backslash break
		const char *obe; //end of next backslash break
		size_t size; //amount of text to compare for this round

		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//skip the second half of a two-character newline
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		size = ob-os;

		//compare the text up to (but not including) the splice
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe; //resume after the splice
	} while (ts<te);

	//both cursors must be exhausted simultaneously
	if (ts != te || os != oe)
		return 0;

	return 1;
}
  622. static int is_backslash_break(const char **end, const char *s, const char *e) {
  623. if (s<e && *s == '\\') {
  624. s++;
  625. while (s<e && cspace(*s)) s++;
  626. if (s<e && creturn(*s)) {
  627. s++;
  628. if (s<e && *s=='\n'+'\r'-s[-1])
  629. s++;
  630. *end = s;
  631. return 1;
  632. }
  633. return 0;
  634. }
  635. return 0;
  636. }
//Report a failure to err and bail out of the enclosing check function.
#define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)

//tests that should pass on an untainted token list out of the tokenize() function
static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	const char *txt=tl->txt, *orig=tl->orig;
	const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;

	//first/last must bound a contiguous array of struct token
	if ((char*)first > (char*)last ||
		(size_t)((char*)last - (char*)first) % sizeof(struct token))
		failed("Token list pointers don't look right");

	//token list should not end with TOK_STARTLINE unless
	// the document is empty
	if (last!=first && last->type==TOK_STARTLINE)
		return 0;

	for (i=first; i; i=i->next) {
		//Verify list links
		if (i != first && i->prev != i-1)
			failed("list.prev is incorrect");
		if (i != last && i->next != i+1)
			failed("list.next is incorrect");

		//Make sure txt segments fill the entire tl->txt
		if (i->txt != txt)
			failed("txt does not fill the token list");
		txt += i->txt_size;
		if (txt > txt_e)
			failed("txt is out of bounds");

		//Make sure orig segments fill the entire tl->orig
		if (i->orig != orig)
			failed("orig does not fill the token list");
		orig += i->orig_size;
		if (orig > orig_e)
			failed("orig is out of bounds");
	}

	//both buffers must be exactly covered by the tokens
	if (txt != txt_e)
		return 0;
	if (orig != orig_e)
		return 0;

	return 1;
}
//Thorough consistency check of a token list: list structure, per-token
// line/col/txt/orig agreement, tok_point_lookup correctness for every
// position, and olines/tlines integrity.  Returns 1 if sane, 0 (with a
// message on err for most failures) otherwise.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	//NOTE(review): `initial` is always 1 here — presumably meant to be
	// cleared for lists modified after tokenize(); confirm intent
	int initial = 1;

	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");

	if (first->type!=TOK_STARTLINE ||
		first->txt!=tl->txt || first->txt_size!=0 ||
		first->orig!=tl->orig || first->orig_size!=0 ||
		first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");

	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");

	for (i=first; i; i=i->next) {
		//Verify line,col
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");

		//Make sure tokens have proper sizes
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");

		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}

		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			darray_char buf = darray_new();
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			darray_free(buf);
			return 0;
		}

		//Make sure tok_point_lookup returns correct point
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;

			//check one position: tok_point_lookup(ptr) must report this
			// token's txt/orig/line/col; on the initial list a lookup
			// failure is itself an error
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
							(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
							(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)

			for (;;) {
				//positions inside a backslash splice exist only in orig
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//newline present in both buffers
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						//ordinary character present in both buffers
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}

			#undef check
		}
	};

	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;

		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;

		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;

			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];

			//advance s past this original line (and its newline)
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			oline_size = s-line_start;

			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;

			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;

			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;

			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}

			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}

	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed. Has the list been modified after it was returned from tokenize() ?");

	return 1;
}

#undef failed
  827. static char *sprint_token_flags(char buf[3], struct token_flags flags) {
  828. buf[0] = flags.pp ? 'p' : '-';
  829. buf[1] = flags.pp_directive ? 'D' : '-';
  830. buf[2] = 0;
  831. return buf;
  832. }
//Dump every token to f, one per line: index, type name, flags, txt
// (and, when enabled below, orig).  Intended for debugging.
void token_list_dump(const struct token_list *tl, FILE *f) {
	struct token *tok;
	darray_char buf = darray_new();
	size_t i = 0;
	char buf2[8];
	//indexed by token type; order must match the TOK_* enum
	const char *token_type_str[] = {
		"TOK_INTEGER ",
		"TOK_FLOATING ",
		"TOK_OPERATOR ",
		"TOK_KEYWORD ",
		"TOK_IDENTIFIER ",
		"TOK_CHAR ",
		"TOK_STRING ",
		"TOK_LEADING_POUND",
		"TOK_STRING_IQUOTE",
		"TOK_STRING_IANGLE",
		"TOK_CCOMMENT ",
		"TOK_CPPCOMMENT ",
		"TOK_WHITE ",
		"TOK_STARTLINE ",
		"TOK_STRAY "
	};

	for (tok=tl->first; tok; tok=tok->next) {
		fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
			token_type_str[tok->type],
			sprint_token_flags(buf2, tok->flags),
			escape_string(&buf, tok->txt, tok->txt_size));
		#if 1 //print tok->orig
		fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
		#else
		fprintf(f, "\n");
		#endif
	}

	darray_free(buf);
}
  868. void tok_message_print(struct tok_message *m, struct token_list *tl) {
  869. struct tok_point pt;
  870. int resolved = tok_point_lookup(&pt, m->location, tl);
  871. if (tl->filename) {
  872. printf("%s:%s", tl->filename, resolved ? "" : " ");
  873. }
  874. if (resolved) {
  875. printf("%zu:%zu %s: %s\n",
  876. pt.line+1, pt.col+1,
  877. m->level==TM_DEBUG ? "debug" :
  878. m->level==TM_INFO ? "info" :
  879. m->level==TM_WARN ? "warning" :
  880. m->level==TM_ERROR ? "error" :
  881. m->level==TM_BUG ? "BUG" :
  882. "???",
  883. m->message);
  884. } else {
  885. printf("%s: %s\n",
  886. m->level==TM_DEBUG ? "debug" :
  887. m->level==TM_INFO ? "info" :
  888. m->level==TM_WARN ? "warning" :
  889. m->level==TM_ERROR ? "error" :
  890. m->level==TM_BUG ? "BUG" :
  891. "???",
  892. m->message);
  893. }
  894. }
  895. void tok_message_dump(struct tok_message *m) {
  896. printf("%s: %s: %s\n",
  897. m->level==TM_DEBUG ? "debug" :
  898. m->level==TM_INFO ? "info" :
  899. m->level==TM_WARN ? "warning" :
  900. m->level==TM_ERROR ? "error" :
  901. m->level==TM_BUG ? "BUG" :
  902. "???", m->path, m->message);
  903. }
/*
 * Append a printf-formatted message to the queue.  No-op if mq is NULL.
 * path names the message site (typically built with MESSAGE_PATH); loc
 * points into the tokenized text and is resolved to line/col later.
 * The formatted string is talloc'd against the queue's buffer.
 */
void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
	const char *path, const char *loc, const char *fmt, ...) {
	struct tok_message msg = {.level=level, .path=path, .location=loc};
	va_list ap;

	if (!mq)
		return;

	va_start(ap, fmt);
	msg.message = talloc_vasprintf(mq->item, fmt, ap);
	va_end(ap);

	enqueue(*mq, msg);
}
  915. void tok_message_queue_dump(const tok_message_queue *mq) {
  916. size_t i;
  917. for (i=0; i<queue_count(*mq); i++)
  918. tok_message_dump(&queue_item(*mq, i));
  919. }
  920. #undef add
  921. #undef cstray
  922. #undef cident