/* ccan_tokenizer.c */
  1. /*
  2. Copyright (c) 2009 Joseph A. Adams
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions
  6. are met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in the
  11. documentation and/or other materials provided with the distribution.
  12. 3. The name of the author may not be used to endorse or promote products
  13. derived from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  15. IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "ccan_tokenizer.h"
  26. #include <ccan/talloc/talloc.h>
  27. #include <assert.h>
//Dictionary of every operator, keyword, and preprocessor keyword the
// tokenizer recognizes.  Operators are listed by precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .
static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},

//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *

//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},

//4. Additive
	// + -

//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},

//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},

//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},

//8. Bitwise AND
	// &

//9. Bitwise XOR
	{'^',"^"},

//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},

//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},

//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},

//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
#if 0
/* Dead code: an early talloc-based constructor for a stateful tokenizer
 * object.  Kept for reference only; the current entry point is the
 * tokenize() function below, which builds the shared dictionary lazily. */
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));

	return t;
}
#endif
static int talloc_darray_destructor(void *ptr);

/*
 * darray(T) *talloc_darray(const void *context);
 *
 * Create a new darray anchored in a talloc buffer.
 * When this pointer is freed, the darray will be freed as well.
 */
static void *talloc_darray(const void *context)
{
	void *ret = talloc(context, darray(void));
	darray_init(*(darray(void)*)ret);
	//the destructor releases the darray's malloc'd element buffer,
	// which talloc does not track
	talloc_set_destructor(ret, talloc_darray_destructor);
	return ret;
}
  159. static int talloc_darray_destructor(void *ptr)
  160. {
  161. darray(void) *arr = ptr;
  162. free(arr->item);
  163. return 0;
  164. }
#define MESSAGE_PATH "tokenize/"

/*
 * Undo backslash-newline line splices in the original document.
 *
 * Reads tl->orig / tl->orig_size and fills in:
 *   tl->txt, tl->txt_size       - copy of the text with backslash-broken
 *                                  lines joined (NUL-terminated)
 *   tl->olines, tl->olines_size - start of each line in the original text
 *   tl->tlines, tl->tlines_size - start of the corresponding line in txt
 * Warnings (e.g. trailing spaces after a backslash break) go to mq.
 * All buffers are talloc children of tl.
 */
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	darray_char *txt = talloc_darray(tl);
	darray(const char*) *olines = talloc_darray(tl);
	darray(const char*) *tlines = talloc_darray(tl);

	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt->size;

		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}

		//add the backslash-break-free version of the text
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//line ends in a backslash: append it without the
			// backslash or the newline, splicing it to the next line
			darray_append_items(*txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			darray_append_items(*txt, line_start, s-line_start);

		//add the line starts for this line
		darray_append(*olines, line_start);
		darray_append(*tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			// for now.  Once we're done building txt, we can add the base
			// of it to all the offsets to make them pointers.
	} while (s<e);

	//stick a null terminator at the end of the text
	darray_realloc(*txt, txt->size+1);
	txt->item[txt->size] = 0;

	//convert the line start offsets to pointers
	{
		const char **i;
		darray_foreach(i, *tlines)
			*i = txt->item + (size_t)(*i);
	}

	tl->olines = olines->item;
	tl->olines_size = olines->size;
	tl->txt = txt->item;
	tl->txt_size = txt->size;
	tl->tlines = tlines->item;
	tl->tlines_size = tlines->size;
}
  218. static void normal_keyword(struct token *tok) {
  219. if (tok->type==TOK_KEYWORD &&
  220. (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
  221. tok->type = TOK_IDENTIFIER;
  222. }
  223. static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
  224. while (end>start && token_is_ignored(end-1)) end--;
  225. return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
  226. }
//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;

	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	//TODO: check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);

	//see if this is actually a variadic macro
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	//demote keywords inside the parameter list while scanning for ')'
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;

	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers,
	// leaving __VA_ARGS__ intact
	for(; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}
	//falls through with i==end, so the loop below is a no-op here

not_va_args:
	//non-variadic macro: demote __VA_ARGS__ too
	while (i < end)
		normal_keyword(i++);
}
  255. //fill the flags field of each token and untangle keywords and such
  256. static void finalize_line(struct token *start, struct token *end) {
  257. struct token *i = start, *j;
  258. assert(start<end && start->type==TOK_STARTLINE);
  259. i++;
  260. while (i<end && token_is_ignored(i)) i++;
  261. if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
  262. //preprocessor line
  263. i->type = TOK_LEADING_POUND;
  264. //set pp on all tokens in this line
  265. for (j=start; j<end; j++)
  266. j->flags.pp = 1;
  267. //find the relevant token after the '#'
  268. for (i++; i<end; i++) {
  269. if (!token_is_ignored(i)) {
  270. i->flags.pp_directive = 1;
  271. if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
  272. i->type = TOK_IDENTIFIER;
  273. //TODO: Handle invalid preprocessor directives (e.g. #+ )
  274. if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
  275. for (j=i+1; j<end; j++)
  276. this_is_a_define(i+1, end);
  277. } else {
  278. while (++i < end)
  279. normal_keyword(i);
  280. }
  281. break;
  282. }
  283. }
  284. } else {
  285. //normal line
  286. while (i < end)
  287. normal_keyword(i++);
  288. }
  289. }
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//Requires a non-empty token array [start, end) whose first token is a
// TOK_STARTLINE (guaranteed by tokenize()).
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;

	assert(start < end);
	tl->first = start;
	tl->last = end-1;

	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		if (i >= end) {
			//the last token's orig extends to the end of the document
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}

		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;

		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;

		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		assert(i->line < tl->olines_size);

		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
//Append a token covering [orig, s) to *arr.  Extra designated
// initializers (e.g. .type, .opkw) fill in the remaining fields.
//Relies on the locals `orig`, `s`, and `arr` of tokenize() below.
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		darray_append(*arr, tok); \
	} while (0)

//Characters that should never appear in C source text.
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters that may appear in an identifier.
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier

//Operator/keyword dictionary shared by all tokenize() calls; built
// lazily on first use and released via atexit(free_tokenizer_dict).
struct dict *tokenizer_dict = NULL;
  338. static void free_tokenizer_dict(void) {
  339. talloc_free(tokenizer_dict);
  340. }
/*
 * Tokenize a C document.
 *
 * orig/orig_size is the raw document text; it is referenced (not copied)
 * and must outlive the returned list.  Diagnostics are posted to mq.
 * The returned token list and all its buffers are talloc children of
 * tcontext.
 */
struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(tcontext, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	darray(struct token) *arr = talloc_darray(tl);
	int only_pound_include = 0;

	//build the shared operator/keyword dictionary on first use
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}

	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;

	//lex the backslash-spliced text, not the raw original
	s = tl->txt;
	e = s + tl->txt_size;

	//every line (including the first) begins with a TOK_STARTLINE marker
	darray_appends_t(*arr, struct token, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );

	while (s<e) {
		const char *orig = s;
		char c = *s++;
		int added_something = 1;

		if (cstray(c)) {
			//coalesce a run of stray characters into one TOK_STRAY
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);

			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);

		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;

			add(.type = TOK_WHITE);
			orig = s;

			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);

			only_pound_include = 0;

		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;

		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//numeric constant (including ".5"-style floats)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			darray_append(*arr, tok);

		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				add(.type = type,
					{.include = include});

			} else if (c=='\'' || c=='\"') { //character or string literal
				darray_char *string = talloc_darray(tl);
				s = read_cstring(string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
					{.string = string});

				if (c=='\'' && string->size==0) {
					tok_msg_error(empty_char_constant, orig,
						"Empty character constant");
				}

			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							//ran out of text before "*/"
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;

			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;

				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//a dictionary hit only counts as a keyword if it
					// spans the entire identifier
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//scan back: ignored*, '#', ignored*, TOK_STARTLINE
							struct token *ts = arr->item;
							struct token *tp = ts+arr->size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									//skip the reset below so the
									// include-string case stays armed
									continue;
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
						{.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		if (added_something)
			only_pound_include = 0;
	}

	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/

	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}

	finalize(tl, arr->item, arr->item+arr->size);

	return tl;
}
  503. size_t token_list_count(const struct token_list *tl) {
  504. size_t ret = 0;
  505. const struct token *i;
  506. for (i=tl->first; i; i=i->next)
  507. ret++;
  508. return ret;
  509. }
  510. static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
  511. const char * const *orig = lines;
  512. const char * const *orig_e = lines+line_count;
  513. while (line_count > 1) {
  514. size_t middle = line_count>>1;
  515. if (ptr < lines[middle])
  516. line_count = middle;
  517. else {
  518. lines += middle;
  519. line_count -= middle;
  520. }
  521. }
  522. //select the *last* of equivalent lines
  523. while (lines+1 < orig_e && lines[0]==lines[1])
  524. lines++;
  525. // (don't) select the *first* of equivalent lines
  526. //while (lines>orig && lines<orig_e && lines[-1]==lines[0])
  527. // lines--;
  528. return lines - orig;
  529. }
  530. int tok_point_lookup(struct tok_point *out, const char *ptr,
  531. const struct token_list *tl) {
  532. size_t line_count = tl->olines_size;
  533. memset(out, 0, sizeof(*out));
  534. if (!tl)
  535. return 0;
  536. if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
  537. out->txt = ptr;
  538. out->line = find_line(ptr, tl->tlines, line_count);
  539. if (out->line < line_count) {
  540. out->col = ptr - tl->tlines[out->line];
  541. out->orig = tl->olines[out->line] + out->col;
  542. } else {
  543. out->col = 0;
  544. out->orig = tl->orig + tl->orig_size;
  545. }
  546. return 1;
  547. } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
  548. out->orig = ptr;
  549. out->line = find_line(ptr, tl->olines, line_count);
  550. if (out->line < line_count) {
  551. const char *tline_start = tl->tlines[out->line];
  552. const char *tline_end = out->line+1 < line_count ?
  553. tl->tlines[out->line+1] :
  554. tl->txt + tl->txt_size;
  555. out->col = ptr - tl->olines[out->line];
  556. out->txt = tline_start + out->col;
  557. if (out->txt > tline_end)
  558. out->txt = tline_end;
  559. } else {
  560. out->col = 0;
  561. out->txt = tl->txt + tl->txt_size;
  562. }
  563. return 1;
  564. } else {
  565. return 0;
  566. }
  567. }
  568. static char *escape_string(darray_char *buf, const char *str, size_t size) {
  569. const char *s = str, *e = s+size;
  570. darray_from_lit(*buf, "");
  571. for (;s<e;s++) {
  572. char buffer[8];
  573. const char *esc = buffer;
  574. unsigned char c = (unsigned char)*s;
  575. if (ccontrol(c))
  576. sprintf(buffer, "\\x%02X", c);
  577. else switch(c) {
  578. case '\t': esc = "\\t"; break;
  579. case '\n': esc = "\\n"; break;
  580. case '\v': esc = "\\v"; break;
  581. case '\f': esc = "\\f"; break;
  582. case '\r': esc = "\\r"; break;
  583. case '"': esc = "\\\""; break;
  584. case '\\': esc = "\\\\"; break;
  585. default:
  586. buffer[0] = c;
  587. buffer[1] = 0;
  588. }
  589. darray_append_string(*buf, esc);
  590. }
  591. return buf->item;
  592. }
//Verify that txt and orig contain identical text except that orig may
// additionally contain backslash-newline line splices (optionally with
// trailing spaces before the newline).  Returns 1 on match, 0 otherwise.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;

	do {
		const char *ob = os; //start of next backslash break
		const char *obe; //end of next backslash break
		size_t size; //amount of text to compare for this round

		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//skip the second half of a two-character newline
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		size = ob-os;

		//compare the text up to (but not including) the splice
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe; //resume after the splice
	} while (ts<te);

	//both cursors must be exhausted simultaneously
	if (ts != te || os != oe)
		return 0;

	return 1;
}
  622. static int is_backslash_break(const char **end, const char *s, const char *e) {
  623. if (s<e && *s == '\\') {
  624. s++;
  625. while (s<e && cspace(*s)) s++;
  626. if (s<e && creturn(*s)) {
  627. s++;
  628. if (s<e && *s=='\n'+'\r'-s[-1])
  629. s++;
  630. *end = s;
  631. return 1;
  632. }
  633. return 0;
  634. }
  635. return 0;
  636. }
//Report a failure to err and bail out of the enclosing check function.
#define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)

//tests that should pass on an untainted token list out of the tokenize() function
static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	const char *txt=tl->txt, *orig=tl->orig;
	const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;

	//first/last must bound a contiguous array of struct token
	if ((char*)first > (char*)last ||
		(size_t)((char*)last - (char*)first) % sizeof(struct token))
		failed("Token list pointers don't look right");

	//token list should not end with TOK_STARTLINE unless
	// the document is empty
	if (last!=first && last->type==TOK_STARTLINE)
		return 0;

	for (i=first; i; i=i->next) {
		//Verify list links
		if (i != first && i->prev != i-1)
			failed("list.prev is incorrect");
		if (i != last && i->next != i+1)
			failed("list.next is incorrect");

		//Make sure txt segments fill the entire tl->txt
		if (i->txt != txt)
			failed("txt does not fill the token list");
		txt += i->txt_size;
		if (txt > txt_e)
			failed("txt is out of bounds");

		//Make sure orig segments fill the entire tl->orig
		if (i->orig != orig)
			failed("orig does not fill the token list");
		orig += i->orig_size;
		if (orig > orig_e)
			failed("orig is out of bounds");
	}

	//both buffers must be exactly covered by the tokens
	if (txt != txt_e)
		return 0;
	if (orig != orig_e)
		return 0;

	return 1;
}
//Thorough consistency check of a token list: list structure, per-token
// line/col/txt/orig agreement, tok_point_lookup correctness for every
// position, and olines/tlines integrity.  Returns 1 if sane, 0 (with a
// message on err for most failures) otherwise.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	//NOTE(review): `initial` is always 1 here — presumably meant to be
	// cleared for lists modified after tokenize(); confirm intent
	int initial = 1;

	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");

	if (first->type!=TOK_STARTLINE ||
		first->txt!=tl->txt || first->txt_size!=0 ||
		first->orig!=tl->orig || first->orig_size!=0 ||
		first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");

	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");

	for (i=first; i; i=i->next) {
		//Verify line,col
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");

		//Make sure tokens have proper sizes
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");

		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}

		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			darray_char buf = darray_new();
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			darray_free(buf);
			return 0;
		}

		//Make sure tok_point_lookup returns correct point
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;

			//check one position: tok_point_lookup(ptr) must report this
			// token's txt/orig/line/col; on the initial list a lookup
			// failure is itself an error
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
							(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
							(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)

			for (;;) {
				//positions inside a backslash splice exist only in orig
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//newline present in both buffers
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						//ordinary character present in both buffers
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}

			#undef check
		}
	};

	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;

		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;

		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;

			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];

			//advance s past this original line (and its newline)
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			oline_size = s-line_start;

			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;

			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;

			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;

			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}

			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}

	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed. Has the list been modified after it was returned from tokenize() ?");

	return 1;
}

#undef failed
  827. static char *sprint_token_flags(char buf[3], struct token_flags flags) {
  828. buf[0] = flags.pp ? 'p' : '-';
  829. buf[1] = flags.pp_directive ? 'D' : '-';
  830. buf[2] = 0;
  831. return buf;
  832. }
//Dump every token to f, one per line: index, type name, flags, txt
// (and, when enabled below, orig).  Intended for debugging.
void token_list_dump(const struct token_list *tl, FILE *f) {
	struct token *tok;
	darray_char buf = darray_new();
	size_t i = 0;
	char buf2[8];
	//indexed by token type; order must match the TOK_* enum
	const char *token_type_str[] = {
		"TOK_INTEGER ",
		"TOK_FLOATING ",
		"TOK_OPERATOR ",
		"TOK_KEYWORD ",
		"TOK_IDENTIFIER ",
		"TOK_CHAR ",
		"TOK_STRING ",
		"TOK_LEADING_POUND",
		"TOK_STRING_IQUOTE",
		"TOK_STRING_IANGLE",
		"TOK_CCOMMENT ",
		"TOK_CPPCOMMENT ",
		"TOK_WHITE ",
		"TOK_STARTLINE ",
		"TOK_STRAY "
	};

	for (tok=tl->first; tok; tok=tok->next) {
		fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
			token_type_str[tok->type],
			sprint_token_flags(buf2, tok->flags),
			escape_string(&buf, tok->txt, tok->txt_size));
		#if 1 //print tok->orig
		fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
		#else
		fprintf(f, "\n");
		#endif
	}

	darray_free(buf);
}
  868. void tok_message_print(struct tok_message *m, struct token_list *tl) {
  869. struct tok_point pt;
  870. int resolved = tok_point_lookup(&pt, m->location, tl);
  871. if (tl->filename) {
  872. printf("%s:%s", tl->filename, resolved ? "" : " ");
  873. }
  874. if (resolved) {
  875. printf("%zu:%zu %s: %s\n",
  876. pt.line+1, pt.col+1,
  877. m->level==TM_DEBUG ? "debug" :
  878. m->level==TM_INFO ? "info" :
  879. m->level==TM_WARN ? "warning" :
  880. m->level==TM_ERROR ? "error" :
  881. m->level==TM_BUG ? "BUG" :
  882. "???",
  883. m->message);
  884. } else {
  885. printf("%s: %s\n",
  886. m->level==TM_DEBUG ? "debug" :
  887. m->level==TM_INFO ? "info" :
  888. m->level==TM_WARN ? "warning" :
  889. m->level==TM_ERROR ? "error" :
  890. m->level==TM_BUG ? "BUG" :
  891. "???",
  892. m->message);
  893. }
  894. }
  895. void tok_message_dump(struct tok_message *m) {
  896. printf("%s: %s: %s\n",
  897. m->level==TM_DEBUG ? "debug" :
  898. m->level==TM_INFO ? "info" :
  899. m->level==TM_WARN ? "warning" :
  900. m->level==TM_ERROR ? "error" :
  901. m->level==TM_BUG ? "BUG" :
  902. "???", m->path, m->message);
  903. }
/*
 * Append a printf-formatted message to the queue.  No-op if mq is NULL.
 * path names the message site (typically built with MESSAGE_PATH); loc
 * points into the tokenized text and is resolved to line/col later.
 * The formatted string is talloc'd against the queue's buffer.
 */
void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
	const char *path, const char *loc, const char *fmt, ...) {
	struct tok_message msg = {.level=level, .path=path, .location=loc};
	va_list ap;

	if (!mq)
		return;

	va_start(ap, fmt);
	msg.message = talloc_vasprintf(mq->item, fmt, ap);
	va_end(ap);

	enqueue(*mq, msg);
}
  915. void tok_message_queue_dump(const tok_message_queue *mq) {
  916. size_t i;
  917. for (i=0; i<queue_count(*mq); i++)
  918. tok_message_dump(&queue_item(*mq, i));
  919. }
  920. #undef add
  921. #undef cstray
  922. #undef cident