load.c 20 KB


  1. /*
  2. * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
  3. *
  4. * Jansson is free software; you can redistribute it and/or modify
  5. * it under the terms of the MIT license. See LICENSE for details.
  6. */
  7. #define _GNU_SOURCE
  8. #include <ctype.h>
  9. #include <errno.h>
  10. #include <limits.h>
  11. #include <stdio.h>
  12. #include <stdlib.h>
  13. #include <string.h>
  14. #include <stdarg.h>
  15. #include <assert.h>
  16. #include <jansson.h>
  17. #include "jansson_private.h"
  18. #include "strbuffer.h"
  19. #include "utf.h"
  20. #define TOKEN_INVALID -1
  21. #define TOKEN_EOF 0
  22. #define TOKEN_STRING 256
  23. #define TOKEN_INTEGER 257
  24. #define TOKEN_REAL 258
  25. #define TOKEN_TRUE 259
  26. #define TOKEN_FALSE 260
  27. #define TOKEN_NULL 261
/* Input callback: read one byte from the stream and return it, or
   return EOF on end of file.  `data` is the opaque pointer stored in
   stream_t.data and is handed back unchanged on every call. */
typedef int (*get_func)(void *data);

/* Input callback: return non-zero if end of file has been reached.
   `data` is the same opaque pointer that is passed to get_func. */
typedef int (*eof_func)(void *data);
/* Byte stream that hands out validated UTF-8 input one byte at a time. */
typedef struct {
    get_func get;       /* reads the next raw byte from the source */
    eof_func eof;       /* tells whether the source is at end of file */
    void *data;         /* opaque argument passed to get() and eof() */
    int stream_pos;     /* number of bytes accepted from the source so far */
    char buffer[5];     /* current UTF-8 sequence (up to 4 bytes), NUL-terminated */
    int buffer_pos;     /* index in buffer of the next byte to hand out */
} stream_t;
/* Lexer state: the input stream plus the most recently scanned token. */
typedef struct {
    stream_t stream;            /* input source */
    strbuffer_t saved_text;     /* raw text of the current token; cleared by lex_scan() */
    int token;                  /* TOKEN_* constant or a punctuation character */
    int line, column;           /* line is bumped on every '\n' skipped by lex_scan();
                                   column is not used by the code visible in this file */
    union {
        char *string;           /* valid when token == TOKEN_STRING; heap-allocated */
        int integer;            /* valid when token == TOKEN_INTEGER */
        double real;            /* valid when token == TOKEN_REAL */
    } value;
} lex_t;
  51. /*** error reporting ***/
  52. static void error_init(json_error_t *error)
  53. {
  54. if(error)
  55. {
  56. error->text[0] = '\0';
  57. error->line = -1;
  58. }
  59. }
  60. static void error_set(json_error_t *error, const lex_t *lex,
  61. const char *msg, ...)
  62. {
  63. va_list ap;
  64. char text[JSON_ERROR_TEXT_LENGTH];
  65. if(!error || error->text[0] != '\0') {
  66. /* error already set */
  67. return;
  68. }
  69. va_start(ap, msg);
  70. vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap);
  71. va_end(ap);
  72. if(lex)
  73. {
  74. const char *saved_text = strbuffer_value(&lex->saved_text);
  75. error->line = lex->line;
  76. if(saved_text && saved_text[0])
  77. {
  78. if(lex->saved_text.length <= 20) {
  79. snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
  80. "%s near '%s'", text, saved_text);
  81. }
  82. else
  83. snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
  84. }
  85. else
  86. {
  87. snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
  88. "%s near end of file", text);
  89. }
  90. }
  91. else
  92. {
  93. error->line = -1;
  94. snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
  95. }
  96. }
  97. /*** lexical analyzer ***/
  98. static void
  99. stream_init(stream_t *stream, get_func get, eof_func eof, void *data)
  100. {
  101. stream->get = get;
  102. stream->eof = eof;
  103. stream->data = data;
  104. stream->stream_pos = 0;
  105. stream->buffer[0] = '\0';
  106. stream->buffer_pos = 0;
  107. }
  108. static char stream_get(stream_t *stream, json_error_t *error)
  109. {
  110. char c;
  111. if(!stream->buffer[stream->buffer_pos])
  112. {
  113. stream->buffer[0] = stream->get(stream->data);
  114. stream->buffer_pos = 0;
  115. c = stream->buffer[0];
  116. if((unsigned char)c >= 0x80 && c != (char)EOF)
  117. {
  118. /* multi-byte UTF-8 sequence */
  119. int i, count;
  120. count = utf8_check_first(c);
  121. if(!count)
  122. goto out;
  123. assert(count >= 2);
  124. for(i = 1; i < count; i++)
  125. stream->buffer[i] = stream->get(stream->data);
  126. if(!utf8_check_full(stream->buffer, count, NULL))
  127. goto out;
  128. stream->stream_pos += count;
  129. stream->buffer[count] = '\0';
  130. }
  131. else {
  132. stream->buffer[1] = '\0';
  133. stream->stream_pos++;
  134. }
  135. }
  136. return stream->buffer[stream->buffer_pos++];
  137. out:
  138. error_set(error, NULL, "unable to decode byte 0x%x at position %d",
  139. (unsigned char)c, stream->stream_pos);
  140. stream->buffer[0] = EOF;
  141. stream->buffer[1] = '\0';
  142. stream->buffer_pos = 1;
  143. return EOF;
  144. }
  145. static void stream_unget(stream_t *stream, char c)
  146. {
  147. assert(stream->buffer_pos > 0);
  148. stream->buffer_pos--;
  149. assert(stream->buffer[stream->buffer_pos] == c);
  150. }
  151. static int lex_get(lex_t *lex, json_error_t *error)
  152. {
  153. return stream_get(&lex->stream, error);
  154. }
  155. static int lex_eof(lex_t *lex)
  156. {
  157. return lex->stream.eof(lex->stream.data);
  158. }
  159. static void lex_save(lex_t *lex, char c)
  160. {
  161. strbuffer_append_byte(&lex->saved_text, c);
  162. }
  163. static int lex_get_save(lex_t *lex, json_error_t *error)
  164. {
  165. char c = stream_get(&lex->stream, error);
  166. lex_save(lex, c);
  167. return c;
  168. }
  169. static void lex_unget_unsave(lex_t *lex, char c)
  170. {
  171. char d;
  172. stream_unget(&lex->stream, c);
  173. d = strbuffer_pop(&lex->saved_text);
  174. assert(c == d);
  175. }
  176. static void lex_save_cached(lex_t *lex)
  177. {
  178. while(lex->stream.buffer[lex->stream.buffer_pos] != '\0')
  179. {
  180. lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]);
  181. lex->stream.buffer_pos++;
  182. }
  183. }
  184. /* assumes that str points to 'u' plus at least 4 valid hex digits */
  185. static int32_t decode_unicode_escape(const char *str)
  186. {
  187. int i;
  188. int32_t value = 0;
  189. assert(str[0] == 'u');
  190. for(i = 1; i <= 4; i++) {
  191. char c = str[i];
  192. value <<= 4;
  193. if(isdigit(c))
  194. value += c - '0';
  195. else if(islower(c))
  196. value += c - 'a' + 10;
  197. else if(isupper(c))
  198. value += c - 'A' + 10;
  199. else
  200. assert(0);
  201. }
  202. return value;
  203. }
  204. static void lex_scan_string(lex_t *lex, json_error_t *error)
  205. {
  206. char c;
  207. const char *p;
  208. char *t;
  209. int i;
  210. lex->value.string = NULL;
  211. lex->token = TOKEN_INVALID;
  212. c = lex_get_save(lex, error);
  213. while(c != '"') {
  214. if(c == (char)EOF) {
  215. lex_unget_unsave(lex, c);
  216. if(lex_eof(lex))
  217. error_set(error, lex, "premature end of input");
  218. goto out;
  219. }
  220. else if((unsigned char)c <= 0x1F) {
  221. /* control character */
  222. lex_unget_unsave(lex, c);
  223. if(c == '\n')
  224. error_set(error, lex, "unexpected newline", c);
  225. else
  226. error_set(error, lex, "control character 0x%x", c);
  227. goto out;
  228. }
  229. else if(c == '\\') {
  230. c = lex_get_save(lex, error);
  231. if(c == 'u') {
  232. c = lex_get_save(lex, error);
  233. for(i = 0; i < 4; i++) {
  234. if(!isxdigit(c)) {
  235. lex_unget_unsave(lex, c);
  236. error_set(error, lex, "invalid escape");
  237. goto out;
  238. }
  239. c = lex_get_save(lex, error);
  240. }
  241. }
  242. else if(c == '"' || c == '\\' || c == '/' || c == 'b' ||
  243. c == 'f' || c == 'n' || c == 'r' || c == 't')
  244. c = lex_get_save(lex, error);
  245. else {
  246. lex_unget_unsave(lex, c);
  247. error_set(error, lex, "invalid escape");
  248. goto out;
  249. }
  250. }
  251. else
  252. c = lex_get_save(lex, error);
  253. }
  254. /* the actual value is at most of the same length as the source
  255. string, because:
  256. - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte
  257. - a single \uXXXX escape (length 6) is converted to at most 3 bytes
  258. - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair
  259. are converted to 4 bytes
  260. */
  261. lex->value.string = malloc(lex->saved_text.length + 1);
  262. if(!lex->value.string) {
  263. /* this is not very nice, since TOKEN_INVALID is returned */
  264. goto out;
  265. }
  266. /* the target */
  267. t = lex->value.string;
  268. /* + 1 to skip the " */
  269. p = strbuffer_value(&lex->saved_text) + 1;
  270. while(*p != '"') {
  271. if(*p == '\\') {
  272. p++;
  273. if(*p == 'u') {
  274. char buffer[4];
  275. int length;
  276. int32_t value;
  277. value = decode_unicode_escape(p);
  278. p += 5;
  279. if(0xD800 <= value && value <= 0xDBFF) {
  280. /* surrogate pair */
  281. if(*p == '\\' && *(p + 1) == 'u') {
  282. int32_t value2 = decode_unicode_escape(++p);
  283. p += 5;
  284. if(0xDC00 <= value2 && value2 <= 0xDFFF) {
  285. /* valid second surrogate */
  286. value =
  287. ((value - 0xD800) << 10) +
  288. (value2 - 0xDC00) +
  289. 0x10000;
  290. }
  291. else {
  292. /* invalid second surrogate */
  293. error_set(error, lex,
  294. "invalid Unicode '\\u%04X\\u%04X'",
  295. value, value2);
  296. goto out;
  297. }
  298. }
  299. else {
  300. /* no second surrogate */
  301. error_set(error, lex, "invalid Unicode '\\u%04X'",
  302. value);
  303. goto out;
  304. }
  305. }
  306. else if(0xDC00 <= value && value <= 0xDFFF) {
  307. error_set(error, lex, "invalid Unicode '\\u%04X'", value);
  308. goto out;
  309. }
  310. else if(value == 0)
  311. {
  312. error_set(error, lex, "\\u0000 is not allowed");
  313. goto out;
  314. }
  315. if(utf8_encode(value, buffer, &length))
  316. assert(0);
  317. memcpy(t, buffer, length);
  318. t += length;
  319. }
  320. else {
  321. switch(*p) {
  322. case '"': case '\\': case '/':
  323. *t = *p; break;
  324. case 'b': *t = '\b'; break;
  325. case 'f': *t = '\f'; break;
  326. case 'n': *t = '\n'; break;
  327. case 'r': *t = '\r'; break;
  328. case 't': *t = '\t'; break;
  329. default: assert(0);
  330. }
  331. t++;
  332. p++;
  333. }
  334. }
  335. else
  336. *(t++) = *(p++);
  337. }
  338. *t = '\0';
  339. lex->token = TOKEN_STRING;
  340. return;
  341. out:
  342. free(lex->value.string);
  343. }
  344. static int lex_scan_number(lex_t *lex, char c, json_error_t *error)
  345. {
  346. const char *saved_text;
  347. char *end;
  348. double value;
  349. lex->token = TOKEN_INVALID;
  350. if(c == '-')
  351. c = lex_get_save(lex, error);
  352. if(c == '0') {
  353. c = lex_get_save(lex, error);
  354. if(isdigit(c)) {
  355. lex_unget_unsave(lex, c);
  356. goto out;
  357. }
  358. }
  359. else if(isdigit(c)) {
  360. c = lex_get_save(lex, error);
  361. while(isdigit(c))
  362. c = lex_get_save(lex, error);
  363. }
  364. else {
  365. lex_unget_unsave(lex, c);
  366. goto out;
  367. }
  368. if(c != '.' && c != 'E' && c != 'e') {
  369. long value;
  370. lex_unget_unsave(lex, c);
  371. saved_text = strbuffer_value(&lex->saved_text);
  372. value = strtol(saved_text, &end, 10);
  373. assert(end == saved_text + lex->saved_text.length);
  374. if((value == LONG_MAX && errno == ERANGE) || value > INT_MAX) {
  375. error_set(error, lex, "too big integer");
  376. goto out;
  377. }
  378. else if((value == LONG_MIN && errno == ERANGE) || value < INT_MIN) {
  379. error_set(error, lex, "too big negative integer");
  380. goto out;
  381. }
  382. lex->token = TOKEN_INTEGER;
  383. lex->value.integer = (int)value;
  384. return 0;
  385. }
  386. if(c == '.') {
  387. c = lex_get(lex, error);
  388. if(!isdigit(c))
  389. goto out;
  390. lex_save(lex, c);
  391. c = lex_get_save(lex, error);
  392. while(isdigit(c))
  393. c = lex_get_save(lex, error);
  394. }
  395. if(c == 'E' || c == 'e') {
  396. c = lex_get_save(lex, error);
  397. if(c == '+' || c == '-')
  398. c = lex_get_save(lex, error);
  399. if(!isdigit(c)) {
  400. lex_unget_unsave(lex, c);
  401. goto out;
  402. }
  403. c = lex_get_save(lex, error);
  404. while(isdigit(c))
  405. c = lex_get_save(lex, error);
  406. }
  407. lex_unget_unsave(lex, c);
  408. saved_text = strbuffer_value(&lex->saved_text);
  409. value = strtod(saved_text, &end);
  410. assert(end == saved_text + lex->saved_text.length);
  411. if(errno == ERANGE && value != 0) {
  412. error_set(error, lex, "real number overflow");
  413. goto out;
  414. }
  415. lex->token = TOKEN_REAL;
  416. lex->value.real = value;
  417. return 0;
  418. out:
  419. return -1;
  420. }
  421. static int lex_scan(lex_t *lex, json_error_t *error)
  422. {
  423. char c;
  424. strbuffer_clear(&lex->saved_text);
  425. if(lex->token == TOKEN_STRING) {
  426. free(lex->value.string);
  427. lex->value.string = NULL;
  428. }
  429. c = lex_get(lex, error);
  430. while(c == ' ' || c == '\t' || c == '\n' || c == '\r')
  431. {
  432. if(c == '\n')
  433. lex->line++;
  434. c = lex_get(lex, error);
  435. }
  436. if(c == (char)EOF) {
  437. if(lex_eof(lex))
  438. lex->token = TOKEN_EOF;
  439. else
  440. lex->token = TOKEN_INVALID;
  441. goto out;
  442. }
  443. lex_save(lex, c);
  444. if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',')
  445. lex->token = c;
  446. else if(c == '"')
  447. lex_scan_string(lex, error);
  448. else if(isdigit(c) || c == '-') {
  449. if(lex_scan_number(lex, c, error))
  450. goto out;
  451. }
  452. else if(isupper(c) || islower(c)) {
  453. /* eat up the whole identifier for clearer error messages */
  454. const char *saved_text;
  455. c = lex_get_save(lex, error);
  456. while(isupper(c) || islower(c))
  457. c = lex_get_save(lex, error);
  458. lex_unget_unsave(lex, c);
  459. saved_text = strbuffer_value(&lex->saved_text);
  460. if(strcmp(saved_text, "true") == 0)
  461. lex->token = TOKEN_TRUE;
  462. else if(strcmp(saved_text, "false") == 0)
  463. lex->token = TOKEN_FALSE;
  464. else if(strcmp(saved_text, "null") == 0)
  465. lex->token = TOKEN_NULL;
  466. else
  467. lex->token = TOKEN_INVALID;
  468. }
  469. else {
  470. /* save the rest of the input UTF-8 sequence to get an error
  471. message of valid UTF-8 */
  472. lex_save_cached(lex);
  473. lex->token = TOKEN_INVALID;
  474. }
  475. out:
  476. return lex->token;
  477. }
  478. static char *lex_steal_string(lex_t *lex)
  479. {
  480. char *result = NULL;
  481. if(lex->token == TOKEN_STRING)
  482. {
  483. result = lex->value.string;
  484. lex->value.string = NULL;
  485. }
  486. return result;
  487. }
  488. static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data)
  489. {
  490. stream_init(&lex->stream, get, eof, data);
  491. if(strbuffer_init(&lex->saved_text))
  492. return -1;
  493. lex->token = TOKEN_INVALID;
  494. lex->line = 1;
  495. return 0;
  496. }
  497. static void lex_close(lex_t *lex)
  498. {
  499. if(lex->token == TOKEN_STRING)
  500. free(lex->value.string);
  501. strbuffer_close(&lex->saved_text);
  502. }
  503. /*** parser ***/
/* Forward declaration: parse_object() and parse_array() call
   parse_value(), which is defined after them. */
static json_t *parse_value(lex_t *lex, json_error_t *error);
  505. static json_t *parse_object(lex_t *lex, json_error_t *error)
  506. {
  507. json_t *object = json_object();
  508. if(!object)
  509. return NULL;
  510. lex_scan(lex, error);
  511. if(lex->token == '}')
  512. return object;
  513. while(1) {
  514. char *key;
  515. json_t *value;
  516. if(lex->token != TOKEN_STRING) {
  517. error_set(error, lex, "string or '}' expected");
  518. goto error;
  519. }
  520. key = lex_steal_string(lex);
  521. if(!key)
  522. return NULL;
  523. lex_scan(lex, error);
  524. if(lex->token != ':') {
  525. free(key);
  526. error_set(error, lex, "':' expected");
  527. goto error;
  528. }
  529. lex_scan(lex, error);
  530. value = parse_value(lex, error);
  531. if(!value) {
  532. free(key);
  533. goto error;
  534. }
  535. if(json_object_set_nocheck(object, key, value)) {
  536. free(key);
  537. json_decref(value);
  538. goto error;
  539. }
  540. json_decref(value);
  541. free(key);
  542. lex_scan(lex, error);
  543. if(lex->token != ',')
  544. break;
  545. lex_scan(lex, error);
  546. }
  547. if(lex->token != '}') {
  548. error_set(error, lex, "'}' expected");
  549. goto error;
  550. }
  551. return object;
  552. error:
  553. json_decref(object);
  554. return NULL;
  555. }
  556. static json_t *parse_array(lex_t *lex, json_error_t *error)
  557. {
  558. json_t *array = json_array();
  559. if(!array)
  560. return NULL;
  561. lex_scan(lex, error);
  562. if(lex->token == ']')
  563. return array;
  564. while(lex->token) {
  565. json_t *elem = parse_value(lex, error);
  566. if(!elem)
  567. goto error;
  568. if(json_array_append(array, elem)) {
  569. json_decref(elem);
  570. goto error;
  571. }
  572. json_decref(elem);
  573. lex_scan(lex, error);
  574. if(lex->token != ',')
  575. break;
  576. lex_scan(lex, error);
  577. }
  578. if(lex->token != ']') {
  579. error_set(error, lex, "']' expected");
  580. goto error;
  581. }
  582. return array;
  583. error:
  584. json_decref(array);
  585. return NULL;
  586. }
  587. static json_t *parse_value(lex_t *lex, json_error_t *error)
  588. {
  589. json_t *json;
  590. switch(lex->token) {
  591. case TOKEN_STRING: {
  592. json = json_string_nocheck(lex->value.string);
  593. break;
  594. }
  595. case TOKEN_INTEGER: {
  596. json = json_integer(lex->value.integer);
  597. break;
  598. }
  599. case TOKEN_REAL: {
  600. json = json_real(lex->value.real);
  601. break;
  602. }
  603. case TOKEN_TRUE:
  604. json = json_true();
  605. break;
  606. case TOKEN_FALSE:
  607. json = json_false();
  608. break;
  609. case TOKEN_NULL:
  610. json = json_null();
  611. break;
  612. case '{':
  613. json = parse_object(lex, error);
  614. break;
  615. case '[':
  616. json = parse_array(lex, error);
  617. break;
  618. case TOKEN_INVALID:
  619. error_set(error, lex, "invalid token");
  620. return NULL;
  621. default:
  622. error_set(error, lex, "unexpected token");
  623. return NULL;
  624. }
  625. if(!json)
  626. return NULL;
  627. return json;
  628. }
  629. static json_t *parse_json(lex_t *lex, json_error_t *error)
  630. {
  631. error_init(error);
  632. lex_scan(lex, error);
  633. if(lex->token != '[' && lex->token != '{') {
  634. error_set(error, lex, "'[' or '{' expected");
  635. return NULL;
  636. }
  637. return parse_value(lex, error);
  638. }
  639. typedef struct
  640. {
  641. const char *data;
  642. int pos;
  643. } string_data_t;
  644. static int string_get(void *data)
  645. {
  646. char c;
  647. string_data_t *stream = (string_data_t *)data;
  648. c = stream->data[stream->pos];
  649. if(c == '\0')
  650. return EOF;
  651. else
  652. {
  653. stream->pos++;
  654. return c;
  655. }
  656. }
  657. static int string_eof(void *data)
  658. {
  659. string_data_t *stream = (string_data_t *)data;
  660. return (stream->data[stream->pos] == '\0');
  661. }
  662. json_t *json_loads(const char *string, json_error_t *error)
  663. {
  664. lex_t lex;
  665. json_t *result;
  666. string_data_t stream_data = {
  667. .data = string,
  668. .pos = 0
  669. };
  670. if(lex_init(&lex, string_get, string_eof, (void *)&stream_data))
  671. return NULL;
  672. result = parse_json(&lex, error);
  673. if(!result)
  674. goto out;
  675. lex_scan(&lex, error);
  676. if(lex.token != TOKEN_EOF) {
  677. error_set(error, &lex, "end of file expected");
  678. json_decref(result);
  679. result = NULL;
  680. }
  681. out:
  682. lex_close(&lex);
  683. return result;
  684. }
  685. json_t *json_loadf(FILE *input, json_error_t *error)
  686. {
  687. lex_t lex;
  688. json_t *result;
  689. if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input))
  690. return NULL;
  691. result = parse_json(&lex, error);
  692. if(!result)
  693. goto out;
  694. lex_scan(&lex, error);
  695. if(lex.token != TOKEN_EOF) {
  696. error_set(error, &lex, "end of file expected");
  697. json_decref(result);
  698. result = NULL;
  699. }
  700. out:
  701. lex_close(&lex);
  702. return result;
  703. }
  704. json_t *json_load_file(const char *path, json_error_t *error)
  705. {
  706. json_t *result;
  707. FILE *fp;
  708. error_init(error);
  709. fp = fopen(path, "r");
  710. if(!fp)
  711. {
  712. error_set(error, NULL, "unable to open %s: %s",
  713. path, strerror(errno));
  714. return NULL;
  715. }
  716. result = json_loadf(fp, error);
  717. fclose(fp);
  718. return result;
  719. }