From 05d0903db7a5822d046c32c8f294d71923ccbc40 Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Tue, 23 May 2017 07:25:49 -0400 Subject: [PATCH 31/31] FIXME: json cleanups --- gcc/json.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++------------- gcc/json.h | 3 +- 2 files changed, 107 insertions(+), 30 deletions(-) diff --git a/gcc/json.c b/gcc/json.c index c9b3d97..4d3e8fa 100644 --- a/gcc/json.c +++ b/gcc/json.c @@ -28,7 +28,8 @@ using namespace json; /* class json::value. */ -/* FIXME. */ +/* Generate a char * for this json::value tree. + The returned value must be freed by the caller. */ char * value::to_str () const @@ -38,7 +39,9 @@ value::to_str () const return xstrdup (pp_formatted_text (&pp)); } -/* FIXME. */ +/* Dump this json::value tree to OUTF. + No formatting is done. There are no guarantees about the order + in which the key/value pairs of json::objects are printed. */ void value::dump (FILE *outf) const @@ -49,7 +52,8 @@ value::dump (FILE *outf) const pp_flush (&pp); } -/* FIXME. */ +/* If this json::value is a json::object, return it, + otherwise return NULL. */ const object * value::as_object () const @@ -59,7 +63,8 @@ value::as_object () const return static_cast (this); } -/* FIXME. */ +/* If this json::value is a json::array, return it, + otherwise return NULL. */ const array * value::as_array () const @@ -69,7 +74,8 @@ value::as_array () const return static_cast (this); } -/* FIXME. */ +/* If this json::value is a json::number, return it, + otherwise return NULL. */ const number * value::as_number () const @@ -79,7 +85,8 @@ value::as_number () const return static_cast (this); } -/* FIXME. */ +/* If this json::value is a json::string, return it, + otherwise return NULL. */ const string * value::as_string () const @@ -90,7 +97,10 @@ value::as_string () const } /* Attempt to get lookup the value of a key/value pair from this value - as if it is an object. + as if this value were an object. 
+ + To succeed, THIS must be a json::object, and it must have a key named + NAME. On success, return true and write the value to OUT_VALUE. On failure, return false and write an error message to OUT_ERR @@ -155,7 +165,8 @@ value::get_string_by_key (const char *name, const char *&out_value, return true; } -/* class json::object, a subclass of json::value. */ +/* class json::object, a subclass of json::value, representing + an unordered collection of key/value pairs. */ /* json:object's dtor. */ @@ -222,7 +233,8 @@ object::set (const char *key, value *v) m_map.put (xstrdup (key), v); } -/* class json::array, a subclass of json::value. */ +/* class json::array, a subclass of json::value, representing + an ordered collection of values. */ /* json::array's dtor. */ @@ -264,7 +276,7 @@ array::clone () const return other; } -/* class json::number, a subclass of json::value. */ +/* class json::number, a subclass of json::value, wrapping a double. */ /* Implementation of json::value::print for json::number. */ @@ -348,16 +360,17 @@ literal::clone () const return new literal (m_kind); } + /* Declarations relating to parsing JSON, all within an anonymous namespace. */ namespace { -/* FIXME. */ +/* A typedef representing a single unicode character. */ typedef unsigned unichar; -/* FIXME. */ +/* An enum for discriminating different kinds of JSON token. */ enum token_id { @@ -382,7 +395,7 @@ enum token_id TOK_NUMBER }; -/* FIXME. */ +/* Human-readable descriptions of enum token_id. */ static const char *token_id_name[] = { "error", @@ -400,15 +413,23 @@ static const char *token_id_name[] = { "number" }; -/* FIXME. */ +/* Tokens within the JSON lexer. */ struct token { + /* The kind of token. */ enum token_id id; + + /* The location of this token within the unicode + character stream. */ int index; + union { + /* Value for TOK_STRING. */ char *string; + + /* Value for TOK_NUMBER. */ double number; } u; }; @@ -468,14 +489,14 @@ class parser /* Parser implementation. */ -/* FIXME. 
*/ +/* lexer's ctor. */ lexer::lexer () : m_buffer (), m_next_char_idx (0), m_num_next_tokens (0) { } -/* FIXME. */ +/* Peek the next token. */ const token * lexer::peek () @@ -488,7 +509,7 @@ lexer::peek () return &m_next_tokens[0]; } -/* FIXME. */ +/* Consume the next token. */ void lexer::consume () @@ -514,17 +535,21 @@ lexer::consume () sizeof (token) * m_num_next_tokens); } -/* FIXME. */ +/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this lexer's + buffer. */ void lexer::add_utf8 (size_t length, const char *utf8_buf) { - // FIXME: this assumes Latin-1. + /* FIXME: this blithely ignores the niceties of UTF-8 and simply pushes the + bytes into the buffer. */ for (size_t i = 0; i < length; i++) m_buffer.safe_push (utf8_buf[i]); } -/* FIXME. */ +/* Attempt to get the next unicode character from this lexer's buffer. + If successful, write it to OUT and return true. + Otherwise, return false. */ bool lexer::get_char (unichar &out) @@ -544,7 +569,9 @@ lexer::unget_char () --m_next_char_idx; } -/* FIXME. */ +/* Print a textual representation of TOK to OUTF. + This is intended for debugging the lexer and parser, + rather than for user-facing output. */ void lexer::dump_token (FILE *outf, const token *tok) @@ -609,7 +636,8 @@ lexer::dump_token (FILE *outf, const token *tok) } } -/* FIXME. */ +/* Attempt to lex the input buffer, writing the next token to OUT. + On errors, TOK_ERROR (or TOK_EOF) is written to OUT. */ void lexer::lex_token (token *out) @@ -709,7 +737,9 @@ lexer::lex_token (token *out) } } -/* FIXME. */ +/* Having consumed an open-quote character from the lexer's buffer, attempt + to lex the rest of a JSON string, writing the result to OUT (or TOK_ERROR + if an error occurred). */ void lexer::lex_string (token *out) @@ -770,7 +800,9 @@ lexer::lex_string (token *out) // FIXME: leaks? have a json_context do the allocation } -/* FIXME. 
*/ +/* Having consumed FIRST_CHAR, an initial digit or '-' character from + the lexer's buffer, attempt to lex the rest of a JSON number, writing + the result to OUT (or TOK_ERROR if an error occurred). */ void lexer::lex_number (token *out, unichar first_char) @@ -829,7 +861,7 @@ lexer::rest_of_literal_p (const char *suffix) const } } -/* FIXME. */ +/* parser's ctor. */ parser::parser (char **err_out) : m_lexer (), m_err_out (err_out) @@ -839,7 +871,8 @@ parser::parser (char **err_out) *err_out = NULL; } -/* FIXME. */ +/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this parser's + lexer's buffer. */ void parser::add_utf8 (size_t length, const char *utf8_buf) @@ -996,7 +1029,7 @@ parser::parse_array () return result; } -/* FIXME. */ +/* Consume the next token, issuing an error if it is not of kind TOK_ID. */ void parser::require (enum token_id tok_id) @@ -1008,7 +1041,12 @@ parser::require (enum token_id tok_id) m_lexer.consume (); } -/* FIXME. */ +/* Issue a parsing error. If this is the first error that has occurred on + the parser, store it within the parser's m_err_out (the buffer will + eventually need to be freed by the caller of the parser). + Otherwise the error is discarded. + + TODO: maybe provide a callback so that client code can print all errors? */ void parser::error_at (int index, const char *fmt, ...) 
@@ -1067,7 +1105,7 @@ json::parse_utf8_string (const char *utf8, char **err_out) return parse_utf8_string (strlen (utf8), utf8, err_out); } - + #if CHECKING_P namespace selftest { @@ -1095,6 +1133,12 @@ test_parse_string () ASSERT_EQ (JSON_STRING, jv->get_kind ()); ASSERT_STREQ ("foo", ((json::string *)jv)->get_string ()); assert_to_str_eq ("\"foo\"", jv); + + json::value *clone = jv->clone (); + ASSERT_EQ (JSON_STRING, clone->get_kind ()); + ASSERT_STREQ ("foo", ((json::string *)clone)->get_string ()); + assert_to_str_eq ("\"foo\"", clone); + delete clone; delete jv; const char *contains_quotes = "\"before \\\"quoted\\\" after\""; @@ -1104,6 +1148,38 @@ test_parse_string () ASSERT_STREQ ("before \"quoted\" after", ((json::string *)jv)->get_string ()); assert_to_str_eq (contains_quotes, jv); delete jv; + + /* Test of non-ASCII input. This string is the Japanese word "mojibake", + written as C octal-escaped UTF-8. */ + const char *mojibake = (/* Opening quote. */ + "\"" + /* U+6587 CJK UNIFIED IDEOGRAPH-6587 + UTF-8: 0xE6 0x96 0x87 + C octal escaped UTF-8: \346\226\207. */ + "\346\226\207" + /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57 + UTF-8: 0xE5 0xAD 0x97 + C octal escaped UTF-8: \345\255\227. */ + "\345\255\227" + /* U+5316 CJK UNIFIED IDEOGRAPH-5316 + UTF-8: 0xE5 0x8C 0x96 + C octal escaped UTF-8: \345\214\226. */ + "\345\214\226" + /* U+3051 HIRAGANA LETTER KE + UTF-8: 0xE3 0x81 0x91 + C octal escaped UTF-8: \343\201\221. */ + "\343\201\221" + /* Closing quote. */ + "\""); + jv = parse_utf8_string (mojibake, &err); + ASSERT_EQ (NULL, err); + ASSERT_EQ (JSON_STRING, jv->get_kind ()); + /* Result of get_string should be UTF-8 encoded, without quotes. */ + ASSERT_STREQ ("\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221", + ((json::string *)jv)->get_string ()); + /* Result of dump should be UTF-8 encoded, with quotes. */ + assert_to_str_eq (mojibake, jv); + delete jv; } /* FIXME. 
*/ diff --git a/gcc/json.h b/gcc/json.h index 6e71beb..b6bdb13 100644 --- a/gcc/json.h +++ b/gcc/json.h @@ -100,7 +100,8 @@ class value char *&out_err) const; }; -/* Subclass of value for objects: key/value pairs. */ +/* Subclass of value for objects: an unordered collection of + key/value pairs. */ class object : public value { -- 1.8.5.3