/* gmarkup.c - Simple XML-like parser
 *
 *  Copyright 2000, 2003 Red Hat, Inc.
 *  Copyright 2007, 2008 Ryan Lortie
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "config.h"

#include <stdarg.h>   /* va_list, va_start, va_end */
#include <string.h>   /* memset, strlen, strncmp */
#include <stdio.h>
#include <stdlib.h>   /* strtoul */
#include <errno.h>    /* errno */

#include "glib.h"
#include "glibintl.h"

#include "galias.h"

/* Returns the error domain quark used for every GError this parser raises. */
GQuark
g_markup_error_quark (void)
{
  return g_quark_from_static_string ("g-markup-error-quark");
}

/* States of the main tokenizer loop; the current one is kept in
 * GMarkupParseContext.state.  STATE_ERROR is entered by mark_error().
 */
typedef enum
{
  STATE_START,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,
  STATE_ERROR
} GMarkupParseState;

/* One saved frame of the subparser stack: the parser, user data and
 * subparser element that pop_subparser_stack() puts back into the context.
 */
typedef struct
{
  const char *prev_element;
  const GMarkupParser *prev_parser;
  gpointer prev_user_data;
} GMarkupRecursionTracker;

struct _GMarkupParseContext
{
  const GMarkupParser *parser;   /* callbacks currently in effect */

  GMarkupParseFlags flags;

  gint line_number;              /* starts at 1 */
  gint char_number;              /* starts at 1, reset to 1 on '\n' */

  gpointer user_data;            /* passed to every parser callback */
  GDestroyNotify dnotify;        /* if set, run on user_data when freed */

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  GString *partial_chunk;

  GMarkupParseState state;
  GSList *tag_stack;             /* open element names (gchar *), innermost first */

  /* Attributes of the start tag being parsed.  Both arrays are kept
   * NULL-terminated; cur_attr indexes the most recently added
   * attribute (-1 when there is none) and alloc_attrs is the number
   * of slots allocated in each array.
   */
  gchar **attr_names;
  gchar **attr_values;
  gint cur_attr;
  gint alloc_attrs;

  /* The chunk handed to the current parse call. */
  const gchar *current_text;
  gssize       current_text_len;
  const gchar *current_text_end;

  /* Bytes of an incomplete trailing UTF-8 character, held back until
   * the next chunk supplies the rest.
   */
  GString *leftover_char_portion;

  /* used to save the start of the last interesting thingy */
  const gchar *start;

  const gchar *iter;             /* current read position in current_text */

  guint document_empty : 1;      /* TRUE until the first '<' is seen */
  guint parsing : 1;             /* TRUE while a parse call is running */
  guint awaiting_pop : 1;        /* a subparser was popped; its user_data is in held_user_data */
  gint balance;                  /* set to 1 on entering a passthrough */

  /* subparser support */
  GSList *subparser_stack; /* (GMarkupRecursionTracker *) */
  const char *subparser_element;
  gpointer held_user_data;
};

/**
 * g_markup_parse_context_new:
 * @parser: a #GMarkupParser
 * @flags: one or more #GMarkupParseFlags
 * @user_data: user data to pass to #GMarkupParser functions
 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
 *
 * Creates a new parse context. A parse context is used to parse
 * marked-up documents. You can feed any number of documents into
 * a context, as long as no errors occur; once an error occurs,
 * the parse context can't continue to parse text (you have to free it
 * and create a new parse context).
* * Return value: a new #GMarkupParseContext **/ GMarkupParseContext * g_markup_parse_context_new (const GMarkupParser *parser, GMarkupParseFlags flags, gpointer user_data, GDestroyNotify user_data_dnotify) { GMarkupParseContext *context; g_return_val_if_fail (parser != NULL, NULL); context = g_new (GMarkupParseContext, 1); context->parser = parser; context->flags = flags; context->user_data = user_data; context->dnotify = user_data_dnotify; context->line_number = 1; context->char_number = 1; context->partial_chunk = NULL; context->state = STATE_START; context->tag_stack = NULL; context->attr_names = NULL; context->attr_values = NULL; context->cur_attr = -1; context->alloc_attrs = 0; context->current_text = NULL; context->current_text_len = -1; context->current_text_end = NULL; context->leftover_char_portion = NULL; context->start = NULL; context->iter = NULL; context->document_empty = TRUE; context->parsing = FALSE; context->awaiting_pop = FALSE; context->subparser_stack = NULL; context->subparser_element = NULL; /* this is only looked at if awaiting_pop = TRUE. initialise anyway. */ context->held_user_data = NULL; context->balance = 0; return context; } /** * g_markup_parse_context_free: * @context: a #GMarkupParseContext * * Frees a #GMarkupParseContext. Can't be called from inside * one of the #GMarkupParser functions. Can't be called while * a subparser is pushed. 
**/ void g_markup_parse_context_free (GMarkupParseContext *context) { g_return_if_fail (context != NULL); g_return_if_fail (!context->parsing); g_return_if_fail (!context->subparser_stack); g_return_if_fail (!context->awaiting_pop); if (context->dnotify) (* context->dnotify) (context->user_data); g_strfreev (context->attr_names); g_strfreev (context->attr_values); g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL); g_slist_free (context->tag_stack); if (context->partial_chunk) g_string_free (context->partial_chunk, TRUE); if (context->leftover_char_portion) g_string_free (context->leftover_char_portion, TRUE); g_free (context); } static void pop_subparser_stack (GMarkupParseContext *context); static void mark_error (GMarkupParseContext *context, GError *error) { context->state = STATE_ERROR; if (context->parser->error) (*context->parser->error) (context, error, context->user_data); /* report the error all the way up to free all the user-data */ while (context->subparser_stack) { pop_subparser_stack (context); context->awaiting_pop = FALSE; /* already been freed */ if (context->parser->error) (*context->parser->error) (context, error, context->user_data); } } static void set_error (GMarkupParseContext *context, GError **error, GMarkupError code, const gchar *format, ...) G_GNUC_PRINTF (4, 5); static void set_error (GMarkupParseContext *context, GError **error, GMarkupError code, const gchar *format, ...) 
{ GError *tmp_error; gchar *s; gchar *s_valid; va_list args; va_start (args, format); s = g_strdup_vprintf (format, args); va_end (args); /* Make sure that the GError message is valid UTF-8 even if it is * complaining about invalid UTF-8 in the markup: */ s_valid = _g_utf8_make_valid (s); tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, s_valid); g_free (s); g_free (s_valid); g_prefix_error (&tmp_error, _("Error on line %d char %d: "), context->line_number, context->char_number); mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } static void propagate_error (GMarkupParseContext *context, GError **dest, GError *src) { if (context->flags & G_MARKUP_PREFIX_ERROR_POSITION) g_prefix_error (&src, _("Error on line %d char %d: "), context->line_number, context->char_number); mark_error (context, src); g_propagate_error (dest, src); } /* To make these faster, we first use the ascii-only tests, then check * for the usual non-alnum name-end chars, and only then call the * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute * names, so this is a reasonable hack that virtually always avoids * the guniprop call. */ #define IS_COMMON_NAME_END_CHAR(c) \ ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ') static gboolean is_name_start_char (const gchar *p) { if (g_ascii_isalpha (*p) || (!IS_COMMON_NAME_END_CHAR (*p) && (*p == '_' || *p == ':' || g_unichar_isalpha (g_utf8_get_char (p))))) return TRUE; else return FALSE; } static gboolean is_name_char (const gchar *p) { if (g_ascii_isalnum (*p) || (!IS_COMMON_NAME_END_CHAR (*p) && (*p == '.' 
|| *p == '-' || *p == '_' || *p == ':' || g_unichar_isalpha (g_utf8_get_char (p))))) return TRUE; else return FALSE; } static gchar* char_str (gunichar c, gchar *buf) { memset (buf, 0, 8); g_unichar_to_utf8 (c, buf); return buf; } static gchar* utf8_str (const gchar *utf8, gchar *buf) { char_str (g_utf8_get_char (utf8), buf); return buf; } static void set_unescape_error (GMarkupParseContext *context, GError **error, const gchar *remaining_text, const gchar *remaining_text_end, GMarkupError code, const gchar *format, ...) { GError *tmp_error; gchar *s; va_list args; gint remaining_newlines; const gchar *p; remaining_newlines = 0; p = remaining_text; while (p != remaining_text_end) { if (*p == '\n') ++remaining_newlines; ++p; } va_start (args, format); s = g_strdup_vprintf (format, args); va_end (args); tmp_error = g_error_new (G_MARKUP_ERROR, code, _("Error on line %d: %s"), context->line_number - remaining_newlines, s); g_free (s); mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } typedef enum { USTATE_INSIDE_TEXT, USTATE_AFTER_AMPERSAND, USTATE_INSIDE_ENTITY_NAME, USTATE_AFTER_CHARREF_HASH } UnescapeState; typedef struct { GMarkupParseContext *context; GString *str; UnescapeState state; const gchar *text; const gchar *text_end; const gchar *entity_start; } UnescapeContext; static const gchar* unescape_text_state_inside_text (UnescapeContext *ucontext, const gchar *p, GError **error) { const gchar *start; gboolean normalize_attribute; if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ || ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ) normalize_attribute = TRUE; else normalize_attribute = FALSE; start = p; while (p != ucontext->text_end) { if (*p == '&') { break; } else if (normalize_attribute && (*p == '\t' || *p == '\n')) { g_string_append_len (ucontext->str, start, p - start); g_string_append_c (ucontext->str, ' '); p = g_utf8_next_char (p); start = p; } else if (*p == '\r') { g_string_append_len (ucontext->str, 
start, p - start); g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n'); p = g_utf8_next_char (p); if (p != ucontext->text_end && *p == '\n') p = g_utf8_next_char (p); start = p; } else p = g_utf8_next_char (p); } if (p != start) g_string_append_len (ucontext->str, start, p - start); if (p != ucontext->text_end && *p == '&') { p = g_utf8_next_char (p); ucontext->state = USTATE_AFTER_AMPERSAND; } return p; } static const gchar* unescape_text_state_after_ampersand (UnescapeContext *ucontext, const gchar *p, GError **error) { ucontext->entity_start = NULL; if (*p == '#') { p = g_utf8_next_char (p); ucontext->entity_start = p; ucontext->state = USTATE_AFTER_CHARREF_HASH; } else if (!is_name_start_char (p)) { if (*p == ';') { set_unescape_error (ucontext->context, error, p, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Empty entity '&;' seen; valid " "entities are: & " < > '")); } else { gchar buf[8]; set_unescape_error (ucontext->context, error, p, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Character '%s' is not valid at " "the start of an entity name; " "the & character begins an entity; " "if this ampersand isn't supposed " "to be an entity, escape it as " "&"), utf8_str (p, buf)); } } else { ucontext->entity_start = p; ucontext->state = USTATE_INSIDE_ENTITY_NAME; } return p; } static const gchar* unescape_text_state_inside_entity_name (UnescapeContext *ucontext, const gchar *p, GError **error) { while (p != ucontext->text_end) { if (*p == ';') break; else if (!is_name_char (p)) { gchar ubuf[8]; set_unescape_error (ucontext->context, error, p, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Character '%s' is not valid " "inside an entity name"), utf8_str (p, ubuf)); break; } p = g_utf8_next_char (p); } if (ucontext->context->state != STATE_ERROR) { if (p != ucontext->text_end) { gint len = p - ucontext->entity_start; /* move to after semicolon */ p = g_utf8_next_char (p); ucontext->state = USTATE_INSIDE_TEXT; if (strncmp (ucontext->entity_start, "lt", 
len) == 0) g_string_append_c (ucontext->str, '<'); else if (strncmp (ucontext->entity_start, "gt", len) == 0) g_string_append_c (ucontext->str, '>'); else if (strncmp (ucontext->entity_start, "amp", len) == 0) g_string_append_c (ucontext->str, '&'); else if (strncmp (ucontext->entity_start, "quot", len) == 0) g_string_append_c (ucontext->str, '"'); else if (strncmp (ucontext->entity_start, "apos", len) == 0) g_string_append_c (ucontext->str, '\''); else { gchar *name; name = g_strndup (ucontext->entity_start, len); set_unescape_error (ucontext->context, error, p, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Entity name '%s' is not known"), name); g_free (name); } } else { set_unescape_error (ucontext->context, error, /* give line number of the & */ ucontext->entity_start, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Entity did not end with a semicolon; " "most likely you used an ampersand " "character without intending to start " "an entity - escape ampersand as &")); } } #undef MAX_ENT_LEN return p; } static const gchar* unescape_text_state_after_charref_hash (UnescapeContext *ucontext, const gchar *p, GError **error) { gboolean is_hex = FALSE; const char *start; start = ucontext->entity_start; if (*p == 'x') { is_hex = TRUE; p = g_utf8_next_char (p); start = p; } while (p != ucontext->text_end && *p != ';') p = g_utf8_next_char (p); if (p != ucontext->text_end) { g_assert (*p == ';'); /* digit is between start and p */ if (start != p) { gulong l; gchar *end = NULL; errno = 0; if (is_hex) l = strtoul (start, &end, 16); else l = strtoul (start, &end, 10); if (end != p || errno != 0) { set_unescape_error (ucontext->context, error, start, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Failed to parse '%-.*s', which " "should have been a digit " "inside a character reference " "(ê for example) - perhaps " "the digit is too large"), p - start, start); } else { /* characters XML permits */ if (l == 0x9 || l == 0xA || l == 0xD || (l >= 0x20 && l <= 0xD7FF) || (l >= 0xE000 
&& l <= 0xFFFD) || (l >= 0x10000 && l <= 0x10FFFF)) { gchar buf[8]; g_string_append (ucontext->str, char_str (l, buf)); } else { set_unescape_error (ucontext->context, error, start, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Character reference '%-.*s' does not " "encode a permitted character"), p - start, start); } } /* Move to next state */ p = g_utf8_next_char (p); /* past semicolon */ ucontext->state = USTATE_INSIDE_TEXT; } else { set_unescape_error (ucontext->context, error, start, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Empty character reference; " "should include a digit such as " "dž")); } } else { set_unescape_error (ucontext->context, error, start, ucontext->text_end, G_MARKUP_ERROR_PARSE, _("Character reference did not end with a " "semicolon; " "most likely you used an ampersand " "character without intending to start " "an entity - escape ampersand as &")); } return p; } static gboolean unescape_text (GMarkupParseContext *context, const gchar *text, const gchar *text_end, GString **unescaped, GError **error) { UnescapeContext ucontext; const gchar *p; ucontext.context = context; ucontext.text = text; ucontext.text_end = text_end; ucontext.entity_start = NULL; ucontext.str = g_string_sized_new (text_end - text); ucontext.state = USTATE_INSIDE_TEXT; p = text; while (p != text_end && context->state != STATE_ERROR) { g_assert (p < text_end); switch (ucontext.state) { case USTATE_INSIDE_TEXT: { p = unescape_text_state_inside_text (&ucontext, p, error); } break; case USTATE_AFTER_AMPERSAND: { p = unescape_text_state_after_ampersand (&ucontext, p, error); } break; case USTATE_INSIDE_ENTITY_NAME: { p = unescape_text_state_inside_entity_name (&ucontext, p, error); } break; case USTATE_AFTER_CHARREF_HASH: { p = unescape_text_state_after_charref_hash (&ucontext, p, error); } break; default: g_assert_not_reached (); break; } } if (context->state != STATE_ERROR) { switch (ucontext.state) { case USTATE_INSIDE_TEXT: break; case USTATE_AFTER_AMPERSAND: case 
USTATE_INSIDE_ENTITY_NAME: set_unescape_error (context, error, NULL, NULL, G_MARKUP_ERROR_PARSE, _("Unfinished entity reference")); break; case USTATE_AFTER_CHARREF_HASH: set_unescape_error (context, error, NULL, NULL, G_MARKUP_ERROR_PARSE, _("Unfinished character reference")); break; } } if (context->state == STATE_ERROR) { g_string_free (ucontext.str, TRUE); *unescaped = NULL; return FALSE; } else { *unescaped = ucontext.str; return TRUE; } } static inline gboolean advance_char (GMarkupParseContext *context) { context->iter = g_utf8_next_char (context->iter); context->char_number += 1; if (context->iter == context->current_text_end) { return FALSE; } else if (*context->iter == '\n') { context->line_number += 1; context->char_number = 1; } return TRUE; } static inline gboolean xml_isspace (char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } static void skip_spaces (GMarkupParseContext *context) { do { if (!xml_isspace (*context->iter)) return; } while (advance_char (context)); } static void advance_to_name_end (GMarkupParseContext *context) { do { if (!is_name_char (context->iter)) return; } while (advance_char (context)); } static void add_to_partial (GMarkupParseContext *context, const gchar *text_start, const gchar *text_end) { if (context->partial_chunk == NULL) context->partial_chunk = g_string_sized_new (text_end - text_start); if (text_start != text_end) g_string_append_len (context->partial_chunk, text_start, text_end - text_start); /* Invariant here that partial_chunk exists */ } static void truncate_partial (GMarkupParseContext *context) { if (context->partial_chunk != NULL) { context->partial_chunk = g_string_truncate (context->partial_chunk, 0); } } static const gchar* current_element (GMarkupParseContext *context) { return context->tag_stack->data; } static void pop_subparser_stack (GMarkupParseContext *context) { GMarkupRecursionTracker *tracker; g_assert (context->subparser_stack); tracker = context->subparser_stack->data; 
context->awaiting_pop = TRUE; context->held_user_data = context->user_data; context->user_data = tracker->prev_user_data; context->parser = tracker->prev_parser; context->subparser_element = tracker->prev_element; g_slice_free (GMarkupRecursionTracker, tracker); context->subparser_stack = g_slist_delete_link (context->subparser_stack, context->subparser_stack); } static void possibly_finish_subparser (GMarkupParseContext *context) { if (current_element (context) == context->subparser_element) pop_subparser_stack (context); } static void ensure_no_outstanding_subparser (GMarkupParseContext *context) { if (context->awaiting_pop) g_critical ("During the first end_element call after invoking a " "subparser you must pop the subparser stack and handle " "the freeing of the subparser user_data. This can be " "done by calling the end function of the subparser. " "Very probably, your program just leaked memory."); /* let valgrind watch the pointer disappear... */ context->held_user_data = NULL; context->awaiting_pop = FALSE; } static const gchar* current_attribute (GMarkupParseContext *context) { g_assert (context->cur_attr >= 0); return context->attr_names[context->cur_attr]; } static void find_current_text_end (GMarkupParseContext *context) { /* This function must be safe (non-segfaulting) on invalid UTF8. * It assumes the string starts with a character start */ const gchar *end = context->current_text + context->current_text_len; const gchar *p; const gchar *next; g_assert (context->current_text_len > 0); p = g_utf8_find_prev_char (context->current_text, end); g_assert (p != NULL); /* since current_text was a char start */ /* p is now the start of the last character or character portion. 
*/ g_assert (p != end); next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */ if (next == end) { /* whole character */ context->current_text_end = end; } else { /* portion */ context->leftover_char_portion = g_string_new_len (p, end - p); context->current_text_len -= (end - p); context->current_text_end = p; } } static void add_attribute (GMarkupParseContext *context, char *name) { if (context->cur_attr + 2 >= context->alloc_attrs) { context->alloc_attrs += 5; /* silly magic number */ context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs); context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs); } context->cur_attr++; context->attr_names[context->cur_attr] = name; context->attr_values[context->cur_attr] = NULL; context->attr_names[context->cur_attr+1] = NULL; context->attr_values[context->cur_attr+1] = NULL; } /** * g_markup_parse_context_parse: * @context: a #GMarkupParseContext * @text: chunk of text to parse * @text_len: length of @text in bytes * @error: return location for a #GError * * Feed some data to the #GMarkupParseContext. The data need not * be valid UTF-8; an error will be signaled if it's invalid. * The data need not be an entire document; you can feed a document * into the parser incrementally, via multiple calls to this function. * Typically, as you receive data from a network connection or file, * you feed each received chunk of data into this function, aborting * the process if an error occurs. Once an error is reported, no further * data may be fed to the #GMarkupParseContext; all errors are fatal. 
* * Return value: %FALSE if an error occurred, %TRUE on success **/ gboolean g_markup_parse_context_parse (GMarkupParseContext *context, const gchar *text, gssize text_len, GError **error) { const gchar *first_invalid; g_return_val_if_fail (context != NULL, FALSE); g_return_val_if_fail (text != NULL, FALSE); g_return_val_if_fail (context->state != STATE_ERROR, FALSE); g_return_val_if_fail (!context->parsing, FALSE); if (text_len < 0) text_len = strlen (text); if (text_len == 0) return TRUE; context->parsing = TRUE; if (context->leftover_char_portion) { const gchar *first_char; if ((*text & 0xc0) != 0x80) first_char = text; else first_char = g_utf8_find_next_char (text, text + text_len); if (first_char) { /* leftover_char_portion was completed. Parse it. */ GString *portion = context->leftover_char_portion; g_string_append_len (context->leftover_char_portion, text, first_char - text); /* hacks to allow recursion */ context->parsing = FALSE; context->leftover_char_portion = NULL; if (!g_markup_parse_context_parse (context, portion->str, portion->len, error)) { g_assert (context->state == STATE_ERROR); } g_string_free (portion, TRUE); context->parsing = TRUE; /* Skip the fraction of char that was in this text */ text_len -= (first_char - text); text = first_char; } else { /* another little chunk of the leftover char; geez * someone is inefficient. */ g_string_append_len (context->leftover_char_portion, text, text_len); if (context->leftover_char_portion->len > 7) { /* The leftover char portion is too big to be * a UTF-8 character */ set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, _("Invalid UTF-8 encoded text - overlong sequence")); } goto finished; } } context->current_text = text; context->current_text_len = text_len; context->iter = context->current_text; context->start = context->iter; /* Nothing left after finishing the leftover char, or nothing * passed in to begin with. 
*/ if (context->current_text_len == 0) goto finished; /* find_current_text_end () assumes the string starts at * a character start, so we need to validate at least * that much. It doesn't assume any following bytes * are valid. */ if ((*context->current_text & 0xc0) == 0x80) /* not a char start */ { set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, _("Invalid UTF-8 encoded text - not a start char")); goto finished; } /* Initialize context->current_text_end, possibly adjusting * current_text_len, and add any leftover char portion */ find_current_text_end (context); /* Validate UTF8 (must be done after we find the end, since * we could have a trailing incomplete char) */ if (!g_utf8_validate (context->current_text, context->current_text_len, &first_invalid)) { gint newlines = 0; const gchar *p, *q; q = p = context->current_text; while (p != first_invalid) { if (*p == '\n') { ++newlines; q = p + 1; context->char_number = 1; } ++p; } context->line_number += newlines; context->char_number += g_utf8_strlen (q, first_invalid - q); set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, _("Invalid UTF-8 encoded text - not valid '%s'"), g_strndup (context->current_text, context->current_text_len)); goto finished; } while (context->iter != context->current_text_end) { switch (context->state) { case STATE_START: /* Possible next state: AFTER_OPEN_ANGLE */ g_assert (context->tag_stack == NULL); /* whitespace is ignored outside of any elements */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '<') { /* Move after the open angle */ advance_char (context); context->state = STATE_AFTER_OPEN_ANGLE; /* this could start a passthrough */ context->start = context->iter; /* document is now non-empty */ context->document_empty = FALSE; } else { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Document must begin with an element (e.g. 
)")); } } break; case STATE_AFTER_OPEN_ANGLE: /* Possible next states: INSIDE_OPEN_TAG_NAME, * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH */ if (*context->iter == '?' || *context->iter == '!') { /* include < in the passthrough */ const gchar *openangle = "<"; add_to_partial (context, openangle, openangle + 1); context->start = context->iter; context->balance = 1; context->state = STATE_INSIDE_PASSTHROUGH; } else if (*context->iter == '/') { /* move after it */ advance_char (context); context->state = STATE_AFTER_CLOSE_TAG_SLASH; } else if (is_name_start_char (context->iter)) { context->state = STATE_INSIDE_OPEN_TAG_NAME; /* start of tag name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "a '<' character; it may not begin an " "element name"), utf8_str (context->iter, buf)); } break; /* The AFTER_CLOSE_ANGLE state is actually sort of * broken, because it doesn't correspond to a range * of characters in the input stream as the others do, * and thus makes things harder to conceptualize */ case STATE_AFTER_CLOSE_ANGLE: /* Possible next states: INSIDE_TEXT, STATE_START */ if (context->tag_stack == NULL) { context->start = NULL; context->state = STATE_START; } else { context->start = context->iter; context->state = STATE_INSIDE_TEXT; } break; case STATE_AFTER_ELISION_SLASH: /* Possible next state: AFTER_CLOSE_ANGLE */ { /* We need to pop the tag stack and call the end_element * function, since this is the close tag */ GError *tmp_error = NULL; g_assert (context->tag_stack != NULL); possibly_finish_subparser (context); tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, context->tag_stack->data, context->user_data, &tmp_error); ensure_no_outstanding_subparser (context); if (tmp_error) { mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } else { if (*context->iter == '>') { /* move after the close angle */ 
advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '>' character " "to end the start tag of element '%s'"), utf8_str (context->iter, buf), current_element (context)); } } g_free (context->tag_stack->data); context->tag_stack = g_slist_delete_link (context->tag_stack, context->tag_stack); } break; case STATE_INSIDE_OPEN_TAG_NAME: /* Possible next states: BETWEEN_ATTRIBUTES */ /* if there's a partial chunk then it's the first part of the * tag name. If there's a context->start then it's the start * of the tag name in current_text, the partial chunk goes * before that start though. */ advance_to_name_end (context); if (context->iter == context->current_text_end) { /* The name hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. */ add_to_partial (context, context->start, context->iter); } else { /* The name has ended. Combine it with the partial chunk * if any; push it on the stack; enter next state. */ add_to_partial (context, context->start, context->iter); context->tag_stack = g_slist_prepend (context->tag_stack, g_string_free (context->partial_chunk, FALSE)); context->partial_chunk = NULL; context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; } break; case STATE_INSIDE_ATTRIBUTE_NAME: /* Possible next states: AFTER_ATTRIBUTE_NAME */ advance_to_name_end (context); add_to_partial (context, context->start, context->iter); /* read the full name, if we enter the equals sign state * then add the attribute to the list (without the value), * otherwise store a partial chunk to be prepended later. */ if (context->iter != context->current_text_end) context->state = STATE_AFTER_ATTRIBUTE_NAME; break; case STATE_AFTER_ATTRIBUTE_NAME: /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */ skip_spaces (context); if (context->iter != context->current_text_end) { /* The name has ended. 
Combine it with the partial chunk * if any; push it on the stack; enter next state. */ add_attribute (context, g_string_free (context->partial_chunk, FALSE)); context->partial_chunk = NULL; context->start = NULL; if (*context->iter == '=') { advance_char (context); context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '=' after " "attribute name '%s' of element '%s'"), utf8_str (context->iter, buf), current_attribute (context), current_element (context)); } } break; case STATE_BETWEEN_ATTRIBUTES: /* Possible next states: AFTER_CLOSE_ANGLE, * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '/') { advance_char (context); context->state = STATE_AFTER_ELISION_SLASH; } else if (*context->iter == '>') { advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; } else if (is_name_start_char (context->iter)) { context->state = STATE_INSIDE_ATTRIBUTE_NAME; /* start of attribute name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '>' or '/' " "character to end the start tag of " "element '%s', or optionally an attribute; " "perhaps you used an invalid character in " "an attribute name"), utf8_str (context->iter, buf), current_element (context)); } /* If we're done with attributes, invoke * the start_element callback */ if (context->state == STATE_AFTER_ELISION_SLASH || context->state == STATE_AFTER_CLOSE_ANGLE) { const gchar *start_name; /* Ugly, but the current code expects an empty array instead of NULL */ const gchar *empty = NULL; const gchar **attr_names = ∅ const gchar **attr_values = ∅ GError *tmp_error; /* Call user callback for element start */ start_name = current_element (context); if (context->cur_attr >= 0) { attr_names = (const gchar**)context->attr_names; attr_values 
= (const gchar**)context->attr_values; } tmp_error = NULL; if (context->parser->start_element) (* context->parser->start_element) (context, start_name, (const gchar **)attr_names, (const gchar **)attr_values, context->user_data, &tmp_error); /* Go ahead and free the attributes. */ for (; context->cur_attr >= 0; context->cur_attr--) { int pos = context->cur_attr; g_free (context->attr_names[pos]); g_free (context->attr_values[pos]); context->attr_names[pos] = context->attr_values[pos] = NULL; } g_assert (context->cur_attr == -1); g_assert (context->attr_names == NULL || context->attr_names[0] == NULL); g_assert (context->attr_values == NULL || context->attr_values[0] == NULL); if (tmp_error != NULL) propagate_error (context, error, tmp_error); } } break; case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN: /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '"') { advance_char (context); context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ; context->start = context->iter; } else if (*context->iter == '\'') { advance_char (context); context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ; context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected an open quote mark " "after the equals sign when giving value for " "attribute '%s' of element '%s'"), utf8_str (context->iter, buf), current_attribute (context), current_element (context)); } } break; case STATE_INSIDE_ATTRIBUTE_VALUE_SQ: case STATE_INSIDE_ATTRIBUTE_VALUE_DQ: /* Possible next states: BETWEEN_ATTRIBUTES */ { gchar delim; if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ) { delim = '\''; } else { delim = '"'; } do { if (*context->iter == delim) break; } while (advance_char (context)); } if (context->iter == context->current_text_end) { /* The value hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. 
*/ add_to_partial (context, context->start, context->iter); } else { /* The value has ended at the quote mark. Combine it * with the partial chunk if any; set it for the current * attribute. */ GString *unescaped; add_to_partial (context, context->start, context->iter); g_assert (context->cur_attr >= 0); if (unescape_text (context, context->partial_chunk->str, context->partial_chunk->str + context->partial_chunk->len, &unescaped, error)) { /* success, advance past quote and set state. */ context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE); advance_char (context); context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; } truncate_partial (context); } break; case STATE_INSIDE_TEXT: /* Possible next states: AFTER_OPEN_ANGLE */ do { if (*context->iter == '<') break; } while (advance_char (context)); /* The text hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. */ add_to_partial (context, context->start, context->iter); if (context->iter != context->current_text_end) { GString *unescaped = NULL; /* The text has ended at the open angle. Call the text * callback. */ if (unescape_text (context, context->partial_chunk->str, context->partial_chunk->str + context->partial_chunk->len, &unescaped, error)) { GError *tmp_error = NULL; if (context->parser->text) (*context->parser->text) (context, unescaped->str, unescaped->len, context->user_data, &tmp_error); g_string_free (unescaped, TRUE); if (tmp_error == NULL) { /* advance past open angle and set state. 
*/ advance_char (context); context->state = STATE_AFTER_OPEN_ANGLE; /* could begin a passthrough */ context->start = context->iter; } else propagate_error (context, error, tmp_error); } truncate_partial (context); } break; case STATE_AFTER_CLOSE_TAG_SLASH: /* Possible next state: INSIDE_CLOSE_TAG_NAME */ if (is_name_start_char (context->iter)) { context->state = STATE_INSIDE_CLOSE_TAG_NAME; /* start of tag name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "the characters 'iter, buf), utf8_str (context->iter, buf)); } break; case STATE_INSIDE_CLOSE_TAG_NAME: /* Possible next state: AFTER_CLOSE_TAG_NAME */ advance_to_name_end (context); add_to_partial (context, context->start, context->iter); if (context->iter != context->current_text_end) context->state = STATE_AFTER_CLOSE_TAG_NAME; break; case STATE_AFTER_CLOSE_TAG_NAME: /* Possible next state: AFTER_CLOSE_TAG_SLASH */ skip_spaces (context); if (context->iter != context->current_text_end) { gchar *close_name; /* The name has ended. Combine it with the partial chunk * if any; check that it matches stack top and pop * stack; invoke proper callback; enter next state. 
*/ close_name = g_string_free (context->partial_chunk, FALSE); context->partial_chunk = NULL; if (*context->iter != '>') { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "the close element name '%s'; the allowed " "character is '>'"), utf8_str (context->iter, buf), close_name); } else if (context->tag_stack == NULL) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, no element " "is currently open"), close_name); } else if (strcmp (close_name, current_element (context)) != 0) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, but the currently " "open element is '%s'"), close_name, current_element (context)); } else { GError *tmp_error; advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; context->start = NULL; possibly_finish_subparser (context); /* call the end_element callback */ tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, close_name, context->user_data, &tmp_error); ensure_no_outstanding_subparser (context); /* Pop the tag stack */ g_free (context->tag_stack->data); context->tag_stack = g_slist_delete_link (context->tag_stack, context->tag_stack); if (tmp_error) propagate_error (context, error, tmp_error); } g_free (close_name); } break; case STATE_INSIDE_PASSTHROUGH: /* Possible next state: AFTER_CLOSE_ANGLE */ do { if (*context->iter == '<') context->balance++; if (*context->iter == '>') { gchar *str; gsize len; context->balance--; add_to_partial (context, context->start, context->iter); context->start = context->iter; str = context->partial_chunk->str; len = context->partial_chunk->len; if (str[1] == '?' && str[len - 1] == '?') break; if (strncmp (str, "