From 733d209b14f1b88288e08241a64ef77284108505 Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Sun, 8 Aug 2010 23:21:54 -0400 Subject: [PATCH] Move GRegex docs inline --- docs/reference/glib/tmpl/gregex.sgml | 290 ++++++++------------------- glib/gregex.c | 60 ++++++ glib/gregex.h | 225 +++++++++++++++++++++ 3 files changed, 365 insertions(+), 210 deletions(-) diff --git a/docs/reference/glib/tmpl/gregex.sgml b/docs/reference/glib/tmpl/gregex.sgml index f3d44794a..d342b977d 100644 --- a/docs/reference/glib/tmpl/gregex.sgml +++ b/docs/reference/glib/tmpl/gregex.sgml @@ -2,69 +2,11 @@ Perl-compatible regular expressions -matches strings against regular expressions + -The g_regex_*() functions implement regular -expression pattern matching using syntax and semantics similar to -Perl regular expression. - - -Some functions accept a start_position argument, -setting it differs from just passing over a shortened string and setting -#G_REGEX_MATCH_NOTBOL in the case of a pattern that begins with any kind -of lookbehind assertion. -For example, consider the pattern "\Biss\B" which finds occurrences of "iss" -in the middle of words. ("\B" matches only if the current position in the -subject is not a word boundary.) When applied to the string "Mississipi" -from the fourth byte, namely "issipi", it does not match, because "\B" is -always false at the start of the subject, which is deemed to be a word -boundary. However, if the entire string is passed , but with -start_position set to 4, it finds the second -occurrence of "iss" because it is able to look behind the starting point -to discover that it is preceded by a letter. - - -Note that, unless you set the #G_REGEX_RAW flag, all the strings passed -to these functions must be encoded in UTF-8. The lengths and the positions -inside the strings are in bytes and not in characters, so, for instance, -"\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a single -character. 
If you set #G_REGEX_RAW the strings can be non-valid UTF-8 -strings and a byte is treated as a character, so "\xc3\xa0" is two bytes -and two characters long. - - -When matching a pattern, "\n" matches only against a "\n" character in the -string, and "\r" matches only a "\r" character. To match any newline sequence -use "\R". This particular group matches either the two-character sequence -CR + LF ("\r\n"), or one of the single characters LF (linefeed, U+000A, "\n"), VT -(vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), CR (carriage return, -U+000D, "\r"), NEL (next line, U+0085), LS (line separator, U+2028), or PS -(paragraph separator, U+2029). - - -The behaviour of the dot, circumflex, and dollar metacharacters are affected by -newline characters, the default is to recognize any newline character (the same -characters recognized by "\R"). This can be changed with #G_REGEX_NEWLINE_CR, -#G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF compile options, -and with #G_REGEX_MATCH_NEWLINE_ANY, #G_REGEX_MATCH_NEWLINE_CR, -#G_REGEX_MATCH_NEWLINE_LF and #G_REGEX_MATCH_NEWLINE_CRLF match options. -These settings are also relevant when compiling a pattern if -#G_REGEX_EXTENDED is set, and an unescaped "#" outside a character class is -encountered. This indicates a comment that lasts until after the next -newline. - - -Creating and manipulating the same #GRegex structure from different -threads is not a problem as #GRegex does not modify its internal -state between creation and destruction, on the other hand #GMatchInfo is -not threadsafe. - - -The regular expressions low level functionalities are obtained through -the excellent PCRE library -written by Philip Hazel. + @@ -80,184 +22,112 @@ written by Philip Hazel. -Error codes returned by regular expressions functions. + -@G_REGEX_ERROR_COMPILE: Compilation of the regular expression failed. -@G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression failed. 
-@G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement string. -@G_REGEX_ERROR_MATCH: The match process failed. -@G_REGEX_ERROR_INTERNAL: Internal error of the regular expression engine. Since 2.16 -@G_REGEX_ERROR_STRAY_BACKSLASH: "\\" at end of pattern. Since 2.16 -@G_REGEX_ERROR_MISSING_CONTROL_CHAR: "\\c" at end of pattern. Since 2.16 -@G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: Unrecognized character follows "\\". Since 2.16 -@G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: Numbers out of order in "{}" quantifier. Since 2.16 -@G_REGEX_ERROR_QUANTIFIER_TOO_BIG: Number too big in "{}" quantifier. Since 2.16 -@G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: Missing terminating "]" for character class. Since 2.16 -@G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: Invalid escape sequence in character class. Since 2.16 -@G_REGEX_ERROR_RANGE_OUT_OF_ORDER: Range out of order in character class. Since 2.16 -@G_REGEX_ERROR_NOTHING_TO_REPEAT: Nothing to repeat. Since 2.16 -@G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: Unrecognized character after "(?", "(?<" or "(?P". Since 2.16 -@G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: POSIX named classes are supported only within a class. Since 2.16 -@G_REGEX_ERROR_UNMATCHED_PARENTHESIS: Missing terminating ")" or ")" without opening "(". Since 2.16 -@G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: Reference to non-existent subpattern. Since 2.16 -@G_REGEX_ERROR_UNTERMINATED_COMMENT: Missing terminating ")" after comment. Since 2.16 -@G_REGEX_ERROR_EXPRESSION_TOO_LARGE: Regular expression too large. Since 2.16 -@G_REGEX_ERROR_MEMORY_ERROR: Failed to get memory. Since 2.16 -@G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: Lookbehind assertion is not fixed length. Since 2.16 -@G_REGEX_ERROR_MALFORMED_CONDITION: Malformed number or name after "(?(". Since 2.16 -@G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: Conditional group contains more than two branches. Since 2.16 -@G_REGEX_ERROR_ASSERTION_EXPECTED: Assertion expected after "(?(". 
Since 2.16 -@G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: Unknown POSIX class name. Since 2.16 -@G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: POSIX collating elements are not supported. Since 2.16 -@G_REGEX_ERROR_HEX_CODE_TOO_LARGE: Character value in "\\x{...}" sequence is too large. Since 2.16 -@G_REGEX_ERROR_INVALID_CONDITION: Invalid condition "(?(0)". Since 2.16 -@G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: \\C not allowed in lookbehind assertion. Since 2.16 -@G_REGEX_ERROR_INFINITE_LOOP: Recursive call could loop indefinitely. Since 2.16 -@G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: Missing terminator in subpattern name. Since 2.16 -@G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: Two named subpatterns have the same name. Since 2.16 -@G_REGEX_ERROR_MALFORMED_PROPERTY: Malformed "\\P" or "\\p" sequence. Since 2.16 -@G_REGEX_ERROR_UNKNOWN_PROPERTY: Unknown property name after "\\P" or "\\p". Since 2.16 -@G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: Subpattern name is too long (maximum 32 characters). Since 2.16 -@G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: Too many named subpatterns (maximum 10,000). Since 2.16 -@G_REGEX_ERROR_INVALID_OCTAL_VALUE: Octal value is greater than "\\377". Since 2.16 -@G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: "DEFINE" group contains more than one branch. Since 2.16 -@G_REGEX_ERROR_DEFINE_REPETION: Repeating a "DEFINE" group is not allowed. Since 2.16 -@G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: Inconsistent newline options. Since 2.16 -@G_REGEX_ERROR_MISSING_BACK_REFERENCE: "\\g" is not followed by a braced name or an -optionally braced non-zero number. 
Since 2.16 -@Since: 2.14 +@G_REGEX_ERROR_COMPILE: +@G_REGEX_ERROR_OPTIMIZE: +@G_REGEX_ERROR_REPLACE: +@G_REGEX_ERROR_MATCH: +@G_REGEX_ERROR_INTERNAL: +@G_REGEX_ERROR_STRAY_BACKSLASH: +@G_REGEX_ERROR_MISSING_CONTROL_CHAR: +@G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: +@G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: +@G_REGEX_ERROR_QUANTIFIER_TOO_BIG: +@G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: +@G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: +@G_REGEX_ERROR_RANGE_OUT_OF_ORDER: +@G_REGEX_ERROR_NOTHING_TO_REPEAT: +@G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: +@G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: +@G_REGEX_ERROR_UNMATCHED_PARENTHESIS: +@G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: +@G_REGEX_ERROR_UNTERMINATED_COMMENT: +@G_REGEX_ERROR_EXPRESSION_TOO_LARGE: +@G_REGEX_ERROR_MEMORY_ERROR: +@G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: +@G_REGEX_ERROR_MALFORMED_CONDITION: +@G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: +@G_REGEX_ERROR_ASSERTION_EXPECTED: +@G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: +@G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: +@G_REGEX_ERROR_HEX_CODE_TOO_LARGE: +@G_REGEX_ERROR_INVALID_CONDITION: +@G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: +@G_REGEX_ERROR_INFINITE_LOOP: +@G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: +@G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: +@G_REGEX_ERROR_MALFORMED_PROPERTY: +@G_REGEX_ERROR_UNKNOWN_PROPERTY: +@G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: +@G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: +@G_REGEX_ERROR_INVALID_OCTAL_VALUE: +@G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: +@G_REGEX_ERROR_DEFINE_REPETION: +@G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: +@G_REGEX_ERROR_MISSING_BACK_REFERENCE: -Error domain for regular expressions. Errors in this domain will be from the #GRegexError enumeration. See #GError for information on error domains. + -@Since: 2.14 +@Since: -Flags specifying compile-time options. + -@G_REGEX_CASELESS: Letters in the pattern match both upper and lower case -letters. 
This option can be changed within a pattern by a "(?i)" option -setting. -@G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting -of a single line of characters (even if it actually contains newlines). -The "start of line" metacharacter ("^") matches only at the start of the -string, while the "end of line" metacharacter ("$") matches only at the -end of the string, or before a terminating newline (unless -#G_REGEX_DOLLAR_ENDONLY is set). When #G_REGEX_MULTILINE is set, -the "start of line" and "end of line" constructs match immediately following -or immediately before any newline in the string, respectively, as well -as at the very start and end. This can be changed within a pattern by a -"(?m)" option setting. -@G_REGEX_DOTALL: A dot metacharater (".") in the pattern matches all -characters, including newlines. Without it, newlines are excluded. This -option can be changed within a pattern by a ("?s") option setting. -@G_REGEX_EXTENDED: Whitespace data characters in the pattern are -totally ignored except when escaped or inside a character class. -Whitespace does not include the VT character (code 11). In addition, -characters between an unescaped "#" outside a character class and -the next newline character, inclusive, are also ignored. This can be -changed within a pattern by a "(?x)" option setting. -@G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is, -it is constrained to match only at the first matching point in the string -that is being searched. This effect can also be achieved by appropriate -constructs in the pattern itself such as the "^" metacharater. -@G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern -matches only at the end of the string. Without this option, a dollar also -matches immediately before the final character if it is a newline (but -not before any other newlines). This option is ignored if -#G_REGEX_MULTILINE is set. 
-@G_REGEX_UNGREEDY: Inverts the "greediness" of the -quantifiers so that they are not greedy by default, but become greedy -if followed by "?". It can also be set by a "(?U)" option setting within -the pattern. -@G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this -flag they are considered as a raw sequence of bytes. -@G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing -parentheses in the pattern. Any opening parenthesis that is not followed -by "?" behaves as if it were followed by "?:" but named parentheses can -still be used for capturing (and they acquire numbers in the usual way). -@G_REGEX_OPTIMIZE: Optimize the regular expression. If the pattern will -be used many times, then it may be worth the effort to optimize it to -improve the speed of matches. -@G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not -be unique. This can be helpful for certain types of pattern when it is known -that only one instance of the named subpattern can ever be matched. -@G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this -option is set, the only recognized newline character is '\r'. -@G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this -option is set, the only recognized newline character is '\n'. -@G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this -option is set, the only recognized newline character sequence is '\r\n'. -@Since: 2.14 +@G_REGEX_CASELESS: +@G_REGEX_MULTILINE: +@G_REGEX_DOTALL: +@G_REGEX_EXTENDED: +@G_REGEX_ANCHORED: +@G_REGEX_DOLLAR_ENDONLY: +@G_REGEX_UNGREEDY: +@G_REGEX_RAW: +@G_REGEX_NO_AUTO_CAPTURE: +@G_REGEX_OPTIMIZE: +@G_REGEX_DUPNAMES: +@G_REGEX_NEWLINE_CR: +@G_REGEX_NEWLINE_LF: +@G_REGEX_NEWLINE_CRLF: + -Flags specifying match-time options. + -@G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is, -it is constrained to match only at the first matching point in the string -that is being searched. 
This effect can also be achieved by appropriate -constructs in the pattern itself such as the "^" metacharater. -@G_REGEX_MATCH_NOTBOL: Specifies that first character of the string is -not the beginning of a line, so the circumflex metacharacter should not -match before it. Setting this without G_REGEX_MULTILINE (at compile time) -causes circumflex never to match. This option affects only the behaviour of -the circumflex metacharacter, it does not affect "\A". -@G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is -not the end of a line, so the dollar metacharacter should not match it nor -(except in multiline mode) a newline immediately before it. Setting this -without G_REGEX_MULTILINE (at compile time) causes dollar never to match. -This option affects only the behaviour of the dollar metacharacter, it does -not affect "\Z" or "\z". -@G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid -match if this option is set. If there are alternatives in the pattern, they -are tried. If all the alternatives match the empty string, the entire match -fails. For example, if the pattern "a?b?" is applied to a string not beginning -with "a" or "b", it matches the empty string at the start of the string. -With this flag set, this match is not valid, so GRegex searches further -into the string for occurrences of "a" or "b". -@G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more -documentation on partial matching see g_match_info_is_partial_match(). -@G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when creating -a new #GRegex, setting the '\r' character as line terminator. -@G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when creating -a new #GRegex, setting the '\n' character as line terminator. -@G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when creating -a new #GRegex, setting the '\r\n' characters as line terminator. 
-@G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when creating -a new #GRegex, any newline character or character sequence is recognized. -@Since: 2.14 +@G_REGEX_MATCH_ANCHORED: +@G_REGEX_MATCH_NOTBOL: +@G_REGEX_MATCH_NOTEOL: +@G_REGEX_MATCH_NOTEMPTY: +@G_REGEX_MATCH_PARTIAL: +@G_REGEX_MATCH_NEWLINE_CR: +@G_REGEX_MATCH_NEWLINE_LF: +@G_REGEX_MATCH_NEWLINE_CRLF: +@G_REGEX_MATCH_NEWLINE_ANY: + -A GRegex is the "compiled" form of a regular expression pattern. This -structure is opaque and its fields cannot be accessed directly. + -@Since: 2.14 -Specifies the type of the function passed to g_regex_replace_eval(). -It is called for each occurance of the pattern in the string passed -to g_regex_replace_eval(), and it should append the replacement to -@result. + -@match_info: the #GMatchInfo generated by the match. -Use g_match_info_get_regex() and g_match_info_get_string() if you -need the #GRegex or the matched string. -@result: a #GString containing the new string -@user_data: user data passed to g_regex_replace_eval() -@Returns: %FALSE to continue the replacement process, %TRUE to stop it -@Since: 2.14 +@match_info: +@result: +@user_data: +@Returns: diff --git a/glib/gregex.c b/glib/gregex.c index bf4380a38..f067b0000 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -33,6 +33,66 @@ #include "pcre/pcre.h" #endif +/** + * SECTION:gregex + * @title: Perl-compatible regular expressions + * @short_description: matches strings against regular expressions + * @see_also: + * + * The g_regex_*() functions implement regular + * expression pattern matching using syntax and semantics similar to + * Perl regular expression. + * + * Some functions accept a @start_position argument, setting it differs + * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL + * in the case of a pattern that begins with any kind of lookbehind assertion. + * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" + * in the middle of words. 
("\B" matches only if the current position in the + * subject is not a word boundary.) When applied to the string "Mississipi" + * from the fourth byte, namely "issipi", it does not match, because "\B" is + * always false at the start of the subject, which is deemed to be a word + * boundary. However, if the entire string is passed, but with + * @start_position set to 4, it finds the second occurrence of "iss" because + * it is able to look behind the starting point to discover that it is + * preceded by a letter. + * + * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed + * to these functions must be encoded in UTF-8. The lengths and the positions + * inside the strings are in bytes and not in characters, so, for instance, + * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a + * single character. If you set #G_REGEX_RAW the strings can be non-valid + * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two + * bytes and two characters long. + * + * When matching a pattern, "\n" matches only against a "\n" character in + * the string, and "\r" matches only a "\r" character. To match any newline + * sequence use "\R". This particular group matches either the two-character + * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, + * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), + * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line + * separator, U+2028), or PS (paragraph separator, U+2029). + * + * The behaviour of the dot, circumflex, and dollar metacharacters is + * affected by newline characters, the default is to recognize any newline + * character (the same characters recognized by "\R").
This can be changed + * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF + * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY, + * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and + * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also + * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an + * unescaped "#" outside a character class is encountered. This indicates + * a comment that lasts until after the next newline. + * + * Creating and manipulating the same #GRegex structure from different + * threads is not a problem as #GRegex does not modify its internal + * state between creation and destruction, on the other hand #GMatchInfo + * is not threadsafe. + * + * The regular expressions low-level functionalities are obtained through + * the excellent PCRE library + * written by Philip Hazel. + */ + /* Mask of all the possible values for GRegexCompileFlags. */ #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \ G_REGEX_MULTILINE | \ diff --git a/glib/gregex.h b/glib/gregex.h index 87d4dede6..ce1b44a97 100644 --- a/glib/gregex.h +++ b/glib/gregex.h @@ -31,6 +31,89 @@ G_BEGIN_DECLS +/** + * GRegexError: + * @G_REGEX_ERROR_COMPILE: Compilation of the regular expression failed. + * @G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression failed. + * @G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement + * string. + * @G_REGEX_ERROR_MATCH: The match process failed. + * @G_REGEX_ERROR_INTERNAL: Internal error of the regular expression engine. + * Since 2.16 + * @G_REGEX_ERROR_STRAY_BACKSLASH: "\\" at end of pattern. Since 2.16 + * @G_REGEX_ERROR_MISSING_CONTROL_CHAR: "\\c" at end of pattern. Since 2.16 + * @G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: Unrecognized character follows "\\". + * Since 2.16 + * @G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: Numbers out of order in "{}" + * quantifier. Since 2.16 + * @G_REGEX_ERROR_QUANTIFIER_TOO_BIG: Number too big in "{}" quantifier. 
+ * Since 2.16 + * @G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: Missing terminating "]" for + * character class. Since 2.16 + * @G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: Invalid escape sequence + * in character class. Since 2.16 + * @G_REGEX_ERROR_RANGE_OUT_OF_ORDER: Range out of order in character class. + * Since 2.16 + * @G_REGEX_ERROR_NOTHING_TO_REPEAT: Nothing to repeat. Since 2.16 + * @G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: Unrecognized character after "(?", + * "(?<" or "(?P". Since 2.16 + * @G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: POSIX named classes are + * supported only within a class. Since 2.16 + * @G_REGEX_ERROR_UNMATCHED_PARENTHESIS: Missing terminating ")" or ")" + * without opening "(". Since 2.16 + * @G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: Reference to non-existent + * subpattern. Since 2.16 + * @G_REGEX_ERROR_UNTERMINATED_COMMENT: Missing terminating ")" after comment. + * Since 2.16 + * @G_REGEX_ERROR_EXPRESSION_TOO_LARGE: Regular expression too large. + * Since 2.16 + * @G_REGEX_ERROR_MEMORY_ERROR: Failed to get memory. Since 2.16 + * @G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: Lookbehind assertion is not + * fixed length. Since 2.16 + * @G_REGEX_ERROR_MALFORMED_CONDITION: Malformed number or name after "(?(". + * Since 2.16 + * @G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: Conditional group contains + * more than two branches. Since 2.16 + * @G_REGEX_ERROR_ASSERTION_EXPECTED: Assertion expected after "(?(". + * Since 2.16 + * @G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: Unknown POSIX class name. + * Since 2.16 + * @G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: POSIX collating + * elements are not supported. Since 2.16 + * @G_REGEX_ERROR_HEX_CODE_TOO_LARGE: Character value in "\\x{...}" sequence + * is too large. Since 2.16 + * @G_REGEX_ERROR_INVALID_CONDITION: Invalid condition "(?(0)". Since 2.16 + * @G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: \\C not allowed in + * lookbehind assertion. 
Since 2.16 + * @G_REGEX_ERROR_INFINITE_LOOP: Recursive call could loop indefinitely. + * Since 2.16 + * @G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: Missing terminator + * in subpattern name. Since 2.16 + * @G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: Two named subpatterns have + * the same name. Since 2.16 + * @G_REGEX_ERROR_MALFORMED_PROPERTY: Malformed "\\P" or "\\p" sequence. + * Since 2.16 + * @G_REGEX_ERROR_UNKNOWN_PROPERTY: Unknown property name after "\\P" or + * "\\p". Since 2.16 + * @G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: Subpattern name is too long + * (maximum 32 characters). Since 2.16 + * @G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: Too many named subpatterns (maximum + * 10,000). Since 2.16 + * @G_REGEX_ERROR_INVALID_OCTAL_VALUE: Octal value is greater than "\\377". + * Since 2.16 + * @G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: "DEFINE" group contains more + * than one branch. Since 2.16 + * @G_REGEX_ERROR_DEFINE_REPETION: Repeating a "DEFINE" group is not allowed. + * Since 2.16 + * @G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: Inconsistent newline options. + * Since 2.16 + * @G_REGEX_ERROR_MISSING_BACK_REFERENCE: "\\g" is not followed by a braced + * name or an optionally braced non-zero number. Since 2.16 + * + * Error codes returned by regular expressions functions. + * + * Since: 2.14 + */ typedef enum { G_REGEX_ERROR_COMPILE, @@ -79,10 +162,82 @@ typedef enum G_REGEX_ERROR_MISSING_BACK_REFERENCE = 157 } GRegexError; +/** + * G_REGEX_ERROR: + * + * Error domain for regular expressions. Errors in this domain will be + * from the #GRegexError enumeration. See #GError for information on + * error domains. + * + * Since: 2.14 + */ #define G_REGEX_ERROR g_regex_error_quark () GQuark g_regex_error_quark (void); +/** + * GRegexCompileFlags: + * @G_REGEX_CASELESS: Letters in the pattern match both upper- and + * lowercase letters. This option can be changed within a pattern + * by a "(?i)" option setting. 
+ * @G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting + * of a single line of characters (even if it actually contains + * newlines). The "start of line" metacharacter ("^") matches only + * at the start of the string, while the "end of line" metacharacter + * ("$") matches only at the end of the string, or before a terminating + * newline (unless #G_REGEX_DOLLAR_ENDONLY is set). When + * #G_REGEX_MULTILINE is set, the "start of line" and "end of line" + * constructs match immediately following or immediately before any + * newline in the string, respectively, as well as at the very start + * and end. This can be changed within a pattern by a "(?m)" option + * setting. + * @G_REGEX_DOTALL: A dot metacharacter (".") in the pattern matches all + * characters, including newlines. Without it, newlines are excluded. + * This option can be changed within a pattern by a "(?s)" option setting. + * @G_REGEX_EXTENDED: Whitespace data characters in the pattern are + * totally ignored except when escaped or inside a character class. + * Whitespace does not include the VT character (code 11). In addition, + * characters between an unescaped "#" outside a character class and + * the next newline character, inclusive, are also ignored. This can + * be changed within a pattern by a "(?x)" option setting. + * @G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is, + * it is constrained to match only at the first matching point in the + * string that is being searched. This effect can also be achieved by + * appropriate constructs in the pattern itself such as the "^" + * metacharacter. + * @G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern + * matches only at the end of the string. Without this option, a + * dollar also matches immediately before the final character if + * it is a newline (but not before any other newlines). This option + * is ignored if #G_REGEX_MULTILINE is set.
+ * @G_REGEX_UNGREEDY: Inverts the "greediness" of the quantifiers so that + * they are not greedy by default, but become greedy if followed by "?". + * It can also be set by a "(?U)" option setting within the pattern. + * @G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this + * flag they are considered as a raw sequence of bytes. + * @G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing + * parentheses in the pattern. Any opening parenthesis that is not + * followed by "?" behaves as if it were followed by "?:" but named + * parentheses can still be used for capturing (and they acquire numbers + * in the usual way). + * @G_REGEX_OPTIMIZE: Optimize the regular expression. If the pattern will + * be used many times, then it may be worth the effort to optimize it + * to improve the speed of matches. + * @G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not + * be unique. This can be helpful for certain types of pattern when it + * is known that only one instance of the named subpattern can ever be + * matched. + * @G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this + * option is set, the only recognized newline character is '\r'. + * @G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this + * option is set, the only recognized newline character is '\n'. + * @G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this + * option is set, the only recognized newline character sequence is '\r\n'. + * + * Flags specifying compile-time options. + * + * Since: 2.14 + */ /* Remember to update G_REGEX_COMPILE_MASK in gregex.c after * adding a new flag. 
*/ typedef enum @@ -103,6 +258,49 @@ typedef enum G_REGEX_NEWLINE_CRLF = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF } GRegexCompileFlags; +/** + * GRegexMatchFlags: + * @G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is, + * it is constrained to match only at the first matching point in the + * string that is being searched. This effect can also be achieved by + * appropriate constructs in the pattern itself such as the "^" + * metacharacter. + * @G_REGEX_MATCH_NOTBOL: Specifies that the first character of the string is + * not the beginning of a line, so the circumflex metacharacter should + * not match before it. Setting this without #G_REGEX_MULTILINE (at + * compile time) causes circumflex never to match. This option affects + * only the behaviour of the circumflex metacharacter, it does not + * affect "\A". + * @G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is + * not the end of a line, so the dollar metacharacter should not match + * it nor (except in multiline mode) a newline immediately before it. + * Setting this without #G_REGEX_MULTILINE (at compile time) causes + * dollar never to match. This option affects only the behaviour of + * the dollar metacharacter, it does not affect "\Z" or "\z". + * @G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid + * match if this option is set. If there are alternatives in the pattern, + * they are tried. If all the alternatives match the empty string, the + * entire match fails. For example, if the pattern "a?b?" is applied to + * a string not beginning with "a" or "b", it matches the empty string + * at the start of the string. With this flag set, this match is not + * valid, so GRegex searches further into the string for occurrences + * of "a" or "b". + * @G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more + * documentation on partial matching see g_match_info_is_partial_match().
+ * @G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when + * creating a new #GRegex, setting the '\r' character as line terminator. + * @G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when + * creating a new #GRegex, setting the '\n' character as line terminator. + * @G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when + * creating a new #GRegex, setting the '\r\n' characters as line terminator. + * @G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when + * creating a new #GRegex, any newline character or character sequence + * is recognized. + * + * Flags specifying match-time options. + * + * Since: 2.14 + */ /* Remember to update G_REGEX_MATCH_MASK in gregex.c after * adding a new flag. */ typedef enum @@ -118,9 +316,36 @@ typedef enum G_REGEX_MATCH_NEWLINE_ANY = 1 << 22 } GRegexMatchFlags; +/** + * GRegex: + * + * A GRegex is the "compiled" form of a regular expression pattern. This + * structure is opaque and its fields cannot be accessed directly. + * + * Since: 2.14 + */ typedef struct _GRegex GRegex; + + typedef struct _GMatchInfo GMatchInfo; +/** + * GRegexEvalCallback: + * @match_info: the #GMatchInfo generated by the match. + * Use g_match_info_get_regex() and g_match_info_get_string() if you + * need the #GRegex or the matched string. + * @result: a #GString containing the new string + * @user_data: user data passed to g_regex_replace_eval() + * + * Specifies the type of the function passed to g_regex_replace_eval(). + * It is called for each occurrence of the pattern in the string passed + * to g_regex_replace_eval(), and it should append the replacement to + * @result. + * + * Returns: %FALSE to continue the replacement process, %TRUE to stop it + * + * Since: 2.14 + */ typedef gboolean (*GRegexEvalCallback) (const GMatchInfo *match_info, GString *result, gpointer user_data);