mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-26 05:56:14 +01:00
Move GRegex docs inline
This commit is contained in:
parent
c3e5b3cca8
commit
733d209b14
@ -2,69 +2,11 @@
|
||||
Perl-compatible regular expressions
|
||||
|
||||
<!-- ##### SECTION Short_Description ##### -->
|
||||
matches strings against regular expressions
|
||||
|
||||
|
||||
<!-- ##### SECTION Long_Description ##### -->
|
||||
<para>
|
||||
The <function>g_regex_*()</function> functions implement regular
|
||||
expression pattern matching using syntax and semantics similar to
|
||||
Perl regular expression.
|
||||
</para>
|
||||
<para>
|
||||
Some functions accept a <parameter>start_position</parameter> argument,
|
||||
setting it differs from just passing over a shortened string and setting
|
||||
#G_REGEX_MATCH_NOTBOL in the case of a pattern that begins with any kind
|
||||
of lookbehind assertion.
|
||||
For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
|
||||
in the middle of words. ("\B" matches only if the current position in the
|
||||
subject is not a word boundary.) When applied to the string "Mississipi"
|
||||
from the fourth byte, namely "issipi", it does not match, because "\B" is
|
||||
always false at the start of the subject, which is deemed to be a word
|
||||
boundary. However, if the entire string is passed , but with
|
||||
<parameter>start_position</parameter> set to 4, it finds the second
|
||||
occurrence of "iss" because it is able to look behind the starting point
|
||||
to discover that it is preceded by a letter.
|
||||
</para>
|
||||
<para>
|
||||
Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
|
||||
to these functions must be encoded in UTF-8. The lengths and the positions
|
||||
inside the strings are in bytes and not in characters, so, for instance,
|
||||
"\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a single
|
||||
character. If you set #G_REGEX_RAW the strings can be non-valid UTF-8
|
||||
strings and a byte is treated as a character, so "\xc3\xa0" is two bytes
|
||||
and two characters long.
|
||||
</para>
|
||||
<para>
|
||||
When matching a pattern, "\n" matches only against a "\n" character in the
|
||||
string, and "\r" matches only a "\r" character. To match any newline sequence
|
||||
use "\R". This particular group matches either the two-character sequence
|
||||
CR + LF ("\r\n"), or one of the single characters LF (linefeed, U+000A, "\n"), VT
|
||||
(vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), CR (carriage return,
|
||||
U+000D, "\r"), NEL (next line, U+0085), LS (line separator, U+2028), or PS
|
||||
(paragraph separator, U+2029).
|
||||
</para>
|
||||
<para>
|
||||
The behaviour of the dot, circumflex, and dollar metacharacters are affected by
|
||||
newline characters, the default is to recognize any newline character (the same
|
||||
characters recognized by "\R"). This can be changed with #G_REGEX_NEWLINE_CR,
|
||||
#G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF compile options,
|
||||
and with #G_REGEX_MATCH_NEWLINE_ANY, #G_REGEX_MATCH_NEWLINE_CR,
|
||||
#G_REGEX_MATCH_NEWLINE_LF and #G_REGEX_MATCH_NEWLINE_CRLF match options.
|
||||
These settings are also relevant when compiling a pattern if
|
||||
#G_REGEX_EXTENDED is set, and an unescaped "#" outside a character class is
|
||||
encountered. This indicates a comment that lasts until after the next
|
||||
newline.
|
||||
</para>
|
||||
<para>
|
||||
Creating and manipulating the same #GRegex structure from different
|
||||
threads is not a problem as #GRegex does not modify its internal
|
||||
state between creation and destruction, on the other hand #GMatchInfo is
|
||||
not threadsafe.
|
||||
</para>
|
||||
<para>
|
||||
The regular expressions low level functionalities are obtained through
|
||||
the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
|
||||
written by Philip Hazel.
|
||||
|
||||
</para>
|
||||
|
||||
<!-- ##### SECTION See_Also ##### -->
|
||||
@ -80,184 +22,112 @@ written by Philip Hazel.
|
||||
|
||||
<!-- ##### ENUM GRegexError ##### -->
|
||||
<para>
|
||||
Error codes returned by regular expressions functions.
|
||||
|
||||
</para>
|
||||
|
||||
@G_REGEX_ERROR_COMPILE: Compilation of the regular expression failed.
|
||||
@G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression failed.
|
||||
@G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement string.
|
||||
@G_REGEX_ERROR_MATCH: The match process failed.
|
||||
@G_REGEX_ERROR_INTERNAL: Internal error of the regular expression engine. Since 2.16
|
||||
@G_REGEX_ERROR_STRAY_BACKSLASH: "\\" at end of pattern. Since 2.16
|
||||
@G_REGEX_ERROR_MISSING_CONTROL_CHAR: "\\c" at end of pattern. Since 2.16
|
||||
@G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: Unrecognized character follows "\\". Since 2.16
|
||||
@G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: Numbers out of order in "{}" quantifier. Since 2.16
|
||||
@G_REGEX_ERROR_QUANTIFIER_TOO_BIG: Number too big in "{}" quantifier. Since 2.16
|
||||
@G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: Missing terminating "]" for character class. Since 2.16
|
||||
@G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: Invalid escape sequence in character class. Since 2.16
|
||||
@G_REGEX_ERROR_RANGE_OUT_OF_ORDER: Range out of order in character class. Since 2.16
|
||||
@G_REGEX_ERROR_NOTHING_TO_REPEAT: Nothing to repeat. Since 2.16
|
||||
@G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: Unrecognized character after "(?", "(?<" or "(?P". Since 2.16
|
||||
@G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: POSIX named classes are supported only within a class. Since 2.16
|
||||
@G_REGEX_ERROR_UNMATCHED_PARENTHESIS: Missing terminating ")" or ")" without opening "(". Since 2.16
|
||||
@G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: Reference to non-existent subpattern. Since 2.16
|
||||
@G_REGEX_ERROR_UNTERMINATED_COMMENT: Missing terminating ")" after comment. Since 2.16
|
||||
@G_REGEX_ERROR_EXPRESSION_TOO_LARGE: Regular expression too large. Since 2.16
|
||||
@G_REGEX_ERROR_MEMORY_ERROR: Failed to get memory. Since 2.16
|
||||
@G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: Lookbehind assertion is not fixed length. Since 2.16
|
||||
@G_REGEX_ERROR_MALFORMED_CONDITION: Malformed number or name after "(?(". Since 2.16
|
||||
@G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: Conditional group contains more than two branches. Since 2.16
|
||||
@G_REGEX_ERROR_ASSERTION_EXPECTED: Assertion expected after "(?(". Since 2.16
|
||||
@G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: Unknown POSIX class name. Since 2.16
|
||||
@G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: POSIX collating elements are not supported. Since 2.16
|
||||
@G_REGEX_ERROR_HEX_CODE_TOO_LARGE: Character value in "\\x{...}" sequence is too large. Since 2.16
|
||||
@G_REGEX_ERROR_INVALID_CONDITION: Invalid condition "(?(0)". Since 2.16
|
||||
@G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: \\C not allowed in lookbehind assertion. Since 2.16
|
||||
@G_REGEX_ERROR_INFINITE_LOOP: Recursive call could loop indefinitely. Since 2.16
|
||||
@G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: Missing terminator in subpattern name. Since 2.16
|
||||
@G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: Two named subpatterns have the same name. Since 2.16
|
||||
@G_REGEX_ERROR_MALFORMED_PROPERTY: Malformed "\\P" or "\\p" sequence. Since 2.16
|
||||
@G_REGEX_ERROR_UNKNOWN_PROPERTY: Unknown property name after "\\P" or "\\p". Since 2.16
|
||||
@G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: Subpattern name is too long (maximum 32 characters). Since 2.16
|
||||
@G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: Too many named subpatterns (maximum 10,000). Since 2.16
|
||||
@G_REGEX_ERROR_INVALID_OCTAL_VALUE: Octal value is greater than "\\377". Since 2.16
|
||||
@G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: "DEFINE" group contains more than one branch. Since 2.16
|
||||
@G_REGEX_ERROR_DEFINE_REPETION: Repeating a "DEFINE" group is not allowed. Since 2.16
|
||||
@G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: Inconsistent newline options. Since 2.16
|
||||
@G_REGEX_ERROR_MISSING_BACK_REFERENCE: "\\g" is not followed by a braced name or an
|
||||
optionally braced non-zero number. Since 2.16
|
||||
@Since: 2.14
|
||||
@G_REGEX_ERROR_COMPILE:
|
||||
@G_REGEX_ERROR_OPTIMIZE:
|
||||
@G_REGEX_ERROR_REPLACE:
|
||||
@G_REGEX_ERROR_MATCH:
|
||||
@G_REGEX_ERROR_INTERNAL:
|
||||
@G_REGEX_ERROR_STRAY_BACKSLASH:
|
||||
@G_REGEX_ERROR_MISSING_CONTROL_CHAR:
|
||||
@G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
|
||||
@G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
|
||||
@G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
|
||||
@G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
|
||||
@G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
|
||||
@G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
|
||||
@G_REGEX_ERROR_NOTHING_TO_REPEAT:
|
||||
@G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
|
||||
@G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
|
||||
@G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
|
||||
@G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
|
||||
@G_REGEX_ERROR_UNTERMINATED_COMMENT:
|
||||
@G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
|
||||
@G_REGEX_ERROR_MEMORY_ERROR:
|
||||
@G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
|
||||
@G_REGEX_ERROR_MALFORMED_CONDITION:
|
||||
@G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
|
||||
@G_REGEX_ERROR_ASSERTION_EXPECTED:
|
||||
@G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
|
||||
@G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
|
||||
@G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
|
||||
@G_REGEX_ERROR_INVALID_CONDITION:
|
||||
@G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
|
||||
@G_REGEX_ERROR_INFINITE_LOOP:
|
||||
@G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
|
||||
@G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
|
||||
@G_REGEX_ERROR_MALFORMED_PROPERTY:
|
||||
@G_REGEX_ERROR_UNKNOWN_PROPERTY:
|
||||
@G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
|
||||
@G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
|
||||
@G_REGEX_ERROR_INVALID_OCTAL_VALUE:
|
||||
@G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
|
||||
@G_REGEX_ERROR_DEFINE_REPETION:
|
||||
@G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
|
||||
@G_REGEX_ERROR_MISSING_BACK_REFERENCE:
|
||||
|
||||
<!-- ##### MACRO G_REGEX_ERROR ##### -->
|
||||
<para>
|
||||
Error domain for regular expressions. Errors in this domain will be from the #GRegexError enumeration. See #GError for information on error domains.
|
||||
|
||||
</para>
|
||||
|
||||
@Since: 2.14
|
||||
@Since:
|
||||
|
||||
|
||||
<!-- ##### ENUM GRegexCompileFlags ##### -->
|
||||
<para>
|
||||
Flags specifying compile-time options.
|
||||
|
||||
</para>
|
||||
|
||||
@G_REGEX_CASELESS: Letters in the pattern match both upper and lower case
|
||||
letters. This option can be changed within a pattern by a "(?i)" option
|
||||
setting.
|
||||
@G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting
|
||||
of a single line of characters (even if it actually contains newlines).
|
||||
The "start of line" metacharacter ("^") matches only at the start of the
|
||||
string, while the "end of line" metacharacter ("$") matches only at the
|
||||
end of the string, or before a terminating newline (unless
|
||||
#G_REGEX_DOLLAR_ENDONLY is set). When #G_REGEX_MULTILINE is set,
|
||||
the "start of line" and "end of line" constructs match immediately following
|
||||
or immediately before any newline in the string, respectively, as well
|
||||
as at the very start and end. This can be changed within a pattern by a
|
||||
"(?m)" option setting.
|
||||
@G_REGEX_DOTALL: A dot metacharater (".") in the pattern matches all
|
||||
characters, including newlines. Without it, newlines are excluded. This
|
||||
option can be changed within a pattern by a ("?s") option setting.
|
||||
@G_REGEX_EXTENDED: Whitespace data characters in the pattern are
|
||||
totally ignored except when escaped or inside a character class.
|
||||
Whitespace does not include the VT character (code 11). In addition,
|
||||
characters between an unescaped "#" outside a character class and
|
||||
the next newline character, inclusive, are also ignored. This can be
|
||||
changed within a pattern by a "(?x)" option setting.
|
||||
@G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is,
|
||||
it is constrained to match only at the first matching point in the string
|
||||
that is being searched. This effect can also be achieved by appropriate
|
||||
constructs in the pattern itself such as the "^" metacharater.
|
||||
@G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern
|
||||
matches only at the end of the string. Without this option, a dollar also
|
||||
matches immediately before the final character if it is a newline (but
|
||||
not before any other newlines). This option is ignored if
|
||||
#G_REGEX_MULTILINE is set.
|
||||
@G_REGEX_UNGREEDY: Inverts the "greediness" of the
|
||||
quantifiers so that they are not greedy by default, but become greedy
|
||||
if followed by "?". It can also be set by a "(?U)" option setting within
|
||||
the pattern.
|
||||
@G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this
|
||||
flag they are considered as a raw sequence of bytes.
|
||||
@G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing
|
||||
parentheses in the pattern. Any opening parenthesis that is not followed
|
||||
by "?" behaves as if it were followed by "?:" but named parentheses can
|
||||
still be used for capturing (and they acquire numbers in the usual way).
|
||||
@G_REGEX_OPTIMIZE: Optimize the regular expression. If the pattern will
|
||||
be used many times, then it may be worth the effort to optimize it to
|
||||
improve the speed of matches.
|
||||
@G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not
|
||||
be unique. This can be helpful for certain types of pattern when it is known
|
||||
that only one instance of the named subpattern can ever be matched.
|
||||
@G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this
|
||||
option is set, the only recognized newline character is '\r'.
|
||||
@G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this
|
||||
option is set, the only recognized newline character is '\n'.
|
||||
@G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this
|
||||
option is set, the only recognized newline character sequence is '\r\n'.
|
||||
@Since: 2.14
|
||||
@G_REGEX_CASELESS:
|
||||
@G_REGEX_MULTILINE:
|
||||
@G_REGEX_DOTALL:
|
||||
@G_REGEX_EXTENDED:
|
||||
@G_REGEX_ANCHORED:
|
||||
@G_REGEX_DOLLAR_ENDONLY:
|
||||
@G_REGEX_UNGREEDY:
|
||||
@G_REGEX_RAW:
|
||||
@G_REGEX_NO_AUTO_CAPTURE:
|
||||
@G_REGEX_OPTIMIZE:
|
||||
@G_REGEX_DUPNAMES:
|
||||
@G_REGEX_NEWLINE_CR:
|
||||
@G_REGEX_NEWLINE_LF:
|
||||
@G_REGEX_NEWLINE_CRLF:
|
||||
|
||||
|
||||
<!-- ##### ENUM GRegexMatchFlags ##### -->
|
||||
<para>
|
||||
Flags specifying match-time options.
|
||||
|
||||
</para>
|
||||
|
||||
@G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is,
|
||||
it is constrained to match only at the first matching point in the string
|
||||
that is being searched. This effect can also be achieved by appropriate
|
||||
constructs in the pattern itself such as the "^" metacharater.
|
||||
@G_REGEX_MATCH_NOTBOL: Specifies that first character of the string is
|
||||
not the beginning of a line, so the circumflex metacharacter should not
|
||||
match before it. Setting this without G_REGEX_MULTILINE (at compile time)
|
||||
causes circumflex never to match. This option affects only the behaviour of
|
||||
the circumflex metacharacter, it does not affect "\A".
|
||||
@G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is
|
||||
not the end of a line, so the dollar metacharacter should not match it nor
|
||||
(except in multiline mode) a newline immediately before it. Setting this
|
||||
without G_REGEX_MULTILINE (at compile time) causes dollar never to match.
|
||||
This option affects only the behaviour of the dollar metacharacter, it does
|
||||
not affect "\Z" or "\z".
|
||||
@G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid
|
||||
match if this option is set. If there are alternatives in the pattern, they
|
||||
are tried. If all the alternatives match the empty string, the entire match
|
||||
fails. For example, if the pattern "a?b?" is applied to a string not beginning
|
||||
with "a" or "b", it matches the empty string at the start of the string.
|
||||
With this flag set, this match is not valid, so GRegex searches further
|
||||
into the string for occurrences of "a" or "b".
|
||||
@G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more
|
||||
documentation on partial matching see g_match_info_is_partial_match().
|
||||
@G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when creating
|
||||
a new #GRegex, setting the '\r' character as line terminator.
|
||||
@G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when creating
|
||||
a new #GRegex, setting the '\n' character as line terminator.
|
||||
@G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when creating
|
||||
a new #GRegex, setting the '\r\n' characters as line terminator.
|
||||
@G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when creating
|
||||
a new #GRegex, any newline character or character sequence is recognized.
|
||||
@Since: 2.14
|
||||
@G_REGEX_MATCH_ANCHORED:
|
||||
@G_REGEX_MATCH_NOTBOL:
|
||||
@G_REGEX_MATCH_NOTEOL:
|
||||
@G_REGEX_MATCH_NOTEMPTY:
|
||||
@G_REGEX_MATCH_PARTIAL:
|
||||
@G_REGEX_MATCH_NEWLINE_CR:
|
||||
@G_REGEX_MATCH_NEWLINE_LF:
|
||||
@G_REGEX_MATCH_NEWLINE_CRLF:
|
||||
@G_REGEX_MATCH_NEWLINE_ANY:
|
||||
|
||||
|
||||
<!-- ##### STRUCT GRegex ##### -->
|
||||
<para>
|
||||
A GRegex is the "compiled" form of a regular expression pattern. This
|
||||
structure is opaque and its fields cannot be accessed directly.
|
||||
|
||||
</para>
|
||||
|
||||
@Since: 2.14
|
||||
|
||||
<!-- ##### USER_FUNCTION GRegexEvalCallback ##### -->
|
||||
<para>
|
||||
Specifies the type of the function passed to g_regex_replace_eval().
|
||||
It is called for each occurance of the pattern in the string passed
|
||||
to g_regex_replace_eval(), and it should append the replacement to
|
||||
@result.
|
||||
|
||||
</para>
|
||||
|
||||
@match_info: the #GMatchInfo generated by the match.
|
||||
Use g_match_info_get_regex() and g_match_info_get_string() if you
|
||||
need the #GRegex or the matched string.
|
||||
@result: a #GString containing the new string
|
||||
@user_data: user data passed to g_regex_replace_eval()
|
||||
@Returns: %FALSE to continue the replacement process, %TRUE to stop it
|
||||
@Since: 2.14
|
||||
@match_info:
|
||||
@result:
|
||||
@user_data:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_regex_new ##### -->
|
||||
|
@ -33,6 +33,66 @@
|
||||
#include "pcre/pcre.h"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* SECTION:gregex
|
||||
* @title: Perl-compatible regular expressions
|
||||
* @short_description: matches strings against regular expressions
|
||||
* @see_also: <xref linkend="glib-regex-syntax">
|
||||
*
|
||||
* The <function>g_regex_*()</function> functions implement regular
|
||||
* expression pattern matching using syntax and semantics similar to
|
||||
* Perl regular expression.
|
||||
*
|
||||
* Some functions accept a @start_position argument, setting it differs
|
||||
* from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
|
||||
* in the case of a pattern that begins with any kind of lookbehind assertion.
|
||||
* For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
|
||||
* in the middle of words. ("\B" matches only if the current position in the
|
||||
* subject is not a word boundary.) When applied to the string "Mississipi"
|
||||
* from the fourth byte, namely "issipi", it does not match, because "\B" is
|
||||
* always false at the start of the subject, which is deemed to be a word
|
||||
* boundary. However, if the entire string is passed , but with
|
||||
* @start_position set to 4, it finds the second occurrence of "iss" because
|
||||
* it is able to look behind the starting point to discover that it is
|
||||
* preceded by a letter.
|
||||
*
|
||||
* Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
|
||||
* to these functions must be encoded in UTF-8. The lengths and the positions
|
||||
* inside the strings are in bytes and not in characters, so, for instance,
|
||||
* "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a
|
||||
* single character. If you set #G_REGEX_RAW the strings can be non-valid
|
||||
* UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
|
||||
* bytes and two characters long.
|
||||
*
|
||||
* When matching a pattern, "\n" matches only against a "\n" character in
|
||||
* the string, and "\r" matches only a "\r" character. To match any newline
|
||||
* sequence use "\R". This particular group matches either the two-character
|
||||
* sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
|
||||
* U+000A, "\n"), VT vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
|
||||
* CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
|
||||
* separator, U+2028), or PS (paragraph separator, U+2029).
|
||||
*
|
||||
* The behaviour of the dot, circumflex, and dollar metacharacters are
|
||||
* affected by newline characters, the default is to recognize any newline
|
||||
* character (the same characters recognized by "\R"). This can be changed
|
||||
* with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
|
||||
* compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
|
||||
* #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
|
||||
* #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
|
||||
* relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
|
||||
* unescaped "#" outside a character class is encountered. This indicates
|
||||
* a comment that lasts until after the next newline.
|
||||
*
|
||||
* Creating and manipulating the same #GRegex structure from different
|
||||
* threads is not a problem as #GRegex does not modify its internal
|
||||
* state between creation and destruction, on the other hand #GMatchInfo
|
||||
* is not threadsafe.
|
||||
*
|
||||
* The regular expressions low-level functionalities are obtained through
|
||||
* the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library
|
||||
* written by Philip Hazel.
|
||||
*/
|
||||
|
||||
/* Mask of all the possible values for GRegexCompileFlags. */
|
||||
#define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \
|
||||
G_REGEX_MULTILINE | \
|
||||
|
225
glib/gregex.h
225
glib/gregex.h
@ -31,6 +31,89 @@
|
||||
|
||||
G_BEGIN_DECLS
|
||||
|
||||
/**
|
||||
* GRegexError:
|
||||
* @G_REGEX_ERROR_COMPILE: Compilation of the regular expression failed.
|
||||
* @G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression failed.
|
||||
* @G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement
|
||||
* string.
|
||||
* @G_REGEX_ERROR_MATCH: The match process failed.
|
||||
* @G_REGEX_ERROR_INTERNAL: Internal error of the regular expression engine.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_STRAY_BACKSLASH: "\\" at end of pattern. Since 2.16
|
||||
* @G_REGEX_ERROR_MISSING_CONTROL_CHAR: "\\c" at end of pattern. Since 2.16
|
||||
* @G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: Unrecognized character follows "\\".
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: Numbers out of order in "{}"
|
||||
* quantifier. Since 2.16
|
||||
* @G_REGEX_ERROR_QUANTIFIER_TOO_BIG: Number too big in "{}" quantifier.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: Missing terminating "]" for
|
||||
* character class. Since 2.16
|
||||
* @G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: Invalid escape sequence
|
||||
* in character class. Since 2.16
|
||||
* @G_REGEX_ERROR_RANGE_OUT_OF_ORDER: Range out of order in character class.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_NOTHING_TO_REPEAT: Nothing to repeat. Since 2.16
|
||||
* @G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: Unrecognized character after "(?",
|
||||
* "(?<" or "(?P". Since 2.16
|
||||
* @G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: POSIX named classes are
|
||||
* supported only within a class. Since 2.16
|
||||
* @G_REGEX_ERROR_UNMATCHED_PARENTHESIS: Missing terminating ")" or ")"
|
||||
* without opening "(". Since 2.16
|
||||
* @G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: Reference to non-existent
|
||||
* subpattern. Since 2.16
|
||||
* @G_REGEX_ERROR_UNTERMINATED_COMMENT: Missing terminating ")" after comment.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_EXPRESSION_TOO_LARGE: Regular expression too large.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_MEMORY_ERROR: Failed to get memory. Since 2.16
|
||||
* @G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: Lookbehind assertion is not
|
||||
* fixed length. Since 2.16
|
||||
* @G_REGEX_ERROR_MALFORMED_CONDITION: Malformed number or name after "(?(".
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: Conditional group contains
|
||||
* more than two branches. Since 2.16
|
||||
* @G_REGEX_ERROR_ASSERTION_EXPECTED: Assertion expected after "(?(".
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: Unknown POSIX class name.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: POSIX collating
|
||||
* elements are not supported. Since 2.16
|
||||
* @G_REGEX_ERROR_HEX_CODE_TOO_LARGE: Character value in "\\x{...}" sequence
|
||||
* is too large. Since 2.16
|
||||
* @G_REGEX_ERROR_INVALID_CONDITION: Invalid condition "(?(0)". Since 2.16
|
||||
* @G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: \\C not allowed in
|
||||
* lookbehind assertion. Since 2.16
|
||||
* @G_REGEX_ERROR_INFINITE_LOOP: Recursive call could loop indefinitely.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: Missing terminator
|
||||
* in subpattern name. Since 2.16
|
||||
* @G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: Two named subpatterns have
|
||||
* the same name. Since 2.16
|
||||
* @G_REGEX_ERROR_MALFORMED_PROPERTY: Malformed "\\P" or "\\p" sequence.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_UNKNOWN_PROPERTY: Unknown property name after "\\P" or
|
||||
* "\\p". Since 2.16
|
||||
* @G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: Subpattern name is too long
|
||||
* (maximum 32 characters). Since 2.16
|
||||
* @G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: Too many named subpatterns (maximum
|
||||
* 10,000). Since 2.16
|
||||
* @G_REGEX_ERROR_INVALID_OCTAL_VALUE: Octal value is greater than "\\377".
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: "DEFINE" group contains more
|
||||
* than one branch. Since 2.16
|
||||
* @G_REGEX_ERROR_DEFINE_REPETION: Repeating a "DEFINE" group is not allowed.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: Inconsistent newline options.
|
||||
* Since 2.16
|
||||
* @G_REGEX_ERROR_MISSING_BACK_REFERENCE: "\\g" is not followed by a braced
|
||||
* name or an optionally braced non-zero number. Since 2.16
|
||||
*
|
||||
* Error codes returned by regular expressions functions.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
G_REGEX_ERROR_COMPILE,
|
||||
@ -79,10 +162,82 @@ typedef enum
|
||||
G_REGEX_ERROR_MISSING_BACK_REFERENCE = 157
|
||||
} GRegexError;
|
||||
|
||||
/**
|
||||
* G_REGEX_ERROR:
|
||||
*
|
||||
* Error domain for regular expressions. Errors in this domain will be
|
||||
* from the #GRegexError enumeration. See #GError for information on
|
||||
* error domains.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
#define G_REGEX_ERROR g_regex_error_quark ()
|
||||
|
||||
GQuark g_regex_error_quark (void);
|
||||
|
||||
/**
|
||||
* GRegexCompileFlags:
|
||||
* @G_REGEX_CASELESS: Letters in the pattern match both upper- and
|
||||
* lowercase letters. This option can be changed within a pattern
|
||||
* by a "(?i)" option setting.
|
||||
* @G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting
|
||||
* of a single line of characters (even if it actually contains
|
||||
* newlines). The "start of line" metacharacter ("^") matches only
|
||||
* at the start of the string, while the "end of line" metacharacter
|
||||
* ("$") matches only at the end of the string, or before a terminating
|
||||
* newline (unless #G_REGEX_DOLLAR_ENDONLY is set). When
|
||||
* #G_REGEX_MULTILINE is set, the "start of line" and "end of line"
|
||||
* constructs match immediately following or immediately before any
|
||||
* newline in the string, respectively, as well as at the very start
|
||||
* and end. This can be changed within a pattern by a "(?m)" option
|
||||
* setting.
|
||||
* @G_REGEX_DOTALL: A dot metacharater (".") in the pattern matches all
|
||||
* characters, including newlines. Without it, newlines are excluded.
|
||||
* This option can be changed within a pattern by a ("?s") option setting.
|
||||
* @G_REGEX_EXTENDED: Whitespace data characters in the pattern are
|
||||
* totally ignored except when escaped or inside a character class.
|
||||
* Whitespace does not include the VT character (code 11). In addition,
|
||||
* characters between an unescaped "#" outside a character class and
|
||||
* the next newline character, inclusive, are also ignored. This can
|
||||
* be changed within a pattern by a "(?x)" option setting.
|
||||
* @G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is,
|
||||
* it is constrained to match only at the first matching point in the
|
||||
* string that is being searched. This effect can also be achieved by
|
||||
* appropriate constructs in the pattern itself such as the "^"
|
||||
* metacharater.
|
||||
* @G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern
|
||||
* matches only at the end of the string. Without this option, a
|
||||
* dollar also matches immediately before the final character if
|
||||
* it is a newline (but not before any other newlines). This option
|
||||
* is ignored if #G_REGEX_MULTILINE is set.
|
||||
* @G_REGEX_UNGREEDY: Inverts the "greediness" of the quantifiers so that
|
||||
* they are not greedy by default, but become greedy if followed by "?".
|
||||
* It can also be set by a "(?U)" option setting within the pattern.
|
||||
* @G_REGEX_RAW: Usually strings must be valid UTF-8 strings, using this
|
||||
* flag they are considered as a raw sequence of bytes.
|
||||
* @G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing
|
||||
* parentheses in the pattern. Any opening parenthesis that is not
|
||||
* followed by "?" behaves as if it were followed by "?:" but named
|
||||
* parentheses can still be used for capturing (and they acquire numbers
|
||||
* in the usual way).
|
||||
* @G_REGEX_OPTIMIZE: Optimize the regular expression. If the pattern will
|
||||
* be used many times, then it may be worth the effort to optimize it
|
||||
* to improve the speed of matches.
|
||||
* @G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not
|
||||
* be unique. This can be helpful for certain types of pattern when it
|
||||
* is known that only one instance of the named subpattern can ever be
|
||||
* matched.
|
||||
* @G_REGEX_NEWLINE_CR: Usually any newline character is recognized, if this
|
||||
* option is set, the only recognized newline character is '\r'.
|
||||
* @G_REGEX_NEWLINE_LF: Usually any newline character is recognized, if this
|
||||
* option is set, the only recognized newline character is '\n'.
|
||||
* @G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized, if this
|
||||
* option is set, the only recognized newline character sequence is '\r\n'.
|
||||
*
|
||||
* Flags specifying compile-time options.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
/* Remember to update G_REGEX_COMPILE_MASK in gregex.c after
|
||||
* adding a new flag. */
|
||||
typedef enum
|
||||
@ -103,6 +258,49 @@ typedef enum
|
||||
G_REGEX_NEWLINE_CRLF = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF
|
||||
} GRegexCompileFlags;
|
||||
|
||||
/**
|
||||
* GRegexMatchFlags:
|
||||
* @G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is,
|
||||
* it is constrained to match only at the first matching point in the
|
||||
* string that is being searched. This effect can also be achieved by
|
||||
* appropriate constructs in the pattern itself such as the "^"
|
||||
* metacharater.
|
||||
* @G_REGEX_MATCH_NOTBOL: Specifies that first character of the string is
|
||||
* not the beginning of a line, so the circumflex metacharacter should
|
||||
* not match before it. Setting this without #G_REGEX_MULTILINE (at
|
||||
* compile time) causes circumflex never to match. This option affects
|
||||
* only the behaviour of the circumflex metacharacter, it does not
|
||||
* affect "\A".
|
||||
* @G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is
|
||||
* not the end of a line, so the dollar metacharacter should not match
|
||||
* it nor (except in multiline mode) a newline immediately before it.
|
||||
* Setting this without #G_REGEX_MULTILINE (at compile time) causes
|
||||
* dollar never to match. This option affects only the behaviour of
|
||||
* the dollar metacharacter, it does not affect "\Z" or "\z".
|
||||
* @G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid
|
||||
* match if this option is set. If there are alternatives in the pattern,
|
||||
* they are tried. If all the alternatives match the empty string, the
|
||||
* entire match fails. For example, if the pattern "a?b?" is applied to
|
||||
* a string not beginning with "a" or "b", it matches the empty string
|
||||
* at the start of the string. With this flag set, this match is not
|
||||
* valid, so GRegex searches further into the string for occurrences
|
||||
* of "a" or "b".
|
||||
* @G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more
|
||||
* documentation on partial matching see g_match_info_is_partial_match().
|
||||
* @G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when
|
||||
* creating a new #GRegex, setting the '\r' character as line terminator.
|
||||
* @G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when
|
||||
* creating a new #GRegex, setting the '\n' character as line terminator.
|
||||
* @G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when
|
||||
* creating a new #GRegex, setting the '\r\n' characters as line terminator.
|
||||
* @G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when
|
||||
* creating a new #GRegex, any newline character or character sequence
|
||||
* is recognized.
|
||||
*
|
||||
* Flags specifying match-time options.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
/* Remember to update G_REGEX_MATCH_MASK in gregex.c after
|
||||
* adding a new flag. */
|
||||
typedef enum
|
||||
@ -118,9 +316,36 @@ typedef enum
|
||||
G_REGEX_MATCH_NEWLINE_ANY = 1 << 22
|
||||
} GRegexMatchFlags;
|
||||
|
||||
/**
|
||||
* GRegex:
|
||||
*
|
||||
* A GRegex is the "compiled" form of a regular expression pattern. This
|
||||
* structure is opaque and its fields cannot be accessed directly.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
typedef struct _GRegex GRegex;
|
||||
|
||||
|
||||
typedef struct _GMatchInfo GMatchInfo;
|
||||
|
||||
/**
|
||||
* GRegexEvalCallback:
|
||||
* @match_info: the #GMatchInfo generated by the match.
|
||||
* Use g_match_info_get_regex() and g_match_info_get_string() if you
|
||||
* need the #GRegex or the matched string.
|
||||
* @result: a #GString containing the new string
|
||||
* @user_data: user data passed to g_regex_replace_eval()
|
||||
*
|
||||
* Specifies the type of the function passed to g_regex_replace_eval().
|
||||
* It is called for each occurance of the pattern in the string passed
|
||||
* to g_regex_replace_eval(), and it should append the replacement to
|
||||
* @result.
|
||||
*
|
||||
* Returns: %FALSE to continue the replacement process, %TRUE to stop it
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
typedef gboolean (*GRegexEvalCallback) (const GMatchInfo *match_info,
|
||||
GString *result,
|
||||
gpointer user_data);
|
||||
|
Loading…
Reference in New Issue
Block a user