mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-03-15 20:25:12 +01:00
Move Unicode docs inline, ditch template
This commit is contained in:
parent
2358616d6a
commit
09e2d2a61c
1
docs/reference/glib/tmpl/.gitignore
vendored
1
docs/reference/glib/tmpl/.gitignore
vendored
@ -43,4 +43,5 @@ trees-binary.sgml
|
||||
trees-nary.sgml
|
||||
timers.sgml
|
||||
timezone.sgml
|
||||
unicode.sgml
|
||||
version.sgml
|
||||
|
@ -1,908 +0,0 @@
|
||||
<!-- ##### SECTION Title ##### -->
|
||||
Unicode Manipulation
|
||||
|
||||
<!-- ##### SECTION Short_Description ##### -->
|
||||
functions operating on Unicode characters and UTF-8 strings
|
||||
|
||||
<!-- ##### SECTION Long_Description ##### -->
|
||||
<para>
|
||||
This section describes a number of functions for dealing with
|
||||
Unicode characters and strings. There are analogues of the
|
||||
traditional <filename>ctype.h</filename> character classification
|
||||
and case conversion functions, UTF-8 analogues of some string utility
|
||||
functions, functions to perform normalization, case conversion and
|
||||
collation on UTF-8 strings and finally functions to convert between
|
||||
the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The implementations of the Unicode functions in GLib are based
|
||||
on the Unicode Character Data tables, which are available from
|
||||
<ulink url="http://www.unicode.org/">www.unicode.org</ulink>.
|
||||
GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
|
||||
GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1.
|
||||
</para>
|
||||
|
||||
<!-- ##### SECTION See_Also ##### -->
|
||||
<para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term>g_locale_to_utf8(), g_locale_from_utf8()</term>
|
||||
<listitem><para>
|
||||
Convenience functions for converting between UTF-8 and the locale encoding.
|
||||
</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</para>
|
||||
|
||||
<!-- ##### SECTION Stability_Level ##### -->
|
||||
|
||||
|
||||
<!-- ##### SECTION Image ##### -->
|
||||
|
||||
|
||||
<!-- ##### TYPEDEF gunichar ##### -->
|
||||
<para>
|
||||
A type which can hold any UTF-32 or UCS-4 character code, also known
|
||||
as a Unicode code point.
|
||||
</para>
|
||||
<para>
|
||||
If you want to produce the UTF-8 representation of a #gunichar,
|
||||
use g_ucs4_to_utf8(). See also g_utf8_to_ucs4() for the reverse process.
|
||||
</para>
|
||||
<para>
|
||||
To print/scan values of this type as integer, use
|
||||
%G_GINT32_MODIFIER and/or %G_GUINT32_FORMAT.
|
||||
</para>
|
||||
<para>
|
||||
The notation to express a Unicode code point in running text is as a
|
||||
hexadecimal number with four to six digits and uppercase letters, prefixed
|
||||
by the string "U+". Leading zeros are omitted, unless the code point would
|
||||
have fewer than four hexadecimal digits.
|
||||
For example, "U+0041 LATIN CAPITAL LETTER A".
|
||||
To print a code point in the U+-notation, use the format string
|
||||
"U+%04"G_GINT32_MODIFIER"X".
|
||||
To scan, use the format string "U+%06"G_GINT32_MODIFIER"X".
|
||||
<informalexample>
|
||||
<programlisting>
|
||||
gunichar c;
|
||||
sscanf ("U+0041", "U+%06"G_GINT32_MODIFIER"X", &c);
|
||||
g_print ("Read U+%04"G_GINT32_MODIFIER"X", c);
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
</para>
|
||||
|
||||
|
||||
<!-- ##### TYPEDEF gunichar2 ##### -->
|
||||
<para>
|
||||
A type which can hold any UTF-16 code
|
||||
point<footnote id="utf16_surrogate_pairs">UTF-16 also has so called
|
||||
<firstterm>surrogate pairs</firstterm> to encode characters beyond the
|
||||
BMP as pairs of 16bit numbers. Surrogate pairs cannot be stored in a
|
||||
single gunichar2 field, but all GLib functions accepting gunichar2 arrays
|
||||
will correctly interpret surrogate pairs.</footnote>.
|
||||
</para>
|
||||
<para>
|
||||
To print/scan values of this type to/from text you need to convert
|
||||
to/from UTF-8, using g_utf16_to_utf8()/g_utf8_to_utf16().
|
||||
</para>
|
||||
<para>
|
||||
To print/scan values of this type as integer, use
|
||||
%G_GINT16_MODIFIER and/or %G_GUINT16_FORMAT.
|
||||
</para>
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_validate ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@ch:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isalnum ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isalpha ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_iscntrl ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isdefined ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isdigit ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isgraph ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_islower ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_ismark ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isprint ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_ispunct ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isspace ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_istitle ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isupper ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_isxdigit ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_iswide ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_iswide_cjk ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_iszerowidth ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_toupper ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_tolower ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_totitle ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_digit_value ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### ENUM GUnicodeType ##### -->
|
||||
<para>
|
||||
These are the possible character classifications from the
|
||||
Unicode specification.
|
||||
See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
|
||||
>http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
|
||||
</para>
|
||||
|
||||
@G_UNICODE_CONTROL: General category "Other, Control" (Cc)
|
||||
@G_UNICODE_FORMAT: General category "Other, Format" (Cf)
|
||||
@G_UNICODE_UNASSIGNED: General category "Other, Not Assigned" (Cn)
|
||||
@G_UNICODE_PRIVATE_USE: General category "Other, Private Use" (Co)
|
||||
@G_UNICODE_SURROGATE: General category "Other, Surrogate" (Cs)
|
||||
@G_UNICODE_LOWERCASE_LETTER: General category "Letter, Lowercase" (Ll)
|
||||
@G_UNICODE_MODIFIER_LETTER: General category "Letter, Modifier" (Lm)
|
||||
@G_UNICODE_OTHER_LETTER: General category "Letter, Other" (Lo)
|
||||
@G_UNICODE_TITLECASE_LETTER: General category "Letter, Titlecase" (Lt)
|
||||
@G_UNICODE_UPPERCASE_LETTER: General category "Letter, Uppercase" (Lu)
|
||||
@G_UNICODE_COMBINING_MARK: General category "Mark, Spacing Combining" (Mc)
|
||||
@G_UNICODE_ENCLOSING_MARK: General category "Mark, Enclosing" (Me)
|
||||
@G_UNICODE_NON_SPACING_MARK: General category "Mark, Nonspacing" (Mn)
|
||||
@G_UNICODE_DECIMAL_NUMBER: General category "Number, Decimal Digit" (Nd)
|
||||
@G_UNICODE_LETTER_NUMBER: General category "Number, Letter" (Nl)
|
||||
@G_UNICODE_OTHER_NUMBER: General category "Number, Other" (No)
|
||||
@G_UNICODE_CONNECT_PUNCTUATION: General category "Punctuation, Connector" (Pc)
|
||||
@G_UNICODE_DASH_PUNCTUATION: General category "Punctuation, Dash" (Pd)
|
||||
@G_UNICODE_CLOSE_PUNCTUATION: General category "Punctuation, Close" (Pe)
|
||||
@G_UNICODE_FINAL_PUNCTUATION: General category "Punctuation, Final quote" (Pf)
|
||||
@G_UNICODE_INITIAL_PUNCTUATION: General category "Punctuation, Initial quote" (Pi)
|
||||
@G_UNICODE_OTHER_PUNCTUATION: General category "Punctuation, Other" (Po)
|
||||
@G_UNICODE_OPEN_PUNCTUATION: General category "Punctuation, Open" (Ps)
|
||||
@G_UNICODE_CURRENCY_SYMBOL: General category "Symbol, Currency" (Sc)
|
||||
@G_UNICODE_MODIFIER_SYMBOL: General category "Symbol, Modifier" (Sk)
|
||||
@G_UNICODE_MATH_SYMBOL: General category "Symbol, Math" (Sm)
|
||||
@G_UNICODE_OTHER_SYMBOL: General category "Symbol, Other" (So)
|
||||
@G_UNICODE_LINE_SEPARATOR: General category "Separator, Line" (Zl)
|
||||
@G_UNICODE_PARAGRAPH_SEPARATOR: General category "Separator, Paragraph" (Zp)
|
||||
@G_UNICODE_SPACE_SEPARATOR: General category "Separator, Space" (Zs)
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_type ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### ENUM GUnicodeBreakType ##### -->
|
||||
<para>
|
||||
These are the possible line break classifications.
|
||||
The five Hangul types were added in Unicode 4.1 and have therefore been
|
||||
introduced in GLib 2.10. Note that new types may be added in the future.
|
||||
Applications should be ready to handle unknown values.
|
||||
They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
|
||||
See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
|
||||
>http://www.unicode.org/unicode/reports/tr14/</ulink>.
|
||||
</para>
|
||||
|
||||
@G_UNICODE_BREAK_MANDATORY: Mandatory Break (BK)
|
||||
@G_UNICODE_BREAK_CARRIAGE_RETURN: Carriage Return (CR)
|
||||
@G_UNICODE_BREAK_LINE_FEED: Line Feed (LF)
|
||||
@G_UNICODE_BREAK_COMBINING_MARK: Attached Characters and Combining Marks (CM)
|
||||
@G_UNICODE_BREAK_SURROGATE: Surrogates (SG)
|
||||
@G_UNICODE_BREAK_ZERO_WIDTH_SPACE: Zero Width Space (ZW)
|
||||
@G_UNICODE_BREAK_INSEPARABLE: Inseparable (IN)
|
||||
@G_UNICODE_BREAK_NON_BREAKING_GLUE: Non-breaking ("Glue") (GL)
|
||||
@G_UNICODE_BREAK_CONTINGENT: Contingent Break Opportunity (CB)
|
||||
@G_UNICODE_BREAK_SPACE: Space (SP)
|
||||
@G_UNICODE_BREAK_AFTER: Break Opportunity After (BA)
|
||||
@G_UNICODE_BREAK_BEFORE: Break Opportunity Before (BB)
|
||||
@G_UNICODE_BREAK_BEFORE_AND_AFTER: Break Opportunity Before and After (B2)
|
||||
@G_UNICODE_BREAK_HYPHEN: Hyphen (HY)
|
||||
@G_UNICODE_BREAK_NON_STARTER: Nonstarter (NS)
|
||||
@G_UNICODE_BREAK_OPEN_PUNCTUATION: Opening Punctuation (OP)
|
||||
@G_UNICODE_BREAK_CLOSE_PUNCTUATION: Closing Punctuation (CL)
|
||||
@G_UNICODE_BREAK_QUOTATION: Ambiguous Quotation (QU)
|
||||
@G_UNICODE_BREAK_EXCLAMATION: Exclamation/Interrogation (EX)
|
||||
@G_UNICODE_BREAK_IDEOGRAPHIC: Ideographic (ID)
|
||||
@G_UNICODE_BREAK_NUMERIC: Numeric (NU)
|
||||
@G_UNICODE_BREAK_INFIX_SEPARATOR: Infix Separator (Numeric) (IS)
|
||||
@G_UNICODE_BREAK_SYMBOL: Symbols Allowing Break After (SY)
|
||||
@G_UNICODE_BREAK_ALPHABETIC: Ordinary Alphabetic and Symbol Characters (AL)
|
||||
@G_UNICODE_BREAK_PREFIX: Prefix (Numeric) (PR)
|
||||
@G_UNICODE_BREAK_POSTFIX: Postfix (Numeric) (PO)
|
||||
@G_UNICODE_BREAK_COMPLEX_CONTEXT: Complex Context Dependent (South East Asian) (SA)
|
||||
@G_UNICODE_BREAK_AMBIGUOUS: Ambiguous (Alphabetic or Ideographic) (AI)
|
||||
@G_UNICODE_BREAK_UNKNOWN: Unknown (XX)
|
||||
@G_UNICODE_BREAK_NEXT_LINE: Next Line (NL)
|
||||
@G_UNICODE_BREAK_WORD_JOINER: Word Joiner (WJ)
|
||||
@G_UNICODE_BREAK_HANGUL_L_JAMO: Hangul L Jamo (JL)
|
||||
@G_UNICODE_BREAK_HANGUL_V_JAMO: Hangul V Jamo (JV)
|
||||
@G_UNICODE_BREAK_HANGUL_T_JAMO: Hangul T Jamo (JT)
|
||||
@G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: Hangul LV Syllable (H2)
|
||||
@G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: Hangul LVT Syllable (H3)
|
||||
@G_UNICODE_BREAK_CLOSE_PARANTHESIS: Closing Parenthesis (CP). Since 2.28
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_break_type ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_combining_class ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@uc:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@string:
|
||||
@len:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@ch:
|
||||
@result_len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@ch:
|
||||
@mirrored_ch:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### ENUM GUnicodeScript ##### -->
|
||||
<para>
|
||||
The #GUnicodeScript enumeration identifies different writing
|
||||
systems. The values correspond to the names as defined in the
|
||||
Unicode standard. The enumeration has been added in GLib 2.14,
|
||||
and is interchangeable with #PangoScript.
|
||||
Note that new types may be added in the future. Applications
|
||||
should be ready to handle unknown values.
|
||||
See <ulink
|
||||
url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
|
||||
#24: Script names</ulink>.
|
||||
</para>
|
||||
|
||||
@G_UNICODE_SCRIPT_INVALID_CODE: a value never returned from g_unichar_get_script()
|
||||
@G_UNICODE_SCRIPT_COMMON: a character used by multiple different scripts
|
||||
@G_UNICODE_SCRIPT_INHERITED: a mark glyph that takes its script from the
|
||||
base glyph to which it is attached
|
||||
@G_UNICODE_SCRIPT_ARABIC: Arabic
|
||||
@G_UNICODE_SCRIPT_ARMENIAN: Armenian
|
||||
@G_UNICODE_SCRIPT_BENGALI: Bengali
|
||||
@G_UNICODE_SCRIPT_BOPOMOFO: Bopomofo
|
||||
@G_UNICODE_SCRIPT_CHEROKEE: Cherokee
|
||||
@G_UNICODE_SCRIPT_COPTIC: Coptic
|
||||
@G_UNICODE_SCRIPT_CYRILLIC: Cyrillic
|
||||
@G_UNICODE_SCRIPT_DESERET: Deseret
|
||||
@G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
|
||||
@G_UNICODE_SCRIPT_ETHIOPIC: Ethiopic
|
||||
@G_UNICODE_SCRIPT_GEORGIAN: Georgian
|
||||
@G_UNICODE_SCRIPT_GOTHIC: Gothic
|
||||
@G_UNICODE_SCRIPT_GREEK: Greek
|
||||
@G_UNICODE_SCRIPT_GUJARATI: Gujarati
|
||||
@G_UNICODE_SCRIPT_GURMUKHI: Gurmukhi
|
||||
@G_UNICODE_SCRIPT_HAN: Han
|
||||
@G_UNICODE_SCRIPT_HANGUL: Hangul
|
||||
@G_UNICODE_SCRIPT_HEBREW: Hebrew
|
||||
@G_UNICODE_SCRIPT_HIRAGANA: Hiragana
|
||||
@G_UNICODE_SCRIPT_KANNADA: Kannada
|
||||
@G_UNICODE_SCRIPT_KATAKANA: Katakana
|
||||
@G_UNICODE_SCRIPT_KHMER: Khmer
|
||||
@G_UNICODE_SCRIPT_LAO: Lao
|
||||
@G_UNICODE_SCRIPT_LATIN: Latin
|
||||
@G_UNICODE_SCRIPT_MALAYALAM: Malayalam
|
||||
@G_UNICODE_SCRIPT_MONGOLIAN: Mongolian
|
||||
@G_UNICODE_SCRIPT_MYANMAR: Myanmar
|
||||
@G_UNICODE_SCRIPT_OGHAM: Ogham
|
||||
@G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
|
||||
@G_UNICODE_SCRIPT_ORIYA: Oriya
|
||||
@G_UNICODE_SCRIPT_RUNIC: Runic
|
||||
@G_UNICODE_SCRIPT_SINHALA: Sinhala
|
||||
@G_UNICODE_SCRIPT_SYRIAC: Syriac
|
||||
@G_UNICODE_SCRIPT_TAMIL: Tamil
|
||||
@G_UNICODE_SCRIPT_TELUGU: Telugu
|
||||
@G_UNICODE_SCRIPT_THAANA: Thaana
|
||||
@G_UNICODE_SCRIPT_THAI: Thai
|
||||
@G_UNICODE_SCRIPT_TIBETAN: Tibetan
|
||||
@G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL:
|
||||
Canadian Aboriginal
|
||||
@G_UNICODE_SCRIPT_YI: Yi
|
||||
@G_UNICODE_SCRIPT_TAGALOG: Tagalog
|
||||
@G_UNICODE_SCRIPT_HANUNOO: Hanunoo
|
||||
@G_UNICODE_SCRIPT_BUHID: Buhid
|
||||
@G_UNICODE_SCRIPT_TAGBANWA: Tagbanwa
|
||||
@G_UNICODE_SCRIPT_BRAILLE: Braille
|
||||
@G_UNICODE_SCRIPT_CYPRIOT: Cypriot
|
||||
@G_UNICODE_SCRIPT_LIMBU: Limbu
|
||||
@G_UNICODE_SCRIPT_OSMANYA: Osmanya
|
||||
@G_UNICODE_SCRIPT_SHAVIAN: Shavian
|
||||
@G_UNICODE_SCRIPT_LINEAR_B: Linear B
|
||||
@G_UNICODE_SCRIPT_TAI_LE: Tai Le
|
||||
@G_UNICODE_SCRIPT_UGARITIC: Ugaritic
|
||||
@G_UNICODE_SCRIPT_NEW_TAI_LUE: New Tai Lue
|
||||
@G_UNICODE_SCRIPT_BUGINESE: Buginese
|
||||
@G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
|
||||
@G_UNICODE_SCRIPT_TIFINAGH: Tifinagh
|
||||
@G_UNICODE_SCRIPT_SYLOTI_NAGRI: Syloti Nagri
|
||||
@G_UNICODE_SCRIPT_OLD_PERSIAN: Old Persian
|
||||
@G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
|
||||
@G_UNICODE_SCRIPT_UNKNOWN: an unassigned code point
|
||||
@G_UNICODE_SCRIPT_BALINESE: Balinese
|
||||
@G_UNICODE_SCRIPT_CUNEIFORM: Cuneiform
|
||||
@G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
|
||||
@G_UNICODE_SCRIPT_PHAGS_PA: Phags-pa
|
||||
@G_UNICODE_SCRIPT_NKO: N'Ko
|
||||
@G_UNICODE_SCRIPT_KAYAH_LI: Kayah Li. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_LEPCHA: Lepcha. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_REJANG: Rejang. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_SUNDANESE: Sundanese. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_SAURASHTRA: Saurashtra. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_CHAM: Cham. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_OL_CHIKI: Ol Chiki. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_VAI: Vai. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_CARIAN: Carian. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_LYCIAN: Lycian. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_LYDIAN: Lydian. Since 2.16.3
|
||||
@G_UNICODE_SCRIPT_AVESTAN: Avestan. Since 2.26
|
||||
@G_UNICODE_SCRIPT_BAMUM: Bamum. Since 2.26
|
||||
@G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS: Egyptian Hieroglyphs. Since 2.26
|
||||
@G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC: Imperial Aramaic. Since 2.26
|
||||
@G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI: Inscriptional Pahlavi. Since 2.26
|
||||
@G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN: Inscriptional Parthian. Since 2.26
|
||||
@G_UNICODE_SCRIPT_JAVANESE: Javanese. Since 2.26
|
||||
@G_UNICODE_SCRIPT_KAITHI: Kaithi. Since 2.26
|
||||
@G_UNICODE_SCRIPT_LISU: Lisu. Since 2.26
|
||||
@G_UNICODE_SCRIPT_MEETEI_MAYEK: Meetei Mayek. Since 2.26
|
||||
@G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN: Old South Arabian. Since 2.26
|
||||
@G_UNICODE_SCRIPT_OLD_TURKIC: Old Turkic. Since 2.28
|
||||
@G_UNICODE_SCRIPT_SAMARITAN: Samaritan. Since 2.26
|
||||
@G_UNICODE_SCRIPT_TAI_THAM: Tai Tham. Since 2.26
|
||||
@G_UNICODE_SCRIPT_TAI_VIET: Tai Viet. Since 2.26
|
||||
@G_UNICODE_SCRIPT_BATAK: Batak. Since 2.28
|
||||
@G_UNICODE_SCRIPT_BRAHMI: Brahmi. Since 2.28
|
||||
@G_UNICODE_SCRIPT_MANDAIC: Mandaic. Since 2.28
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_get_script ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@ch:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### MACRO g_utf8_next_char ##### -->
|
||||
<para>
|
||||
Skips to the next character in a UTF-8 string. The string must be
|
||||
valid; this macro is as fast as possible, and has no error-checking.
|
||||
You would use this macro to iterate over a string character by
|
||||
character. The macro returns the start of the next UTF-8 character.
|
||||
Before using this macro, use g_utf8_validate() to validate strings
|
||||
that may contain invalid UTF-8.
|
||||
</para>
|
||||
|
||||
@p: Pointer to the start of a valid UTF-8 character.
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_get_char ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@max_len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@offset:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@pos:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_prev_char ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_find_next_char ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@end:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@p:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strlen ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@max:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strncpy ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@dest:
|
||||
@src:
|
||||
@n:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strchr ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@len:
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strrchr ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@p:
|
||||
@len:
|
||||
@c:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strreverse ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_validate ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@max_len:
|
||||
@end:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strup ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_strdown ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_casefold ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_normalize ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@mode:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### ENUM GNormalizeMode ##### -->
|
||||
<para>
|
||||
Defines how a Unicode string is transformed in a canonical
|
||||
form, standardizing such issues as whether a character with an accent is
|
||||
represented as a base character and combining accent or as a single precomposed
|
||||
character. Unicode strings should generally be normalized before comparing them.
|
||||
</para>
|
||||
|
||||
@G_NORMALIZE_DEFAULT: standardize differences that do not affect the
|
||||
text content, such as the above-mentioned accent representation.
|
||||
@G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
|
||||
@G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
|
||||
forms rather than a maximally decomposed form.
|
||||
@G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
|
||||
@G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
|
||||
"compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the
|
||||
standard forms (in this case DIGIT THREE). Formatting information may be
|
||||
lost but for most text operations such characters should be considered the
|
||||
same.
|
||||
@G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
|
||||
@G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
|
||||
forms rather than a maximally decomposed form.
|
||||
@G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_collate ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str1:
|
||||
@str2:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_collate_key ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_written:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@str:
|
||||
@len:
|
||||
@items_read:
|
||||
@items_written:
|
||||
@error:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@c:
|
||||
@outbuf:
|
||||
@Returns:
|
||||
|
||||
|
414
glib/gunicode.h
414
glib/gunicode.h
@ -31,11 +31,89 @@
|
||||
|
||||
G_BEGIN_DECLS
|
||||
|
||||
/**
|
||||
* gunichar:
|
||||
*
|
||||
* A type which can hold any UTF-32 or UCS-4 character code,
|
||||
* also known as a Unicode code point.
|
||||
*
|
||||
* If you want to produce the UTF-8 representation of a #gunichar,
|
||||
* use g_ucs4_to_utf8(). See also g_utf8_to_ucs4() for the reverse
|
||||
* process.
|
||||
*
|
||||
* To print/scan values of this type as integer, use
|
||||
* %G_GINT32_MODIFIER and/or %G_GUINT32_FORMAT.
|
||||
*
|
||||
* The notation to express a Unicode code point in running text is
|
||||
* as a hexadecimal number with four to six digits and uppercase
|
||||
* letters, prefixed by the string "U+". Leading zeros are omitted,
|
||||
* unless the code point would have fewer than four hexadecimal digits.
|
||||
* For example, "U+0041 LATIN CAPITAL LETTER A". To print a code point
|
||||
* in the U+-notation, use the format string "U+%04"G_GINT32_MODIFIER"X".
|
||||
* To scan, use the format string "U+%06"G_GINT32_MODIFIER"X".
|
||||
*
|
||||
* |[
|
||||
* gunichar c;
|
||||
* sscanf ("U+0041", "U+%06"G_GINT32_MODIFIER"X", &c);
|
||||
* g_print ("Read U+%04"G_GINT32_MODIFIER"X", c);
|
||||
* ]|
|
||||
*/
|
||||
typedef guint32 gunichar;
|
||||
|
||||
/**
|
||||
* gunichar2:
|
||||
*
|
||||
* A type which can hold any UTF-16 code
|
||||
* point<footnote id="utf16_surrogate_pairs">UTF-16 also has so called
|
||||
* <firstterm>surrogate pairs</firstterm> to encode characters beyond
|
||||
* the BMP as pairs of 16-bit numbers. Surrogate pairs cannot be stored
|
||||
* in a single gunichar2 field, but all GLib functions accepting gunichar2
|
||||
* arrays will correctly interpret surrogate pairs.</footnote>.
|
||||
*
|
||||
* To print/scan values of this type to/from text you need to convert
|
||||
* to/from UTF-8, using g_utf16_to_utf8()/g_utf8_to_utf16().
|
||||
*
|
||||
* To print/scan values of this type as integer, use
|
||||
* %G_GINT16_MODIFIER and/or %G_GUINT16_FORMAT.
|
||||
*/
|
||||
typedef guint16 gunichar2;
|
||||
|
||||
/* These are the possible character classifications.
|
||||
* See http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
|
||||
/**
|
||||
* GUnicodeType:
|
||||
* @G_UNICODE_CONTROL: General category "Other, Control" (Cc)
|
||||
* @G_UNICODE_FORMAT: General category "Other, Format" (Cf)
|
||||
* @G_UNICODE_UNASSIGNED: General category "Other, Not Assigned" (Cn)
|
||||
* @G_UNICODE_PRIVATE_USE: General category "Other, Private Use" (Co)
|
||||
* @G_UNICODE_SURROGATE: General category "Other, Surrogate" (Cs)
|
||||
* @G_UNICODE_LOWERCASE_LETTER: General category "Letter, Lowercase" (Ll)
|
||||
* @G_UNICODE_MODIFIER_LETTER: General category "Letter, Modifier" (Lm)
|
||||
* @G_UNICODE_OTHER_LETTER: General category "Letter, Other" (Lo)
|
||||
* @G_UNICODE_TITLECASE_LETTER: General category "Letter, Titlecase" (Lt)
|
||||
* @G_UNICODE_UPPERCASE_LETTER: General category "Letter, Uppercase" (Lu)
|
||||
* @G_UNICODE_COMBINING_MARK: General category "Mark, Spacing Combining" (Mc)
|
||||
* @G_UNICODE_ENCLOSING_MARK: General category "Mark, Enclosing" (Me)
|
||||
* @G_UNICODE_NON_SPACING_MARK: General category "Mark, Nonspacing" (Mn)
|
||||
* @G_UNICODE_DECIMAL_NUMBER: General category "Number, Decimal Digit" (Nd)
|
||||
* @G_UNICODE_LETTER_NUMBER: General category "Number, Letter" (Nl)
|
||||
* @G_UNICODE_OTHER_NUMBER: General category "Number, Other" (No)
|
||||
* @G_UNICODE_CONNECT_PUNCTUATION: General category "Punctuation, Connector" (Pc)
|
||||
* @G_UNICODE_DASH_PUNCTUATION: General category "Punctuation, Dash" (Pd)
|
||||
* @G_UNICODE_CLOSE_PUNCTUATION: General category "Punctuation, Close" (Pe)
|
||||
* @G_UNICODE_FINAL_PUNCTUATION: General category "Punctuation, Final quote" (Pf)
* @G_UNICODE_INITIAL_PUNCTUATION: General category "Punctuation, Initial quote" (Pi)
|
||||
* @G_UNICODE_OTHER_PUNCTUATION: General category "Punctuation, Other" (Po)
|
||||
* @G_UNICODE_OPEN_PUNCTUATION: General category "Punctuation, Open" (Ps)
|
||||
* @G_UNICODE_CURRENCY_SYMBOL: General category "Symbol, Currency" (Sc)
|
||||
* @G_UNICODE_MODIFIER_SYMBOL: General category "Symbol, Modifier" (Sk)
|
||||
* @G_UNICODE_MATH_SYMBOL: General category "Symbol, Math" (Sm)
|
||||
* @G_UNICODE_OTHER_SYMBOL: General category "Symbol, Other" (So)
|
||||
* @G_UNICODE_LINE_SEPARATOR: General category "Separator, Line" (Zl)
|
||||
* @G_UNICODE_PARAGRAPH_SEPARATOR: General category "Separator, Paragraph" (Zp)
|
||||
* @G_UNICODE_SPACE_SEPARATOR: General category "Separator, Space" (Zs)
|
||||
*
|
||||
* These are the possible character classifications from the
|
||||
* Unicode specification.
|
||||
* See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
|
||||
* >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
@ -71,10 +149,55 @@ typedef enum
|
||||
G_UNICODE_SPACE_SEPARATOR
|
||||
} GUnicodeType;
|
||||
|
||||
/* These are the possible line break classifications.
|
||||
* Note that new types may be added in the future.
|
||||
* Implementations may regard unknown values like G_UNICODE_BREAK_UNKNOWN
|
||||
* See http://www.unicode.org/unicode/reports/tr14/
|
||||
/**
|
||||
* GUnicodeBreakType:
|
||||
* @G_UNICODE_BREAK_MANDATORY: Mandatory Break (BK)
|
||||
* @G_UNICODE_BREAK_CARRIAGE_RETURN: Carriage Return (CR)
|
||||
* @G_UNICODE_BREAK_LINE_FEED: Line Feed (LF)
|
||||
* @G_UNICODE_BREAK_COMBINING_MARK: Attached Characters and Combining Marks (CM)
|
||||
* @G_UNICODE_BREAK_SURROGATE: Surrogates (SG)
|
||||
* @G_UNICODE_BREAK_ZERO_WIDTH_SPACE: Zero Width Space (ZW)
|
||||
* @G_UNICODE_BREAK_INSEPARABLE: Inseparable (IN)
|
||||
* @G_UNICODE_BREAK_NON_BREAKING_GLUE: Non-breaking ("Glue") (GL)
|
||||
* @G_UNICODE_BREAK_CONTINGENT: Contingent Break Opportunity (CB)
|
||||
* @G_UNICODE_BREAK_SPACE: Space (SP)
|
||||
* @G_UNICODE_BREAK_AFTER: Break Opportunity After (BA)
|
||||
* @G_UNICODE_BREAK_BEFORE: Break Opportunity Before (BB)
|
||||
* @G_UNICODE_BREAK_BEFORE_AND_AFTER: Break Opportunity Before and After (B2)
|
||||
* @G_UNICODE_BREAK_HYPHEN: Hyphen (HY)
|
||||
* @G_UNICODE_BREAK_NON_STARTER: Nonstarter (NS)
|
||||
* @G_UNICODE_BREAK_OPEN_PUNCTUATION: Opening Punctuation (OP)
|
||||
* @G_UNICODE_BREAK_CLOSE_PUNCTUATION: Closing Punctuation (CL)
|
||||
* @G_UNICODE_BREAK_QUOTATION: Ambiguous Quotation (QU)
|
||||
* @G_UNICODE_BREAK_EXCLAMATION: Exclamation/Interrogation (EX)
|
||||
* @G_UNICODE_BREAK_IDEOGRAPHIC: Ideographic (ID)
|
||||
* @G_UNICODE_BREAK_NUMERIC: Numeric (NU)
|
||||
* @G_UNICODE_BREAK_INFIX_SEPARATOR: Infix Separator (Numeric) (IS)
|
||||
* @G_UNICODE_BREAK_SYMBOL: Symbols Allowing Break After (SY)
|
||||
* @G_UNICODE_BREAK_ALPHABETIC: Ordinary Alphabetic and Symbol Characters (AL)
|
||||
* @G_UNICODE_BREAK_PREFIX: Prefix (Numeric) (PR)
|
||||
* @G_UNICODE_BREAK_POSTFIX: Postfix (Numeric) (PO)
|
||||
* @G_UNICODE_BREAK_COMPLEX_CONTEXT: Complex Context Dependent (South East Asian) (SA)
|
||||
* @G_UNICODE_BREAK_AMBIGUOUS: Ambiguous (Alphabetic or Ideographic) (AI)
|
||||
* @G_UNICODE_BREAK_UNKNOWN: Unknown (XX)
|
||||
* @G_UNICODE_BREAK_NEXT_LINE: Next Line (NL)
|
||||
* @G_UNICODE_BREAK_WORD_JOINER: Word Joiner (WJ)
|
||||
* @G_UNICODE_BREAK_HANGUL_L_JAMO: Hangul L Jamo (JL)
|
||||
* @G_UNICODE_BREAK_HANGUL_V_JAMO: Hangul V Jamo (JV)
|
||||
* @G_UNICODE_BREAK_HANGUL_T_JAMO: Hangul T Jamo (JT)
|
||||
* @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: Hangul LV Syllable (H2)
|
||||
* @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: Hangul LVT Syllable (H3)
|
||||
* @G_UNICODE_BREAK_CLOSE_PARANTHESIS: Closing Parenthesis (CP). Since 2.28
|
||||
*
|
||||
* These are the possible line break classifications.
|
||||
*
|
||||
* The five Hangul types were added in Unicode 4.1, so they were
|
||||
* introduced in GLib 2.10. Note that new types may be added in the future.
|
||||
* Applications should be ready to handle unknown values.
|
||||
* They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
|
||||
*
|
||||
* See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
|
||||
* >http://www.unicode.org/unicode/reports/tr14/</ulink>.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
@ -117,6 +240,129 @@ typedef enum
|
||||
G_UNICODE_BREAK_CLOSE_PARANTHESIS
|
||||
} GUnicodeBreakType;
|
||||
|
||||
/**
|
||||
* GUnicodeScript:
|
||||
* @G_UNICODE_SCRIPT_INVALID_CODE:
|
||||
* a value never returned from g_unichar_get_script()
|
||||
* @G_UNICODE_SCRIPT_COMMON: a character used by multiple different scripts
|
||||
* @G_UNICODE_SCRIPT_INHERITED: a mark glyph that takes its script from the
|
||||
* base glyph to which it is attached
|
||||
* @G_UNICODE_SCRIPT_ARABIC: Arabic
|
||||
* @G_UNICODE_SCRIPT_ARMENIAN: Armenian
|
||||
* @G_UNICODE_SCRIPT_BENGALI: Bengali
|
||||
* @G_UNICODE_SCRIPT_BOPOMOFO: Bopomofo
|
||||
* @G_UNICODE_SCRIPT_CHEROKEE: Cherokee
|
||||
* @G_UNICODE_SCRIPT_COPTIC: Coptic
|
||||
* @G_UNICODE_SCRIPT_CYRILLIC: Cyrillic
|
||||
* @G_UNICODE_SCRIPT_DESERET: Deseret
|
||||
* @G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
|
||||
* @G_UNICODE_SCRIPT_ETHIOPIC: Ethiopic
|
||||
* @G_UNICODE_SCRIPT_GEORGIAN: Georgian
|
||||
* @G_UNICODE_SCRIPT_GOTHIC: Gothic
|
||||
* @G_UNICODE_SCRIPT_GREEK: Greek
|
||||
* @G_UNICODE_SCRIPT_GUJARATI: Gujarati
|
||||
* @G_UNICODE_SCRIPT_GURMUKHI: Gurmukhi
|
||||
* @G_UNICODE_SCRIPT_HAN: Han
|
||||
* @G_UNICODE_SCRIPT_HANGUL: Hangul
|
||||
* @G_UNICODE_SCRIPT_HEBREW: Hebrew
|
||||
* @G_UNICODE_SCRIPT_HIRAGANA: Hiragana
|
||||
* @G_UNICODE_SCRIPT_KANNADA: Kannada
|
||||
* @G_UNICODE_SCRIPT_KATAKANA: Katakana
|
||||
* @G_UNICODE_SCRIPT_KHMER: Khmer
|
||||
* @G_UNICODE_SCRIPT_LAO: Lao
|
||||
* @G_UNICODE_SCRIPT_LATIN: Latin
|
||||
* @G_UNICODE_SCRIPT_MALAYALAM: Malayalam
|
||||
* @G_UNICODE_SCRIPT_MONGOLIAN: Mongolian
|
||||
* @G_UNICODE_SCRIPT_MYANMAR: Myanmar
|
||||
* @G_UNICODE_SCRIPT_OGHAM: Ogham
|
||||
* @G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
|
||||
* @G_UNICODE_SCRIPT_ORIYA: Oriya
|
||||
* @G_UNICODE_SCRIPT_RUNIC: Runic
|
||||
* @G_UNICODE_SCRIPT_SINHALA: Sinhala
|
||||
* @G_UNICODE_SCRIPT_SYRIAC: Syriac
|
||||
* @G_UNICODE_SCRIPT_TAMIL: Tamil
|
||||
* @G_UNICODE_SCRIPT_TELUGU: Telugu
|
||||
* @G_UNICODE_SCRIPT_THAANA: Thaana
|
||||
* @G_UNICODE_SCRIPT_THAI: Thai
|
||||
* @G_UNICODE_SCRIPT_TIBETAN: Tibetan
|
||||
* @G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL:
|
||||
* Canadian Aboriginal
|
||||
* @G_UNICODE_SCRIPT_YI: Yi
|
||||
* @G_UNICODE_SCRIPT_TAGALOG: Tagalog
|
||||
* @G_UNICODE_SCRIPT_HANUNOO: Hanunoo
|
||||
* @G_UNICODE_SCRIPT_BUHID: Buhid
|
||||
* @G_UNICODE_SCRIPT_TAGBANWA: Tagbanwa
|
||||
* @G_UNICODE_SCRIPT_BRAILLE: Braille
|
||||
* @G_UNICODE_SCRIPT_CYPRIOT: Cypriot
|
||||
* @G_UNICODE_SCRIPT_LIMBU: Limbu
|
||||
* @G_UNICODE_SCRIPT_OSMANYA: Osmanya
|
||||
* @G_UNICODE_SCRIPT_SHAVIAN: Shavian
|
||||
* @G_UNICODE_SCRIPT_LINEAR_B: Linear B
|
||||
* @G_UNICODE_SCRIPT_TAI_LE: Tai Le
|
||||
* @G_UNICODE_SCRIPT_UGARITIC: Ugaritic
|
||||
* @G_UNICODE_SCRIPT_NEW_TAI_LUE:
|
||||
* New Tai Lue
|
||||
* @G_UNICODE_SCRIPT_BUGINESE: Buginese
|
||||
* @G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
|
||||
* @G_UNICODE_SCRIPT_TIFINAGH: Tifinagh
|
||||
* @G_UNICODE_SCRIPT_SYLOTI_NAGRI:
|
||||
* Syloti Nagri
|
||||
* @G_UNICODE_SCRIPT_OLD_PERSIAN:
|
||||
* Old Persian
|
||||
* @G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
|
||||
* @G_UNICODE_SCRIPT_UNKNOWN: an unassigned code point
|
||||
* @G_UNICODE_SCRIPT_BALINESE: Balinese
|
||||
* @G_UNICODE_SCRIPT_CUNEIFORM: Cuneiform
|
||||
* @G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
|
||||
* @G_UNICODE_SCRIPT_PHAGS_PA: Phags-pa
|
||||
* @G_UNICODE_SCRIPT_NKO: N'Ko
|
||||
* @G_UNICODE_SCRIPT_KAYAH_LI: Kayah Li. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_LEPCHA: Lepcha. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_REJANG: Rejang. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_SUNDANESE: Sundanese. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_SAURASHTRA: Saurashtra. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_CHAM: Cham. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_OL_CHIKI: Ol Chiki. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_VAI: Vai. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_CARIAN: Carian. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_LYCIAN: Lycian. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_LYDIAN: Lydian. Since 2.16.3
|
||||
* @G_UNICODE_SCRIPT_AVESTAN: Avestan. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_BAMUM: Bamum. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS:
|
||||
* Egyptian Hieroglyphs. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC:
|
||||
* Imperial Aramaic. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI:
|
||||
* Inscriptional Pahlavi. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN:
|
||||
* Inscriptional Parthian. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_JAVANESE: Javanese. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_KAITHI: Kaithi. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_LISU: Lisu. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_MEETEI_MAYEK:
|
||||
* Meetei Mayek. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN:
|
||||
* Old South Arabian. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_OLD_TURKIC: Old Turkic. Since 2.28
|
||||
* @G_UNICODE_SCRIPT_SAMARITAN: Samaritan. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_TAI_THAM: Tai Tham. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_TAI_VIET: Tai Viet. Since 2.26
|
||||
* @G_UNICODE_SCRIPT_BATAK: Batak. Since 2.28
|
||||
* @G_UNICODE_SCRIPT_BRAHMI: Brahmi. Since 2.28
|
||||
* @G_UNICODE_SCRIPT_MANDAIC: Mandaic. Since 2.28
|
||||
*
|
||||
* The #GUnicodeScript enumeration identifies different writing
|
||||
* systems. The values correspond to the names as defined in the
|
||||
* Unicode standard. The enumeration has been added in GLib 2.14,
|
||||
* and is interchangeable with #PangoScript.
|
||||
*
|
||||
* Note that new types may be added in the future. Applications
|
||||
* should be ready to handle unknown values.
|
||||
* See <ulink
|
||||
* url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
|
||||
* #24: Script names</ulink>.
|
||||
*/
|
||||
typedef enum
|
||||
{ /* ISO 15924 code */
|
||||
G_UNICODE_SCRIPT_INVALID_CODE = -1,
|
||||
@ -289,48 +535,59 @@ gboolean g_unichar_validate (gunichar ch) G_GNUC_CONST;
|
||||
|
||||
/* Pairwise canonical compose/decompose */
|
||||
gboolean g_unichar_compose (gunichar a,
|
||||
gunichar b,
|
||||
gunichar *ch);
|
||||
gunichar b,
|
||||
gunichar *ch);
|
||||
gboolean g_unichar_decompose (gunichar ch,
|
||||
gunichar *a,
|
||||
gunichar *b);
|
||||
gunichar *a,
|
||||
gunichar *b);
|
||||
|
||||
gsize g_unichar_fully_decompose (gunichar ch,
|
||||
gboolean compat,
|
||||
gunichar *result,
|
||||
gsize result_len);
|
||||
gboolean compat,
|
||||
gunichar *result,
|
||||
gsize result_len);
|
||||
|
||||
/* Compute canonical ordering of a string in-place. This rearranges
|
||||
decomposed characters in the string according to their combining
|
||||
classes. See the Unicode manual for more information. */
|
||||
void g_unicode_canonical_ordering (gunichar *string,
|
||||
gsize len);
|
||||
gsize len);
|
||||
|
||||
|
||||
/* Deprecated. Use g_unichar_fully_decompose() */
|
||||
gunichar *g_unicode_canonical_decomposition (gunichar ch,
|
||||
gsize *result_len) G_GNUC_MALLOC;
|
||||
gsize *result_len) G_GNUC_MALLOC;
|
||||
|
||||
|
||||
/* Array of skip-bytes-per-initial character.
|
||||
*/
|
||||
GLIB_VAR const gchar * const g_utf8_skip;
|
||||
|
||||
/**
|
||||
* g_utf8_next_char:
|
||||
* @p: Pointer to the start of a valid UTF-8 character
|
||||
*
|
||||
* Skips to the next character in a UTF-8 string. The string must be
|
||||
* valid; this macro is as fast as possible, and has no error-checking.
|
||||
* You would use this macro to iterate over a string character by
|
||||
* character. The macro returns the start of the next UTF-8 character.
|
||||
* Before using this macro, use g_utf8_validate() to validate strings
|
||||
* that may contain invalid UTF-8.
|
||||
*/
|
||||
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(const guchar *)(p)])
|
||||
|
||||
gunichar g_utf8_get_char (const gchar *p) G_GNUC_PURE;
|
||||
gunichar g_utf8_get_char_validated (const gchar *p,
|
||||
gssize max_len) G_GNUC_PURE;
|
||||
gssize max_len) G_GNUC_PURE;
|
||||
|
||||
gchar* g_utf8_offset_to_pointer (const gchar *str,
|
||||
glong offset) G_GNUC_PURE;
|
||||
glong g_utf8_pointer_to_offset (const gchar *str,
|
||||
const gchar *pos) G_GNUC_PURE;
|
||||
const gchar *pos) G_GNUC_PURE;
|
||||
gchar* g_utf8_prev_char (const gchar *p) G_GNUC_PURE;
|
||||
gchar* g_utf8_find_next_char (const gchar *p,
|
||||
const gchar *end) G_GNUC_PURE;
|
||||
const gchar *end) G_GNUC_PURE;
|
||||
gchar* g_utf8_find_prev_char (const gchar *str,
|
||||
const gchar *p) G_GNUC_PURE;
|
||||
const gchar *p) G_GNUC_PURE;
|
||||
|
||||
glong g_utf8_strlen (const gchar *p,
|
||||
gssize max) G_GNUC_PURE;
|
||||
@ -339,78 +596,93 @@ gchar *g_utf8_substring (const gchar *p,
|
||||
glong start_pos,
|
||||
glong end_pos) G_GNUC_MALLOC;
|
||||
|
||||
/* Copies n characters from src to dest */
|
||||
gchar* g_utf8_strncpy (gchar *dest,
|
||||
const gchar *src,
|
||||
gsize n);
|
||||
gchar *g_utf8_strncpy (gchar *dest,
|
||||
const gchar *src,
|
||||
gsize n);
|
||||
|
||||
/* Find the UTF-8 character corresponding to ch, in string p. These
|
||||
functions are equivalents to strchr and strrchr */
|
||||
gchar* g_utf8_strchr (const gchar *p,
|
||||
gssize len,
|
||||
gunichar c);
|
||||
gssize len,
|
||||
gunichar c);
|
||||
gchar* g_utf8_strrchr (const gchar *p,
|
||||
gssize len,
|
||||
gunichar c);
|
||||
gssize len,
|
||||
gunichar c);
|
||||
gchar* g_utf8_strreverse (const gchar *str,
|
||||
gssize len);
|
||||
gssize len);
|
||||
|
||||
gunichar2 *g_utf8_to_utf16 (const gchar *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
gunichar * g_utf8_to_ucs4 (const gchar *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
gunichar * g_utf8_to_ucs4_fast (const gchar *str,
|
||||
glong len,
|
||||
glong *items_written) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_written) G_GNUC_MALLOC;
|
||||
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
gchar* g_utf16_to_utf8 (const gunichar2 *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
gunichar2 *g_ucs4_to_utf16 (const gunichar *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
gchar* g_ucs4_to_utf8 (const gunichar *str,
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
glong len,
|
||||
glong *items_read,
|
||||
glong *items_written,
|
||||
GError **error) G_GNUC_MALLOC;
|
||||
|
||||
/* Convert a single character into UTF-8. outbuf must have at
|
||||
* least 6 bytes of space. Returns the number of bytes in the
|
||||
* result.
|
||||
*/
|
||||
gint g_unichar_to_utf8 (gunichar c,
|
||||
gchar *outbuf);
|
||||
|
||||
/* Validate a UTF8 string, return TRUE if valid, put pointer to
|
||||
* first invalid char in **end
|
||||
*/
|
||||
gchar *outbuf);
|
||||
|
||||
gboolean g_utf8_validate (const gchar *str,
|
||||
gssize max_len,
|
||||
const gchar **end);
|
||||
|
||||
gchar *g_utf8_strup (const gchar *str,
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gchar *g_utf8_strdown (const gchar *str,
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gchar *g_utf8_casefold (const gchar *str,
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
|
||||
/**
|
||||
* GNormalizeMode:
|
||||
* @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
|
||||
* text content, such as the above-mentioned accent representation
|
||||
* @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT
|
||||
* @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with
|
||||
* composed forms rather than a maximally decomposed form
|
||||
* @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE
|
||||
* @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
|
||||
* "compatibility" characters in Unicode, such as SUPERSCRIPT THREE
|
||||
* to the standard forms (in this case DIGIT THREE). Formatting
|
||||
* information may be lost but for most text operations such
|
||||
* characters should be considered the same
|
||||
* @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL
|
||||
* @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
|
||||
* forms rather than a maximally decomposed form
|
||||
* @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE
|
||||
*
|
||||
* Defines how a Unicode string is transformed into a canonical
|
||||
* form, standardizing such issues as whether a character with
|
||||
* an accent is represented as a base character and combining
|
||||
* accent or as a single precomposed character. Unicode strings
|
||||
* should generally be normalized before comparing them.
|
||||
*/
|
||||
typedef enum {
|
||||
G_NORMALIZE_DEFAULT,
|
||||
G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
|
||||
@ -423,15 +695,15 @@ typedef enum {
|
||||
} GNormalizeMode;
|
||||
|
||||
gchar *g_utf8_normalize (const gchar *str,
|
||||
gssize len,
|
||||
GNormalizeMode mode) G_GNUC_MALLOC;
|
||||
gssize len,
|
||||
GNormalizeMode mode) G_GNUC_MALLOC;
|
||||
|
||||
gint g_utf8_collate (const gchar *str1,
|
||||
const gchar *str2) G_GNUC_PURE;
|
||||
const gchar *str2) G_GNUC_PURE;
|
||||
gchar *g_utf8_collate_key (const gchar *str,
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gchar *g_utf8_collate_key_for_filename (const gchar *str,
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
gssize len) G_GNUC_MALLOC;
|
||||
|
||||
|
||||
/* private */
|
||||
|
@ -19,6 +19,29 @@
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
/**
|
||||
* SECTION:unicode
|
||||
* @Title: Unicode Manipulation
|
||||
* @Short_description: functions operating on Unicode characters and
|
||||
* UTF-8 strings
|
||||
* @See_also: g_locale_to_utf8(), g_locale_from_utf8()
|
||||
*
|
||||
* This section describes a number of functions for dealing with
|
||||
* Unicode characters and strings. There are analogues of the
|
||||
* traditional <filename>ctype.h</filename> character classification
|
||||
* and case conversion functions, UTF-8 analogues of some string utility
|
||||
* functions, functions to perform normalization, case conversion and
|
||||
* collation on UTF-8 strings and finally functions to convert between
|
||||
* the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
|
||||
*
|
||||
* The implementations of the Unicode functions in GLib are based
|
||||
* on the Unicode Character Data tables, which are available from
|
||||
* <ulink url="http://www.unicode.org/">www.unicode.org</ulink>.
|
||||
* GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
|
||||
* GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1,
|
||||
* GLib 2.30 supports Unicode 6.0.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
Loading…
x
Reference in New Issue
Block a user