glib/tests/gen-casemap-txt.pl

#! /usr/bin/perl -w

#    Copyright (C) 1998, 1999 Tom Tromey
#    Copyright (C) 2001 Red Hat Software

#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2, or (at your option)
#    any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#    02111-1307, USA.

# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
# I consider the output of this program to be unrestricted.  Use it as
# you will.

use utf8;

if (@ARGV != 3) {
    $0 =~ s@.*/@@;
    die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
}
 
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the SpecialCasing table
$CASE_CODE = 0;
$CASE_LOWER = 1;
$CASE_TITLE = 2;
$CASE_UPPER = 3;
$CASE_CONDITION = 4;

my @upper;
my @title;
my @lower;

open (INPUT, "< $ARGV[1]") || exit 1;

$last_code = -1;
while (<INPUT>)
{
    chop;
    @fields = split (';', $_, 30);
    if ($#fields != 14)
    {
	printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
    }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
    {
	# Found a gap.
	if ($fields[$NAME] =~ /Last>/)
	{
	    # Fill the gap with the last character read,
            # since this was a range specified in the char database
	    @gfields = @fields;
	}
	else
	{
	    # The gap represents undefined characters.  Only the type
	    # matters.
	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
			'', '', '', '');
	}
	for (++$last_code; $last_code < $code; ++$last_code)
	{
	    $gfields{$CODE} = sprintf ("%04x", $last_code);
	    &process_one ($last_code, @gfields);
	}
    }
    &process_one ($code, @fields);
    $last_code = $code;
}

close INPUT;

open (INPUT, "< $ARGV[2]") || exit 1;

while (<INPUT>)
{
    my $code;
    
    chop;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$CASE_CODE];
    $code = hex ($raw_code);

    if ($#fields != 4 && $#fields != 5)
    {
	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
	next;
    }

    if (defined $fields[5]) {
	# Ignore conditional special cases - we'll handle them manually
	next;
    }

    $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
    $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
    $title[$code] = &make_hex ($fields[$CASE_TITLE]);
}

close INPUT;

print <<EOT;
# Test cases generated from Unicode $ARGV[0] data
# by gen-case-tests.pl. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
# Handling of final and nonfinal sigma
	ΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	
	ΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	
	ΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
#
# Now the automatic tests
#
EOT
&print_tests;

exit 0;

# Process a single character.
sub process_one
{
    my ($code, @fields) = @_;

    my $type =  $fields[$CATEGORY];
    if ($type eq 'Ll')
    {
	$upper[$code] = make_hex ($fields[$UPPER]);
	$lower[$code] = pack ("U", $code);
	$title[$code] = make_hex ($fields[$TITLE]);
    }
    elsif ($type eq 'Lu')
    {
	$lower[$code] = make_hex ($fields[$LOWER]);
	$upper[$code] = pack ("U", $code);
	$title[$code] = make_hex ($fields[$TITLE]);
    }

    if ($type eq 'Lt')
    {
	$upper[$code] = make_hex ($fields[$UPPER]);
	$lower[$code] = pack ("U", hex ($fields[$LOWER]));
	$title[$code] = make_hex ($fields[$LOWER]);
    }
}

sub print_tests
{
    for ($i = 0; $i < 0xffff; $i++) {
	if ($i == 0x3A3) {
	    # Greek sigma needs special tests
	    next;
	}
	
	my $lower = $lower[$i];
	my $title = $title[$i];
	my $upper = $upper[$i];

	if (defined $upper || defined $lower || defined $title) {
	    printf "\t%s\t%s\t%s\t%s\t# %4X\n",
		    pack ("U", $i),
		    (defined $lower ? $lower : ""),
		    (defined $title ? $title : ""),
		    (defined $upper ? $upper : ""),
                    $i;
	}
    }
}

sub make_hex
{
    my $codes = shift;

    $codes =~ s/^\s+//;
    $codes =~ s/\s+$//;

    if ($codes eq "0" || $codes eq "") {
	return "";
    } else {
	return pack ("U*", map { hex ($_) } split /\s+/, $codes);
    }
}
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00			`#! /usr/bin/perl -w`

			`# Copyright (C) 1998, 1999 Tom Tromey`
			`# Copyright (C) 2001 Red Hat Software`

			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation; either version 2, or (at your option)`
			`# any later version.`

			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`

			`# You should have received a copy of the GNU General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA`
			`# 02111-1307, USA.`

			`# gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.`
			`# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html`
			`# I consider the output of this program to be unrestricted. Use it as`
			`# you will.`

			`use utf8;`

			`if (@ARGV != 3) {`
			`$0 =~ s@.*/@@;`
			`die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";`
			`}`

			`use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);`

			`# Names of fields in Unicode data table.`
			`$CODE = 0;`
			`$NAME = 1;`
			`$CATEGORY = 2;`
			`$COMBINING_CLASSES = 3;`
			`$BIDI_CATEGORY = 4;`
			`$DECOMPOSITION = 5;`
			`$DECIMAL_VALUE = 6;`
			`$DIGIT_VALUE = 7;`
			`$NUMERIC_VALUE = 8;`
			`$MIRRORED = 9;`
			`$OLD_NAME = 10;`
			`$COMMENT = 11;`
			`$UPPER = 12;`
			`$LOWER = 13;`
			`$TITLE = 14;`

			`# Names of fields in the SpecialCasing table`
			`$CASE_CODE = 0;`
			`$CASE_LOWER = 1;`
			`$CASE_TITLE = 2;`
			`$CASE_UPPER = 3;`
			`$CASE_CONDITION = 4;`

			`my @upper;`
			`my @title;`
			`my @lower;`

			`open (INPUT, "< $ARGV[1]") \|\| exit 1;`

			`$last_code = -1;`
			`while (<INPUT>)`
			`{`
			`chop;`
			`@fields = split (';', $_, 30);`
			`if ($#fields != 14)`
			`{`
			`printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);`
			`}`

			`$code = hex ($fields[$CODE]);`

			`last if ($code > 0xFFFF); # ignore characters out of the basic plane`

			`if ($code > $last_code + 1)`
			`{`
			`# Found a gap.`
			`if ($fields[$NAME] =~ /Last>/)`
			`{`
			`# Fill the gap with the last character read,`
			`# since this was a range specified in the char database`
			`@gfields = @fields;`
			`}`
			`else`
			`{`
			`# The gap represents undefined characters. Only the type`
			`# matters.`
			`@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',`
			`'', '', '', '');`
			`}`
			`for (++$last_code; $last_code < $code; ++$last_code)`
			`{`
			`$gfields{$CODE} = sprintf ("%04x", $last_code);`
			`&process_one ($last_code, @gfields);`
			`}`
			`}`
			`&process_one ($code, @fields);`
			`$last_code = $code;`
			`}`

			`close INPUT;`

			`open (INPUT, "< $ARGV[2]") \|\| exit 1;`

			`while (<INPUT>)`
			`{`
			`my $code;`

			`chop;`

			`next if /^#/;`
			`next if /^\s*$/;`

			`s/\s#.//;`

			`@fields = split ('\s;\s', $_, 30);`

			`$raw_code = $fields[$CASE_CODE];`
			`$code = hex ($raw_code);`

			`if ($#fields != 4 && $#fields != 5)`
			`{`
			`printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);`
			`next;`
			`}`

			`if (defined $fields[5]) {`
			`# Ignore conditional special cases - we'll handle them manually`
			`next;`
			`}`

			`$upper[$code] = &make_hex ($fields[$CASE_UPPER]);`
			`$lower[$code] = &make_hex ($fields[$CASE_LOWER]);`
			`$title[$code] = &make_hex ($fields[$CASE_TITLE]);`
			`}`

			`close INPUT;`

			`print <<EOT;`
			`# Test cases generated from Unicode $ARGV[0] data`
			`# by gen-case-tests.pl. Do not edit.`
			`#`
			`# Some special hand crafted tests`
			`#`
			`tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE`
			`tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I`
			`# Test reordering of YPOGEGRAMMENI across other accents`
			`\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t`
			`\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t`
			`# Handling of final and nonfinal sigma`
			`ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ`
			`ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ`
			`ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ`
			`# Lithuanian rule of i followed by letter with dot. Not at all sure`
			`# about the titlecase part here`
			`lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t`
			`lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t`
			`#`
			`# Now the automatic tests`
			`#`
			`EOT`
			`&print_tests;`

			`exit 0;`

			`# Process a single character.`
			`sub process_one`
			`{`
			`my ($code, @fields) = @_;`

			`my $type = $fields[$CATEGORY];`
			`if ($type eq 'Ll')`
			`{`
			`$upper[$code] = make_hex ($fields[$UPPER]);`
			`$lower[$code] = pack ("U", $code);`
			`$title[$code] = make_hex ($fields[$TITLE]);`
			`}`
			`elsif ($type eq 'Lu')`
			`{`
			`$lower[$code] = make_hex ($fields[$LOWER]);`
			`$upper[$code] = pack ("U", $code);`
			`$title[$code] = make_hex ($fields[$TITLE]);`
			`}`

			`if ($type eq 'Lt')`
			`{`
			`$upper[$code] = make_hex ($fields[$UPPER]);`
			`$lower[$code] = pack ("U", hex ($fields[$LOWER]));`
			`$title[$code] = make_hex ($fields[$LOWER]);`
			`}`
			`}`

			`sub print_tests`
			`{`
			`for ($i = 0; $i < 0xffff; $i++) {`
			`if ($i == 0x3A3) {`
			`# Greek sigma needs special tests`
			`next;`
			`}`

			`my $lower = $lower[$i];`
			`my $title = $title[$i];`
			`my $upper = $upper[$i];`

			`if (defined $upper \|\| defined $lower \|\| defined $title) {`
			`printf "\t%s\t%s\t%s\t%s\t# %4X\n",`
			`pack ("U", $i),`
			`(defined $lower ? $lower : ""),`
			`(defined $title ? $title : ""),`
			`(defined $upper ? $upper : ""),`
			`$i;`
			`}`
			`}`
			`}`

			`sub make_hex`
			`{`
			`my $codes = shift;`

			`$codes =~ s/^\s+//;`
			`$codes =~ s/\s+$//;`

			`if ($codes eq "0" \|\| $codes eq "") {`
			`return "";`
			`} else {`
			`return pack ("U*", map { hex ($_) } split /\s+/, $codes);`
			`}`
			`}`