From 32d3bddae457f385f0ea2a94d89ac11b50a54d85 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Mon, 21 Oct 2024 17:05:04 +0100 Subject: [PATCH] =?UTF-8?q?gen-unicode-tables.pl:=20Add=20a=20new=20?= =?UTF-8?q?=E2=80=98either=E2=80=99=20compose=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See the big comment in the code for details. Essentially, this adds a new compose table specifically for the transitive closure of ‘either’ codepoints — codepoints which appear as the first codepoint in a composition pair and as the second codepoint in a composition pair (potentially, but not necessarily, the same pair); or which appear in a composition pair with an ‘either’ codepoint. This new compose table has to be symmetrically indexed, as the `COMPOSE_INDEX` macro doesn’t differentiate based on codepoint position (first or second). It’s not possible to achieve that with the main `compose_array` without making it absolutely huge (it’s currently about 150×40 in size and would have to become at least 150×150 in size). In contrast, the new `compose_either_array` is currently 15×15. Signed-off-by: Philip Withnall Helps: #3470 --- glib/gen-unicode-tables.pl | 109 ++++++- glib/gunicomp.h | 573 ++++++++++++++++++------------------- glib/gunidecomp.c | 14 +- 3 files changed, 398 insertions(+), 298 deletions(-) diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index 645ed67f1..db2210493 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -1166,11 +1166,12 @@ EOT sub enumerate_ordered { - my ($array) = @_; + my ($array, $should_filter) = @_; + $should_filter = defined $should_filter ? 
$should_filter : 1; my $n = 0; for my $code (sort { $a <=> $b } keys %$array) { - if ($array->{$code} == 1) { + if ($should_filter && $array->{$code} == 1) { delete $array->{$code}; next; } @@ -1238,6 +1239,20 @@ sub output_composition_table # The resulting value is a tuple containing the other expected codepoint in # the composition pair, and the composed codepoint. # + # ‘Either’ codepoints are also separated out into their own array within + # this domain. An ‘either’ codepoint is one which appears as the first + # codepoint of a composition pair and the second codepoint of a composition + # pair (not necessarily different pairs). These have to be separated out as + # `COMPOSE_INDEX` does not distinguish between first and second codepoints, + # so ‘either’ codepoints necessitate a symmetric array. If the full + # `compose_array` were symmetric, it would be huge. Since there are + # relatively few ‘either’ codepoints (15 at the time of writing in 2024), + # a separate symmetric array suffices for them. + # + # This means that ‘either’ codepoints have to be a transitive closure: any + # codepoint which appears in a first and second position is an ‘either’ + # codepoint, plus all codepoints which appear with an ‘either’ codepoint. + # # The main composition table (`compose_array`) is indexed by the first and # second codepoints of a composed pair. 
Because these are both in the same # indexing domain, this means a given codepoint can *only* appear as the @@ -1294,9 +1309,6 @@ sub output_composition_table } } - # Assign integer indices, removing singletons - my $n_first = enumerate_ordered (\%first); - # Now record the second character of each (non-singleton) decomposition for $code (keys %compositions) { @values = map { hex ($_) } split /\s+/, $compositions{$code}; @@ -1310,21 +1322,64 @@ sub output_composition_table } } - # Assign integer indices, removing duplicate + # See if there are any codepoints which occur as the first codepoint in one + # decomposition and the second in another. These need to be indexed + # separately, and need to be a transitive closure. + my $changed = 0; + do { + $changed = 0; + + for $code (keys %compositions) { + @values = map { hex ($_) } split /\s+/, $compositions{$code}; + + if (exists $first{$values[1]} || exists $second{$values[0]} || + (exists $either{$values[0]} && !exists $either{$values[1]}) || + (!exists $either{$values[0]} && exists $either{$values[1]})) { + if (exists $either{$values[0]}) { + $either{$values[0]}++; + } else { + $either{$values[0]} = 1; + } + if (exists $either{$values[1]}) { + $either{$values[1]}++; + } else { + $either{$values[1]} = 1; + } + + delete $first{$values[0]}; + delete $first{$values[1]}; + delete $second{$values[0]}; + delete $second{$values[1]}; + $changed = 1; + } + } + } while ($changed); + + # Assign integer indices, removing singletons from the first and second maps, + # but not from the either map (as it needs to be a transitive closure). 
+ my $n_first = enumerate_ordered (\%first); my $n_second = enumerate_ordered (\%second); + my $n_either = enumerate_ordered (\%either, 0); # Build reverse table - my @first_singletons; my @second_singletons; my %reverse; + my %reverse_either; + for $code (keys %compositions) { @values = map { hex ($_) } split /\s+/, $compositions{$code}; my $first = $first{$values[0]}; my $second = $second{$values[1]}; + my $either0 = $either{$values[0]}; + my $either1 = $either{$values[1]}; - if (defined $first && defined $second) { + if (defined $either0 && defined $either1) { + $reverse_either{"$either0|$either1"} = $code; + } elsif ((defined $either0 && !defined $either1) || (!defined $either0 && defined $either1)) { + die "‘either’ map is not a transitive closure for ", $values[0], " or ", $values[1]; + } elsif (defined $first && defined $second) { $reverse{"$first|$second"} = $code; } elsif (!defined $first) { push @first_singletons, [ $values[0], $values[1], $code ]; @@ -1380,6 +1435,15 @@ sub output_composition_table $vals{$code} = $i++ + $total; $last = $code if $code > $last; } + $total += @second_singletons; + printf OUT "#define COMPOSE_EITHER_START %d\n\n", $total; + for $code (keys %either) { + if (defined $vals{$code}) { + die "redefining $code as either"; + } + $vals{$code} = $either{$code} + $total; + $last = $code if $code > $last; + } printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256; @@ -1453,7 +1517,34 @@ EOT EOT $bytes_out += $n_first * $n_second * 4; - + + # Output array of ‘either’ codepoints — the codepoints which can appear as + # either the first or second in a composition pair. 
+ print OUT <= COMPOSE_SECOND_SINGLE_START) + if (index_b >= COMPOSE_SECOND_SINGLE_START && index_b < COMPOSE_EITHER_START) { if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) { @@ -330,6 +330,18 @@ combine (gunichar a, } } + if (index_a >= COMPOSE_EITHER_START && + index_b >= COMPOSE_EITHER_START) + { + gunichar res = compose_either_array[index_a - COMPOSE_EITHER_START][index_b - COMPOSE_EITHER_START]; + + if (res) + { + *result = res; + return TRUE; + } + } + return FALSE; }