v8-icu73-simple-case-folding.patch

From 572b80f2e906a826a499c4c5561b90b97a687f0e Mon Sep 17 00:00:00 2001
From: pthier <pthier@chromium.org>
Date: Tue, 18 Jul 2023 16:27:28 +0200
Subject: [PATCH] [regexp] Remove special handling for simple case folding

ICU 73 introduced creating closures using simple case folding.
We can directly use this method instead of our own special handling where simple case folding (required by JS spec) differs from full case
folding (the previously only supported mode in ICU).

Bug: v8:13377
Change-Id: I42bbcc37fe5c1f33a1d6c36f0d4ceb18a67a9b43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4694009
Commit-Queue: Patrick Thier <pthier@chromium.org>
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#89024}
---
 src/regexp/gen-regexp-special-case.cc | 48 ---------------------------
 src/regexp/regexp-ast.h               |  6 ----
 src/regexp/regexp-compiler-tonode.cc  | 24 +-------------
 src/regexp/regexp-parser.cc           |  2 +-
 src/regexp/special-case.h             | 10 ------
 5 files changed, 2 insertions(+), 88 deletions(-)

diff --git a/src/regexp/gen-regexp-special-case.cc b/src/regexp/gen-regexp-special-case.cc
index 55618f11783..86f6b212c93 100644
--- a/v8/src/regexp/gen-regexp-special-case.cc
+++ b/v8/src/regexp/gen-regexp-special-case.cc
@@ -9,7 +9,6 @@
 
 #include "src/base/strings.h"
 #include "src/regexp/special-case.h"
-#include "unicode/usetiter.h"
 
 namespace v8 {
 namespace internal {
@@ -127,52 +126,6 @@ void PrintSpecial(std::ofstream& out) {
   PrintSet(out, "SpecialAddSet", special_add);
 }
 
-void PrintUnicodeSpecial(std::ofstream& out) {
-  icu::UnicodeSet non_simple_folding;
-  icu::UnicodeSet current;
-  UErrorCode status = U_ZERO_ERROR;
-  // Look at all characters except white spaces.
-  icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);
-  CHECK_EQ(status, U_ZERO_ERROR);
-  icu::UnicodeSetIterator iter(interestingCP);
-  while (iter.next()) {
-    UChar32 c = iter.getCodepoint();
-    current.set(c, c);
-    current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();
-    CHECK(!current.isBogus());
-    // Remove characters from the closeover that have a simple case folding.
-    icu::UnicodeSet toRemove;
-    icu::UnicodeSetIterator closeOverIter(current);
-    while (closeOverIter.next()) {
-      UChar32 closeOverChar = closeOverIter.getCodepoint();
-      UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);
-      if (closeOverChar != closeOverSCF) {
-        toRemove.add(closeOverChar);
-      }
-    }
-    CHECK(!toRemove.isBogus());
-    current.removeAll(toRemove);
-
-    // The current character and its simple case folding are also always OK.
-    UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);
-    current.remove(c);
-    current.remove(scf);
-
-    // If there are any characters remaining, they were added due to full case
-    // foldings and shouldn't match the current charcter according to the spec.
-    if (!current.isEmpty()) {
-      // Ensure that the character doesn't have a simple case folding.
-      // Otherwise the current approach of simply removing the character from
-      // the set before calling closeOver won't work.
-      CHECK_EQ(c, scf);
-      non_simple_folding.add(c);
-    }
-  }
-  CHECK(!non_simple_folding.isBogus());
-
-  PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);
-}
-
 void WriteHeader(const char* header_filename) {
   std::ofstream out(header_filename);
   out << std::hex << std::setfill('0') << std::setw(4);
@@ -193,7 +146,6 @@ void WriteHeader(const char* header_filename) {
       << "namespace internal {\n\n";
 
   PrintSpecial(out);
-  PrintUnicodeSpecial(out);
 
   out << "\n"
       << "}  // namespace internal\n"
diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
index e7453ad3f8f..8e3bb12fce2 100644
--- a/v8/src/regexp/regexp-ast.h
+++ b/v8/src/regexp/regexp-ast.h
@@ -134,12 +134,6 @@ class CharacterRange {
   static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                         Zone* zone);
 
-#ifdef V8_INTL_SUPPORT
-  // Creates the closeOver of the given UnicodeSet, removing all
-  // characters/strings that can't be derived via simple case folding.
-  static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);
-#endif  // V8_INTL_SUPPORT
-
   bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
   base::uc32 from() const { return from_; }
   base::uc32 to() const { return to_; }
diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc
index 5ff16ee71d2..9c83e2332e8 100644
--- a/v8/src/regexp/regexp-compiler-tonode.cc
+++ b/v8/src/regexp/regexp-compiler-tonode.cc
@@ -423,27 +423,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
 
 }  // namespace
 
-#ifdef V8_INTL_SUPPORT
-// static
-void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
-  // Remove characters for which closeOver() adds full-case-folding equivalents
-  // because we should work only with simple case folding mappings.
-  icu::UnicodeSet non_simple = icu::UnicodeSet(set);
-  non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
-  set.removeAll(non_simple);
-
-  set.closeOver(USET_CASE_INSENSITIVE);
-  // Full case folding maps single characters to multiple characters.
-  // Those are represented as strings in the set. Remove them so that
-  // we end up with only simple and common case mappings.
-  set.removeAllStrings();
-
-  // Add characters that have non-simple case foldings again (they match
-  // themselves).
-  set.addAll(non_simple);
-}
-#endif  // V8_INTL_SUPPORT
-
 // static
 void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                                Zone* zone) {
@@ -465,8 +444,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
   }
   // Clear the ranges list without freeing the backing store.
   ranges->Rewind(0);
-
-  UnicodeSimpleCloseOver(set);
+  set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
   for (int i = 0; i < set.getRangeCount(); i++) {
     ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
   }
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 76ca02bf240..730dfb9da86 100644
--- a/v8/src/regexp/regexp-parser.cc
+++ b/v8/src/regexp/regexp-parser.cc
@@ -1897,7 +1897,7 @@ bool LookupPropertyValueName(UProperty property,
       ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
     }
     const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
-    if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);
+    if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
     set.removeAllStrings();
     if (negate) set.complement();
     for (int i = 0; i < set.getRangeCount(); i++) {
diff --git a/src/regexp/special-case.h b/src/regexp/special-case.h
index c80b94e976a..753c9231ede 100644
--- a/v8/src/regexp/special-case.h
+++ b/v8/src/regexp/special-case.h
@@ -71,21 +71,11 @@ namespace internal {
 // another character. Characters that match no other characters in
 // their equivalence class are added to IgnoreSet. Characters that
 // match at least one other character are added to SpecialAddSet.
-//
-// For unicode ignoreCase ("iu" and "iv"),
-// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in
-// the same equivalence class. This includes characaters that are in the same
-// equivalence class using full case folding. According to the spec, only
-// simple case folding shall be considered. We therefore create
-// UnicodeNonSimpleCloseOverSet containing all characters for which
-// UnicodeSet::closeOver adds characters that are not simple case folds. This
-// set should be used similar to IgnoreSet described above.
 
 class RegExpCaseFolding final : public AllStatic {
  public:
   static const icu::UnicodeSet& IgnoreSet();
   static const icu::UnicodeSet& SpecialAddSet();
-  static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();
 
   // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
   // Canonicalize) step 3, which is used to determine whether
Accepting request 1119084 from home:dziobian:gulgul-ultron:19

- New upstream release 27.0.1
  * Updated Chromium to 118.0.5993.89.
  * Fixed an issue where calling loadURL during some webContents url loading events could crash.
  * Fixed an issue where fully occluded windows would return an empty image from webContents.capturePage()
  * Fixed some redundant permission dialogs while screen sharing on Wayland.
- Fix typo installing icon file to wrong folder.
- Update to 27.0.0:
  * ABI break: NODE_MODULE_VERSION is now 118
  * Chromium 118.0.5993.32
  * Node 18.17.1
  * V8 11.8
  * The deprecated color scheme changed events in systemPreferences have been removed.
  * The systemPreferences.getAppLevelAppearance, systemPreferences.setAppLevelAppearance and systemPreferences.appLevelAppearance APIs have been removed, as well as the alternate-selected-control-text value for systemPreferences.getColor.
  * see https://www.electronjs.org/blog/electron-27-0 and https://www.electronjs.org/blog/electron-26-0 for new features
- Drop upstreamed patches
  * absl-uint128-do-not-assume-abi.patch
  * cpu-missing-uint8_t.patch
  * electron-24-components-missing-headers.patch
  * mojom-python3.12-imp.patch
  * re2-11-StringPiece.patch
  * swiftshader-llvm17.patch
- (Fedora) switch to bundled avif as Chromium no longer builds with avif 0.x
  * drop avif_image_decoder-AVIF_PIXEL_FORMAT_COUNT.patch
  * drop avif_image_decoder-libavif-1-mode.patch
  * drop avif_image_decoder-repetitionCount-clli.patch
- Drop no longer relevant chromium-86-fix-vaapi-on-intel.patch
- (Leap and Fedora) reverse upstream changes to build with re2 10
  * replace-StringPiece-with-string_view.patch
- (Fedora <39) reverse upstream changes to build with icu 71
  * v8-icu73-alt_calendar.patch
  * v8-icu73-simple-case-folding.patch
- Reverse upstream changes to build against old brotli
  * brotli-remove-shared-dictionary.patch
- Add patches to fix build errors
  * absl-make_unique-missing-include.patch
  * autofill_i18n_parsing_expressions-constexpr.patch
  * chromium-117-blink-BUILD-mnemonic.patch
  * decoder_buffer_side_data-missing-uint8_t.patch
  * disable-tests.patch
  * keyboard_util-gcc12-invalid-constexpr.patch
  * kwallet_dbus-missing-uint8_t.patch
  * material_color_utilities-tones-missing-round.patch
  * page_content_annotations_common-remove-tflite.patch
  * partition_root-attribute.patch
  * perfetto-numeric_storage-double_t.patch
  * sensor_reading-missing-int64_t-size_t.patch
  * simple_font_data-freetype-include.patch
  * utf_string_conversion_utils-missing-numeric_limits.patch

OBS-URL: https://build.opensuse.org/request/show/1119084
OBS-URL: https://build.opensuse.org/package/show/devel:languages:nodejs/nodejs-electron?expand=0&rev=103
2023-10-19 17:09:42 +00:00
			`From 572b80f2e906a826a499c4c5561b90b97a687f0e Mon Sep 17 00:00:00 2001`
			`From: pthier <pthier@chromium.org>`
			`Date: Tue, 18 Jul 2023 16:27:28 +0200`
			`Subject: [PATCH] [regexp] Remove special handling for simple case folding`

			`ICU 73 introduced creating closures using simple case folding.`
			`We can directly use this method instead of our own special handling where simple case folding (required by JS spec) differs from full case`
			`folding (the previously only supported mode in ICU).`

			`Bug: v8:13377`
			`Change-Id: I42bbcc37fe5c1f33a1d6c36f0d4ceb18a67a9b43`
			`Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4694009`
			`Commit-Queue: Patrick Thier <pthier@chromium.org>`
			`Reviewed-by: Jakob Linke <jgruber@chromium.org>`
			`Cr-Commit-Position: refs/heads/main@{#89024}`
			`---`
			`src/regexp/gen-regexp-special-case.cc \| 48 ---------------------------`
			`src/regexp/regexp-ast.h \| 6 ----`
			`src/regexp/regexp-compiler-tonode.cc \| 24 +-------------`
			`src/regexp/regexp-parser.cc \| 2 +-`
			`src/regexp/special-case.h \| 10 ------`
			`5 files changed, 2 insertions(+), 88 deletions(-)`

			`diff --git a/src/regexp/gen-regexp-special-case.cc b/src/regexp/gen-regexp-special-case.cc`
			`index 55618f11783..86f6b212c93 100644`
			`--- a/v8/src/regexp/gen-regexp-special-case.cc`
			`+++ b/v8/src/regexp/gen-regexp-special-case.cc`
			`@@ -9,7 +9,6 @@`

			`#include "src/base/strings.h"`
			`#include "src/regexp/special-case.h"`
			`-#include "unicode/usetiter.h"`

			`namespace v8 {`
			`namespace internal {`
			`@@ -127,52 +126,6 @@ void PrintSpecial(std::ofstream& out) {`
			`PrintSet(out, "SpecialAddSet", special_add);`
			`}`

			`-void PrintUnicodeSpecial(std::ofstream& out) {`
			`- icu::UnicodeSet non_simple_folding;`
			`- icu::UnicodeSet current;`
			`- UErrorCode status = U_ZERO_ERROR;`
			`- // Look at all characters except white spaces.`
			`- icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);`
			`- CHECK_EQ(status, U_ZERO_ERROR);`
			`- icu::UnicodeSetIterator iter(interestingCP);`
			`- while (iter.next()) {`
			`- UChar32 c = iter.getCodepoint();`
			`- current.set(c, c);`
			`- current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();`
			`- CHECK(!current.isBogus());`
			`- // Remove characters from the closeover that have a simple case folding.`
			`- icu::UnicodeSet toRemove;`
			`- icu::UnicodeSetIterator closeOverIter(current);`
			`- while (closeOverIter.next()) {`
			`- UChar32 closeOverChar = closeOverIter.getCodepoint();`
			`- UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);`
			`- if (closeOverChar != closeOverSCF) {`
			`- toRemove.add(closeOverChar);`
			`- }`
			`- }`
			`- CHECK(!toRemove.isBogus());`
			`- current.removeAll(toRemove);`
			`-`
			`- // The current character and its simple case folding are also always OK.`
			`- UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);`
			`- current.remove(c);`
			`- current.remove(scf);`
			`-`
			`- // If there are any characters remaining, they were added due to full case`
			`- // foldings and shouldn't match the current charcter according to the spec.`
			`- if (!current.isEmpty()) {`
			`- // Ensure that the character doesn't have a simple case folding.`
			`- // Otherwise the current approach of simply removing the character from`
			`- // the set before calling closeOver won't work.`
			`- CHECK_EQ(c, scf);`
			`- non_simple_folding.add(c);`
			`- }`
			`- }`
			`- CHECK(!non_simple_folding.isBogus());`
			`-`
			`- PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);`
			`-}`
			`-`
			`void WriteHeader(const char* header_filename) {`
			`std::ofstream out(header_filename);`
			`out << std::hex << std::setfill('0') << std::setw(4);`
			`@@ -193,7 +146,6 @@ void WriteHeader(const char* header_filename) {`
			`<< "namespace internal {\n\n";`

			`PrintSpecial(out);`
			`- PrintUnicodeSpecial(out);`

			`out << "\n"`
			`<< "} // namespace internal\n"`
			`diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h`
			`index e7453ad3f8f..8e3bb12fce2 100644`
			`--- a/v8/src/regexp/regexp-ast.h`
			`+++ b/v8/src/regexp/regexp-ast.h`
			`@@ -134,12 +134,6 @@ class CharacterRange {`
			`static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,`
			`Zone* zone);`

			`-#ifdef V8_INTL_SUPPORT`
			`- // Creates the closeOver of the given UnicodeSet, removing all`
			`- // characters/strings that can't be derived via simple case folding.`
			`- static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);`
			`-#endif // V8_INTL_SUPPORT`
			`-`
			`bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }`
			`base::uc32 from() const { return from_; }`
			`base::uc32 to() const { return to_; }`
			`diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc`
			`index 5ff16ee71d2..9c83e2332e8 100644`
			`--- a/v8/src/regexp/regexp-compiler-tonode.cc`
			`+++ b/v8/src/regexp/regexp-compiler-tonode.cc`
			`@@ -423,27 +423,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,`

			`} // namespace`

			`-#ifdef V8_INTL_SUPPORT`
			`-// static`
			`-void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {`
			`- // Remove characters for which closeOver() adds full-case-folding equivalents`
			`- // because we should work only with simple case folding mappings.`
			`- icu::UnicodeSet non_simple = icu::UnicodeSet(set);`
			`- non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());`
			`- set.removeAll(non_simple);`
			`-`
			`- set.closeOver(USET_CASE_INSENSITIVE);`
			`- // Full case folding maps single characters to multiple characters.`
			`- // Those are represented as strings in the set. Remove them so that`
			`- // we end up with only simple and common case mappings.`
			`- set.removeAllStrings();`
			`-`
			`- // Add characters that have non-simple case foldings again (they match`
			`- // themselves).`
			`- set.addAll(non_simple);`
			`-}`
			`-#endif // V8_INTL_SUPPORT`
			`-`
			`// static`
			`void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,`
			`Zone* zone) {`
			`@@ -465,8 +444,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,`
			`}`
			`// Clear the ranges list without freeing the backing store.`
			`ranges->Rewind(0);`
			`-`
			`- UnicodeSimpleCloseOver(set);`
			`+ set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);`
			`for (int i = 0; i < set.getRangeCount(); i++) {`
			`ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);`
			`}`
			`diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc`
			`index 76ca02bf240..730dfb9da86 100644`
			`--- a/v8/src/regexp/regexp-parser.cc`
			`+++ b/v8/src/regexp/regexp-parser.cc`
			`@@ -1897,7 +1897,7 @@ bool LookupPropertyValueName(UProperty property,`
			`ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);`
			`}`
			`const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);`
			`- if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);`
			`+ if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);`
			`set.removeAllStrings();`
			`if (negate) set.complement();`
			`for (int i = 0; i < set.getRangeCount(); i++) {`
			`diff --git a/src/regexp/special-case.h b/src/regexp/special-case.h`
			`index c80b94e976a..753c9231ede 100644`
			`--- a/v8/src/regexp/special-case.h`
			`+++ b/v8/src/regexp/special-case.h`
			`@@ -71,21 +71,11 @@ namespace internal {`
			`// another character. Characters that match no other characters in`
			`// their equivalence class are added to IgnoreSet. Characters that`
			`// match at least one other character are added to SpecialAddSet.`
			`-//`
			`-// For unicode ignoreCase ("iu" and "iv"),`
			`-// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in`
			`-// the same equivalence class. This includes characaters that are in the same`
			`-// equivalence class using full case folding. According to the spec, only`
			`-// simple case folding shall be considered. We therefore create`
			`-// UnicodeNonSimpleCloseOverSet containing all characters for which`
			`-// UnicodeSet::closeOver adds characters that are not simple case folds. This`
			`-// set should be used similar to IgnoreSet described above.`

			`class RegExpCaseFolding final : public AllStatic {`
			`public:`
			`static const icu::UnicodeSet& IgnoreSet();`
			`static const icu::UnicodeSet& SpecialAddSet();`
			`- static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();`

			`// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:`
			`// Canonicalize) step 3, which is used to determine whether`