forked from pool/nodejs-electron
195 lines
8.0 KiB
Diff
195 lines
8.0 KiB
Diff
|
|
From 572b80f2e906a826a499c4c5561b90b97a687f0e Mon Sep 17 00:00:00 2001
|
||
|
|
From: pthier <pthier@chromium.org>
|
||
|
|
Date: Tue, 18 Jul 2023 16:27:28 +0200
|
||
|
|
Subject: [PATCH] [regexp] Remove special handling for simple case folding
|
||
|
|
|
||
|
|
ICU 73 introduced creating closures using simple case folding.
|
||
|
|
We can directly use this method instead of our own special handling where simple case folding (required by JS spec) differs from full case
|
||
|
|
folding (the previously only supported mode in ICU).
|
||
|
|
|
||
|
|
Bug: v8:13377
|
||
|
|
Change-Id: I42bbcc37fe5c1f33a1d6c36f0d4ceb18a67a9b43
|
||
|
|
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4694009
|
||
|
|
Commit-Queue: Patrick Thier <pthier@chromium.org>
|
||
|
|
Reviewed-by: Jakob Linke <jgruber@chromium.org>
|
||
|
|
Cr-Commit-Position: refs/heads/main@{#89024}
|
||
|
|
---
|
||
|
|
src/regexp/gen-regexp-special-case.cc | 48 ---------------------------
|
||
|
|
src/regexp/regexp-ast.h | 6 ----
|
||
|
|
src/regexp/regexp-compiler-tonode.cc | 24 +-------------
|
||
|
|
src/regexp/regexp-parser.cc | 2 +-
|
||
|
|
src/regexp/special-case.h | 10 ------
|
||
|
|
5 files changed, 2 insertions(+), 88 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/src/regexp/gen-regexp-special-case.cc b/src/regexp/gen-regexp-special-case.cc
|
||
|
|
index 55618f11783..86f6b212c93 100644
|
||
|
|
--- a/v8/src/regexp/gen-regexp-special-case.cc
|
||
|
|
+++ b/v8/src/regexp/gen-regexp-special-case.cc
|
||
|
|
@@ -9,7 +9,6 @@
|
||
|
|
|
||
|
|
#include "src/base/strings.h"
|
||
|
|
#include "src/regexp/special-case.h"
|
||
|
|
-#include "unicode/usetiter.h"
|
||
|
|
|
||
|
|
namespace v8 {
|
||
|
|
namespace internal {
|
||
|
|
@@ -127,52 +126,6 @@ void PrintSpecial(std::ofstream& out) {
|
||
|
|
PrintSet(out, "SpecialAddSet", special_add);
|
||
|
|
}
|
||
|
|
|
||
|
|
-void PrintUnicodeSpecial(std::ofstream& out) {
|
||
|
|
- icu::UnicodeSet non_simple_folding;
|
||
|
|
- icu::UnicodeSet current;
|
||
|
|
- UErrorCode status = U_ZERO_ERROR;
|
||
|
|
- // Look at all characters except white spaces.
|
||
|
|
- icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);
|
||
|
|
- CHECK_EQ(status, U_ZERO_ERROR);
|
||
|
|
- icu::UnicodeSetIterator iter(interestingCP);
|
||
|
|
- while (iter.next()) {
|
||
|
|
- UChar32 c = iter.getCodepoint();
|
||
|
|
- current.set(c, c);
|
||
|
|
- current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();
|
||
|
|
- CHECK(!current.isBogus());
|
||
|
|
- // Remove characters from the closeover that have a simple case folding.
|
||
|
|
- icu::UnicodeSet toRemove;
|
||
|
|
- icu::UnicodeSetIterator closeOverIter(current);
|
||
|
|
- while (closeOverIter.next()) {
|
||
|
|
- UChar32 closeOverChar = closeOverIter.getCodepoint();
|
||
|
|
- UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);
|
||
|
|
- if (closeOverChar != closeOverSCF) {
|
||
|
|
- toRemove.add(closeOverChar);
|
||
|
|
- }
|
||
|
|
- }
|
||
|
|
- CHECK(!toRemove.isBogus());
|
||
|
|
- current.removeAll(toRemove);
|
||
|
|
-
|
||
|
|
- // The current character and its simple case folding are also always OK.
|
||
|
|
- UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||
|
|
- current.remove(c);
|
||
|
|
- current.remove(scf);
|
||
|
|
-
|
||
|
|
- // If there are any characters remaining, they were added due to full case
|
||
|
|
- // foldings and shouldn't match the current charcter according to the spec.
|
||
|
|
- if (!current.isEmpty()) {
|
||
|
|
- // Ensure that the character doesn't have a simple case folding.
|
||
|
|
- // Otherwise the current approach of simply removing the character from
|
||
|
|
- // the set before calling closeOver won't work.
|
||
|
|
- CHECK_EQ(c, scf);
|
||
|
|
- non_simple_folding.add(c);
|
||
|
|
- }
|
||
|
|
- }
|
||
|
|
- CHECK(!non_simple_folding.isBogus());
|
||
|
|
-
|
||
|
|
- PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);
|
||
|
|
-}
|
||
|
|
-
|
||
|
|
void WriteHeader(const char* header_filename) {
|
||
|
|
std::ofstream out(header_filename);
|
||
|
|
out << std::hex << std::setfill('0') << std::setw(4);
|
||
|
|
@@ -193,7 +146,6 @@ void WriteHeader(const char* header_filename) {
|
||
|
|
<< "namespace internal {\n\n";
|
||
|
|
|
||
|
|
PrintSpecial(out);
|
||
|
|
- PrintUnicodeSpecial(out);
|
||
|
|
|
||
|
|
out << "\n"
|
||
|
|
<< "} // namespace internal\n"
|
||
|
|
diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
|
||
|
|
index e7453ad3f8f..8e3bb12fce2 100644
|
||
|
|
--- a/v8/src/regexp/regexp-ast.h
|
||
|
|
+++ b/v8/src/regexp/regexp-ast.h
|
||
|
|
@@ -134,12 +134,6 @@ class CharacterRange {
|
||
|
|
static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
||
|
|
Zone* zone);
|
||
|
|
|
||
|
|
-#ifdef V8_INTL_SUPPORT
|
||
|
|
- // Creates the closeOver of the given UnicodeSet, removing all
|
||
|
|
- // characters/strings that can't be derived via simple case folding.
|
||
|
|
- static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);
|
||
|
|
-#endif // V8_INTL_SUPPORT
|
||
|
|
-
|
||
|
|
bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
|
||
|
|
base::uc32 from() const { return from_; }
|
||
|
|
base::uc32 to() const { return to_; }
|
||
|
|
diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc
|
||
|
|
index 5ff16ee71d2..9c83e2332e8 100644
|
||
|
|
--- a/v8/src/regexp/regexp-compiler-tonode.cc
|
||
|
|
+++ b/v8/src/regexp/regexp-compiler-tonode.cc
|
||
|
|
@@ -423,27 +423,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
|
||
|
|
|
||
|
|
} // namespace
|
||
|
|
|
||
|
|
-#ifdef V8_INTL_SUPPORT
|
||
|
|
-// static
|
||
|
|
-void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
|
||
|
|
- // Remove characters for which closeOver() adds full-case-folding equivalents
|
||
|
|
- // because we should work only with simple case folding mappings.
|
||
|
|
- icu::UnicodeSet non_simple = icu::UnicodeSet(set);
|
||
|
|
- non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
|
||
|
|
- set.removeAll(non_simple);
|
||
|
|
-
|
||
|
|
- set.closeOver(USET_CASE_INSENSITIVE);
|
||
|
|
- // Full case folding maps single characters to multiple characters.
|
||
|
|
- // Those are represented as strings in the set. Remove them so that
|
||
|
|
- // we end up with only simple and common case mappings.
|
||
|
|
- set.removeAllStrings();
|
||
|
|
-
|
||
|
|
- // Add characters that have non-simple case foldings again (they match
|
||
|
|
- // themselves).
|
||
|
|
- set.addAll(non_simple);
|
||
|
|
-}
|
||
|
|
-#endif // V8_INTL_SUPPORT
|
||
|
|
-
|
||
|
|
// static
|
||
|
|
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
||
|
|
Zone* zone) {
|
||
|
|
@@ -465,8 +444,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
||
|
|
}
|
||
|
|
// Clear the ranges list without freeing the backing store.
|
||
|
|
ranges->Rewind(0);
|
||
|
|
-
|
||
|
|
- UnicodeSimpleCloseOver(set);
|
||
|
|
+ set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
|
||
|
|
for (int i = 0; i < set.getRangeCount(); i++) {
|
||
|
|
ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
|
||
|
|
}
|
||
|
|
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
|
||
|
|
index 76ca02bf240..730dfb9da86 100644
|
||
|
|
--- a/v8/src/regexp/regexp-parser.cc
|
||
|
|
+++ b/v8/src/regexp/regexp-parser.cc
|
||
|
|
@@ -1897,7 +1897,7 @@ bool LookupPropertyValueName(UProperty property,
|
||
|
|
ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
|
||
|
|
}
|
||
|
|
const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
|
||
|
|
- if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);
|
||
|
|
+ if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
|
||
|
|
set.removeAllStrings();
|
||
|
|
if (negate) set.complement();
|
||
|
|
for (int i = 0; i < set.getRangeCount(); i++) {
|
||
|
|
diff --git a/src/regexp/special-case.h b/src/regexp/special-case.h
|
||
|
|
index c80b94e976a..753c9231ede 100644
|
||
|
|
--- a/v8/src/regexp/special-case.h
|
||
|
|
+++ b/v8/src/regexp/special-case.h
|
||
|
|
@@ -71,21 +71,11 @@ namespace internal {
|
||
|
|
// another character. Characters that match no other characters in
|
||
|
|
// their equivalence class are added to IgnoreSet. Characters that
|
||
|
|
// match at least one other character are added to SpecialAddSet.
|
||
|
|
-//
|
||
|
|
-// For unicode ignoreCase ("iu" and "iv"),
|
||
|
|
-// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in
|
||
|
|
-// the same equivalence class. This includes characaters that are in the same
|
||
|
|
-// equivalence class using full case folding. According to the spec, only
|
||
|
|
-// simple case folding shall be considered. We therefore create
|
||
|
|
-// UnicodeNonSimpleCloseOverSet containing all characters for which
|
||
|
|
-// UnicodeSet::closeOver adds characters that are not simple case folds. This
|
||
|
|
-// set should be used similar to IgnoreSet described above.
|
||
|
|
|
||
|
|
class RegExpCaseFolding final : public AllStatic {
|
||
|
|
public:
|
||
|
|
static const icu::UnicodeSet& IgnoreSet();
|
||
|
|
static const icu::UnicodeSet& SpecialAddSet();
|
||
|
|
- static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();
|
||
|
|
|
||
|
|
// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
|
||
|
|
// Canonicalize) step 3, which is used to determine whether
|