forked from pool/diffutils
Accepting request 437384 from home:Andreas_Schwab:Factory
- gnulib-diffseq.patch, big-file-performance.patch: Avoid performance regression on big files (bsc#1004991) OBS-URL: https://build.opensuse.org/request/show/437384 OBS-URL: https://build.opensuse.org/package/show/Base:System/diffutils?expand=0&rev=42
This commit is contained in:
parent
5c37307dbc
commit
9c171e3315
50
big-file-performance.patch
Normal file
50
big-file-performance.patch
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
From 68b82f6f8419a815cfcf962b3061352d414dc606 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paul Eggert <eggert@cs.ucla.edu>
|
||||||
|
Date: Tue, 25 Oct 2016 21:57:56 -0700
|
||||||
|
Subject: [PATCH] diff: fix big performance degradation in 3.4
|
||||||
|
|
||||||
|
* NEWS, doc/diffutils.texi (Overview): Document this.
|
||||||
|
* src/analyze.c (diff_2_files): Restore too_expensive heuristic,
|
||||||
|
but this time with a floor that is 16 times the old floor. This
|
||||||
|
should fix Bug#16848, by generating good-quality output for its
|
||||||
|
test case, while not introducing Bug#24715, by running nearly as
|
||||||
|
fast as diff-3.3 for that test case.
|
||||||
|
---
|
||||||
|
NEWS | 5 +++++
|
||||||
|
doc/diffutils.texi | 5 ++++-
|
||||||
|
src/analyze.c | 11 ++++++++++-
|
||||||
|
3 files changed, 19 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
Index: diffutils-3.5/src/analyze.c
|
||||||
|
===================================================================
|
||||||
|
--- diffutils-3.5.orig/src/analyze.c
|
||||||
|
+++ diffutils-3.5/src/analyze.c
|
||||||
|
@@ -534,6 +534,7 @@ diff_2_files (struct comparison *cmp)
|
||||||
|
{
|
||||||
|
struct context ctxt;
|
||||||
|
lin diags;
|
||||||
|
+ lin too_expensive;
|
||||||
|
|
||||||
|
/* Allocate vectors for the results of comparison:
|
||||||
|
a flag for each line of each file, saying whether that line
|
||||||
|
@@ -565,11 +566,19 @@ diff_2_files (struct comparison *cmp)
|
||||||
|
|
||||||
|
ctxt.heuristic = speed_large_files;
|
||||||
|
|
||||||
|
+ /* Set TOO_EXPENSIVE to be the approximate square root of the
|
||||||
|
+ input size, bounded below by 4096. 4096 seems to be good for
|
||||||
|
+ circa-2016 CPUs; see Bug#16848 and Bug#24715. */
|
||||||
|
+ too_expensive = 1;
|
||||||
|
+ for (; diags != 0; diags >>= 2)
|
||||||
|
+ too_expensive <<= 1;
|
||||||
|
+ ctxt.too_expensive = MAX (4096, too_expensive);
|
||||||
|
+
|
||||||
|
files[0] = cmp->file[0];
|
||||||
|
files[1] = cmp->file[1];
|
||||||
|
|
||||||
|
compareseq (0, cmp->file[0].nondiscarded_lines,
|
||||||
|
- 0, cmp->file[1].nondiscarded_lines, &ctxt);
|
||||||
|
+ 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
|
||||||
|
|
||||||
|
free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
|
||||||
|
|
@ -1,3 +1,9 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Wed Oct 26 07:33:19 UTC 2016 - schwab@suse.de
|
||||||
|
|
||||||
|
- gnulib-diffseq.patch, big-file-performance.patch: Avoid performance
|
||||||
|
regression on big files (bsc#1004991)
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Mon Aug 22 20:34:19 UTC 2016 - astieger@suse.com
|
Mon Aug 22 20:34:19 UTC 2016 - astieger@suse.com
|
||||||
|
|
||||||
|
@ -27,6 +27,8 @@ Source0: https://ftp.gnu.org/gnu/%{name}/%{name}-%{version}.tar.xz
|
|||||||
Source1: https://ftp.gnu.org/gnu/%{name}/%{name}-%{version}.tar.xz.sig
|
Source1: https://ftp.gnu.org/gnu/%{name}/%{name}-%{version}.tar.xz.sig
|
||||||
# http://savannah.gnu.org/project/memberlist-gpgkeys.php?group=diffutils
|
# http://savannah.gnu.org/project/memberlist-gpgkeys.php?group=diffutils
|
||||||
Source2: %{name}.keyring
|
Source2: %{name}.keyring
|
||||||
|
Patch1: gnulib-diffseq.patch
|
||||||
|
Patch2: big-file-performance.patch
|
||||||
BuildRequires: xz
|
BuildRequires: xz
|
||||||
Requires(pre): %{install_info_prereq}
|
Requires(pre): %{install_info_prereq}
|
||||||
Requires(preun): %{install_info_prereq}
|
Requires(preun): %{install_info_prereq}
|
||||||
@ -43,6 +45,8 @@ make source code patches, for instance.
|
|||||||
|
|
||||||
%prep
|
%prep
|
||||||
%setup -q
|
%setup -q
|
||||||
|
%patch1 -p1
|
||||||
|
%patch2 -p1
|
||||||
|
|
||||||
%build
|
%build
|
||||||
export CFLAGS="%{optflags} -fPIE"
|
export CFLAGS="%{optflags} -fPIE"
|
||||||
|
244
gnulib-diffseq.patch
Normal file
244
gnulib-diffseq.patch
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
From d55ed0636eefeb43ad1aa8123416c33839652646 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paul Eggert <eggert@cs.ucla.edu>
|
||||||
|
Date: Tue, 25 Oct 2016 14:59:29 -0700
|
||||||
|
Subject: [PATCH] diffseq: restore TOO_EXPENSIVE heuristic
|
||||||
|
|
||||||
|
* lib/diffseq.h: Problem with diffutils reported by Andreas Schwab
|
||||||
|
(Bug#24715). The simplest solution is to restore the
|
||||||
|
TOO_EXPENSIVE heuristic that I added to GNU diff in 1993, while
|
||||||
|
using a higher threshold to avoid Bug#16848 on smaller files.
|
||||||
|
* lib/diffseq.h (struct context): Restore member too_expensive.
|
||||||
|
(struct partition): Restore members lo_minimal, hi_minimal.
|
||||||
|
(diag, compareseq): Restore arg find_minimal. All uses changed.
|
||||||
|
(diag): Restore the TOO_EXPENSIVE heuristic that I added back in
|
||||||
|
1993 to make 'diff' run faster (but not as well) on large inputs,
|
||||||
|
but use a threshold of 4096 instead of the old 256.
|
||||||
|
* lib/fstrcmp.c (strcmp_bounded):
|
||||||
|
* lib/git-merge-changelog.c (compute_differences):
|
||||||
|
Adjust to diffseq.h changes.
|
||||||
|
---
|
||||||
|
ChangeLog | 17 +++++++
|
||||||
|
lib/diffseq.h | 118 ++++++++++++++++++++++++++++++++++++++++++----
|
||||||
|
lib/fstrcmp.c | 10 +++-
|
||||||
|
lib/git-merge-changelog.c | 3 +-
|
||||||
|
4 files changed, 137 insertions(+), 11 deletions(-)
|
||||||
|
|
||||||
|
Index: diffutils-3.5/lib/diffseq.h
|
||||||
|
===================================================================
|
||||||
|
--- diffutils-3.5.orig/lib/diffseq.h
|
||||||
|
+++ diffutils-3.5/lib/diffseq.h
|
||||||
|
@@ -34,7 +34,12 @@
|
||||||
|
The basic algorithm was independently discovered as described in:
|
||||||
|
"Algorithms for Approximate String Matching", Esko Ukkonen,
|
||||||
|
Information and Control Vol. 64, 1985, pp. 100-118,
|
||||||
|
- <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>. */
|
||||||
|
+ <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>.
|
||||||
|
+
|
||||||
|
+ Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
|
||||||
|
+ heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
|
||||||
|
+ at the price of producing suboptimal output for large inputs with
|
||||||
|
+ many differences. */
|
||||||
|
|
||||||
|
/* Before including this file, you need to define:
|
||||||
|
ELEMENT The element type of the vectors being compared.
|
||||||
|
@@ -123,6 +128,9 @@ struct context
|
||||||
|
bool heuristic;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
+ /* Edit scripts longer than this are too expensive to compute. */
|
||||||
|
+ OFFSET too_expensive;
|
||||||
|
+
|
||||||
|
/* Snakes bigger than this are considered "big". */
|
||||||
|
#define SNAKE_LIMIT 20
|
||||||
|
};
|
||||||
|
@@ -132,6 +140,12 @@ struct partition
|
||||||
|
/* Midpoints of this partition. */
|
||||||
|
OFFSET xmid;
|
||||||
|
OFFSET ymid;
|
||||||
|
+
|
||||||
|
+ /* True if low half will be analyzed minimally. */
|
||||||
|
+ bool lo_minimal;
|
||||||
|
+
|
||||||
|
+ /* Likewise for high half. */
|
||||||
|
+ bool hi_minimal;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@@ -143,10 +157,17 @@ struct partition
|
||||||
|
When the two searches meet, we have found the midpoint of the shortest
|
||||||
|
edit sequence.
|
||||||
|
|
||||||
|
- Set *PART to the midpoint (XMID,YMID). The diagonal number
|
||||||
|
+ If FIND_MINIMAL is true, find the minimal edit script regardless of
|
||||||
|
+ expense. Otherwise, if the search is too expensive, use heuristics to
|
||||||
|
+ stop the search and report a suboptimal answer.
|
||||||
|
+
|
||||||
|
+ Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
|
||||||
|
XMID - YMID equals the number of inserted elements minus the number
|
||||||
|
of deleted elements (counting only elements before the midpoint).
|
||||||
|
|
||||||
|
+ Set PART->lo_minimal to true iff the minimal edit script for the
|
||||||
|
+ left half of the partition is known; similarly for PART->hi_minimal.
|
||||||
|
+
|
||||||
|
This function assumes that the first elements of the specified portions
|
||||||
|
of the two vectors do not match, and likewise that the last elements do not
|
||||||
|
match. The caller must trim matching elements from the beginning and end
|
||||||
|
@@ -156,7 +177,7 @@ struct partition
|
||||||
|
suboptimal diff output. It cannot cause incorrect diff output. */
|
||||||
|
|
||||||
|
static void
|
||||||
|
-diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
|
||||||
|
+diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
|
||||||
|
struct partition *part, struct context *ctxt)
|
||||||
|
{
|
||||||
|
OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
|
||||||
|
@@ -216,6 +237,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
{
|
||||||
|
part->xmid = x;
|
||||||
|
part->ymid = y;
|
||||||
|
+ part->lo_minimal = part->hi_minimal = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@@ -248,10 +270,14 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
{
|
||||||
|
part->xmid = x;
|
||||||
|
part->ymid = y;
|
||||||
|
+ part->lo_minimal = part->hi_minimal = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+ if (find_minimal)
|
||||||
|
+ continue;
|
||||||
|
+
|
||||||
|
#ifdef USE_HEURISTIC
|
||||||
|
/* Heuristic: check occasionally for a diagonal that has made lots
|
||||||
|
of progress compared with the edit distance. If we have any
|
||||||
|
@@ -295,7 +321,11 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (best > 0)
|
||||||
|
- return;
|
||||||
|
+ {
|
||||||
|
+ part->lo_minimal = true;
|
||||||
|
+ part->hi_minimal = false;
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
@@ -330,10 +360,77 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (best > 0)
|
||||||
|
- return;
|
||||||
|
+ {
|
||||||
|
+ part->lo_minimal = false;
|
||||||
|
+ part->hi_minimal = true;
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /* USE_HEURISTIC */
|
||||||
|
+
|
||||||
|
+ /* Heuristic: if we've gone well beyond the call of duty, give up
|
||||||
|
+ and report halfway between our best results so far. */
|
||||||
|
+ if (c >= ctxt->too_expensive)
|
||||||
|
+ {
|
||||||
|
+ OFFSET fxybest;
|
||||||
|
+ OFFSET fxbest IF_LINT (= 0);
|
||||||
|
+ OFFSET bxybest;
|
||||||
|
+ OFFSET bxbest IF_LINT (= 0);
|
||||||
|
+
|
||||||
|
+ /* Find forward diagonal that maximizes X + Y. */
|
||||||
|
+ fxybest = -1;
|
||||||
|
+ for (d = fmax; d >= fmin; d -= 2)
|
||||||
|
+ {
|
||||||
|
+ OFFSET x = MIN (fd[d], xlim);
|
||||||
|
+ OFFSET y = x - d;
|
||||||
|
+ if (ylim < y)
|
||||||
|
+ {
|
||||||
|
+ x = ylim + d;
|
||||||
|
+ y = ylim;
|
||||||
|
+ }
|
||||||
|
+ if (fxybest < x + y)
|
||||||
|
+ {
|
||||||
|
+ fxybest = x + y;
|
||||||
|
+ fxbest = x;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Find backward diagonal that minimizes X + Y. */
|
||||||
|
+ bxybest = OFFSET_MAX;
|
||||||
|
+ for (d = bmax; d >= bmin; d -= 2)
|
||||||
|
+ {
|
||||||
|
+ OFFSET x = MAX (xoff, bd[d]);
|
||||||
|
+ OFFSET y = x - d;
|
||||||
|
+ if (y < yoff)
|
||||||
|
+ {
|
||||||
|
+ x = yoff + d;
|
||||||
|
+ y = yoff;
|
||||||
|
+ }
|
||||||
|
+ if (x + y < bxybest)
|
||||||
|
+ {
|
||||||
|
+ bxybest = x + y;
|
||||||
|
+ bxbest = x;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Use the better of the two diagonals. */
|
||||||
|
+ if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
|
||||||
|
+ {
|
||||||
|
+ part->xmid = fxbest;
|
||||||
|
+ part->ymid = fxybest - fxbest;
|
||||||
|
+ part->lo_minimal = true;
|
||||||
|
+ part->hi_minimal = false;
|
||||||
|
+ }
|
||||||
|
+ else
|
||||||
|
+ {
|
||||||
|
+ part->xmid = bxbest;
|
||||||
|
+ part->ymid = bxybest - bxbest;
|
||||||
|
+ part->lo_minimal = false;
|
||||||
|
+ part->hi_minimal = true;
|
||||||
|
+ }
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
#undef XREF_YREF_EQUAL
|
||||||
|
}
|
||||||
|
@@ -347,6 +444,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
|
||||||
|
are origin-0.
|
||||||
|
|
||||||
|
+ If FIND_MINIMAL, find a minimal difference no matter how
|
||||||
|
+ expensive it is.
|
||||||
|
+
|
||||||
|
The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
|
||||||
|
|
||||||
|
Return false if terminated normally, or true if terminated through early
|
||||||
|
@@ -354,7 +454,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||||
|
|
||||||
|
static bool
|
||||||
|
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
|
||||||
|
- struct context *ctxt)
|
||||||
|
+ bool find_minimal, struct context *ctxt)
|
||||||
|
{
|
||||||
|
#ifdef ELEMENT
|
||||||
|
ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
|
||||||
|
@@ -400,12 +500,12 @@ compareseq (OFFSET xoff, OFFSET xlim, OF
|
||||||
|
struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
|
||||||
|
|
||||||
|
/* Find a point of correspondence in the middle of the vectors. */
|
||||||
|
- diag (xoff, xlim, yoff, ylim, &part, ctxt);
|
||||||
|
+ diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
|
||||||
|
|
||||||
|
/* Use the partitions to split this problem into subproblems. */
|
||||||
|
- if (compareseq (xoff, part.xmid, yoff, part.ymid, ctxt))
|
||||||
|
+ if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
|
||||||
|
return true;
|
||||||
|
- if (compareseq (part.xmid, xlim, part.ymid, ylim, ctxt))
|
||||||
|
+ if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user