forked from pool/diffutils
This commit is contained in:
parent
3308ee6d2e
commit
0fc958104e
50
big-file-performance.patch
Normal file
50
big-file-performance.patch
Normal file
@ -0,0 +1,50 @@
|
||||
From 68b82f6f8419a815cfcf962b3061352d414dc606 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Eggert <eggert@cs.ucla.edu>
|
||||
Date: Tue, 25 Oct 2016 21:57:56 -0700
|
||||
Subject: [PATCH] diff: fix big performance degradation in 3.4
|
||||
|
||||
* NEWS, doc/diffutils.texi (Overview): Document this.
|
||||
* src/analyze.c (diff_2_files): Restore too_expensive heuristic,
|
||||
but this time with a floor that is 16 times the old floor. This
|
||||
should fix Bug#16848, by generating good-quality output for its
|
||||
test case, while not introducing Bug#24715, by running nearly as
|
||||
fast as diff-3.3 for that test case.
|
||||
---
|
||||
NEWS | 5 +++++
|
||||
doc/diffutils.texi | 5 ++++-
|
||||
src/analyze.c | 11 ++++++++++-
|
||||
3 files changed, 19 insertions(+), 2 deletions(-)
|
||||
|
||||
Index: diffutils-3.5/src/analyze.c
|
||||
===================================================================
|
||||
--- diffutils-3.5.orig/src/analyze.c
|
||||
+++ diffutils-3.5/src/analyze.c
|
||||
@@ -534,6 +534,7 @@ diff_2_files (struct comparison *cmp)
|
||||
{
|
||||
struct context ctxt;
|
||||
lin diags;
|
||||
+ lin too_expensive;
|
||||
|
||||
/* Allocate vectors for the results of comparison:
|
||||
a flag for each line of each file, saying whether that line
|
||||
@@ -565,11 +566,19 @@ diff_2_files (struct comparison *cmp)
|
||||
|
||||
ctxt.heuristic = speed_large_files;
|
||||
|
||||
+ /* Set TOO_EXPENSIVE to be the approximate square root of the
|
||||
+ input size, bounded below by 4096. 4096 seems to be good for
|
||||
+ circa-2016 CPUs; see Bug#16848 and Bug#24715. */
|
||||
+ too_expensive = 1;
|
||||
+ for (; diags != 0; diags >>= 2)
|
||||
+ too_expensive <<= 1;
|
||||
+ ctxt.too_expensive = MAX (4096, too_expensive);
|
||||
+
|
||||
files[0] = cmp->file[0];
|
||||
files[1] = cmp->file[1];
|
||||
|
||||
compareseq (0, cmp->file[0].nondiscarded_lines,
|
||||
- 0, cmp->file[1].nondiscarded_lines, &ctxt);
|
||||
+ 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
|
||||
|
||||
free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
|
||||
|
3
diffutils-3.5.tar.xz
Normal file
3
diffutils-3.5.tar.xz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:dad398ccd5b9faca6b0ab219a036453f62a602a56203ac659b43e889bec35533
|
||||
size 1360996
|
16
diffutils-3.5.tar.xz.sig
Normal file
16
diffutils-3.5.tar.xz.sig
Normal file
@ -0,0 +1,16 @@
|
||||
-----BEGIN PGP SIGNATURE-----
|
||||
|
||||
iQIcBAABCgAGBQJXuT2mAAoJEH/Z/MsAC+7uECsP/0wVZOh74XKI0Y4XPxqhgKou
|
||||
L/uFMgqpq2P9Uwr8jnJCr3xsKWCHYSYXOjmzbQ21wkqDWynT+NbLRcH4HLvO6vU3
|
||||
EWtoBor7UG0weTDanNfRBFjVLWsHsDWJj7VGMja9OAkXjpqo0f+iYHHIbJ+oKlIY
|
||||
gzNqUdjGg8RpsvNapz4XuTsoUNDrTtVOy/k9xHUZCw/h1cZBVpaAU8MEE3MReab6
|
||||
pOn660BlVqT50vMd09FKRuTLktJ2LBFZ6x+xdPBJm5LFdUFqClbiNaNv+idhlvfB
|
||||
GC8qjBr4WhuCtGpJKLFADTOZ8UOxcmx2sNz0ypiQrLT1UkTUtY3B0ADnzWuMEcwx
|
||||
eaPNzdJhPExY64i7MA2vc2MxlRb7omj8kI+n0rBpiFKLMI3x3ZWf6Papg/acYbJg
|
||||
0NHQkqdc82gH3vsp5DX/wNn+3TNwks9ziVt7Jervk7uQqWaDLrah3waBN3q5UWsk
|
||||
HLAlkhb94Ahi+cNMk1oiNKqT+en3AhZ/7O6imKzTM8bTk27Ek7q3ThLfBeUKcp2O
|
||||
j1aaPbaDGEL9pZZDCeuZCSdRZDGMY+spGNYRC4pmYCL9C2LsH5jtX5ob9gQsPHt3
|
||||
XFsi5l83i5N3amwzb2OdSTEwM0xgPX8TmcN435409gvz/VVmeSmx5jhHb9m8SLSk
|
||||
SrDXEuf5yKX8J8HXVMgt
|
||||
=e0eV
|
||||
-----END PGP SIGNATURE-----
|
244
gnulib-diffseq.patch
Normal file
244
gnulib-diffseq.patch
Normal file
@ -0,0 +1,244 @@
|
||||
From d55ed0636eefeb43ad1aa8123416c33839652646 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Eggert <eggert@cs.ucla.edu>
|
||||
Date: Tue, 25 Oct 2016 14:59:29 -0700
|
||||
Subject: [PATCH] diffseq: restore TOO_EXPENSIVE heuristic
|
||||
|
||||
* lib/diffseq.h: Problem with diffutils reported by Andreas Schwab
|
||||
(Bug#24715). The simplest solution is to restore the
|
||||
TOO_EXPENSIVE heuristic that I added to GNU diff in 1993, while
|
||||
using a higher threshold to avoid Bug#16848 on smaller files.
|
||||
* lib/diffseq.h (struct context): Restore member too_expensive.
|
||||
(struct partition): Restore members lo_minimal, hi_minimal.
|
||||
(diag, compareseq): Restore arg find_minimal. All uses changed.
|
||||
(diag): Restore the TOO_EXPENSIVE heuristic that I added back in
|
||||
1993 to make 'diff' run faster (but not as well) on large inputs,
|
||||
but use a threshold of 4096 instead of the old 256.
|
||||
* lib/fstrcmp.c (strcmp_bounded):
|
||||
* lib/git-merge-changelog.c (compute_differences):
|
||||
Adjust to diffseq.h changes.
|
||||
---
|
||||
ChangeLog | 17 +++++++
|
||||
lib/diffseq.h | 118 ++++++++++++++++++++++++++++++++++++++++++----
|
||||
lib/fstrcmp.c | 10 +++-
|
||||
lib/git-merge-changelog.c | 3 +-
|
||||
4 files changed, 137 insertions(+), 11 deletions(-)
|
||||
|
||||
Index: diffutils-3.5/lib/diffseq.h
|
||||
===================================================================
|
||||
--- diffutils-3.5.orig/lib/diffseq.h
|
||||
+++ diffutils-3.5/lib/diffseq.h
|
||||
@@ -34,7 +34,12 @@
|
||||
The basic algorithm was independently discovered as described in:
|
||||
"Algorithms for Approximate String Matching", Esko Ukkonen,
|
||||
Information and Control Vol. 64, 1985, pp. 100-118,
|
||||
- <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>. */
|
||||
+ <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>.
|
||||
+
|
||||
+ Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
|
||||
+ heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
|
||||
+ at the price of producing suboptimal output for large inputs with
|
||||
+ many differences. */
|
||||
|
||||
/* Before including this file, you need to define:
|
||||
ELEMENT The element type of the vectors being compared.
|
||||
@@ -123,6 +128,9 @@ struct context
|
||||
bool heuristic;
|
||||
#endif
|
||||
|
||||
+ /* Edit scripts longer than this are too expensive to compute. */
|
||||
+ OFFSET too_expensive;
|
||||
+
|
||||
/* Snakes bigger than this are considered "big". */
|
||||
#define SNAKE_LIMIT 20
|
||||
};
|
||||
@@ -132,6 +140,12 @@ struct partition
|
||||
/* Midpoints of this partition. */
|
||||
OFFSET xmid;
|
||||
OFFSET ymid;
|
||||
+
|
||||
+ /* True if low half will be analyzed minimally. */
|
||||
+ bool lo_minimal;
|
||||
+
|
||||
+ /* Likewise for high half. */
|
||||
+ bool hi_minimal;
|
||||
};
|
||||
|
||||
|
||||
@@ -143,10 +157,17 @@ struct partition
|
||||
When the two searches meet, we have found the midpoint of the shortest
|
||||
edit sequence.
|
||||
|
||||
- Set *PART to the midpoint (XMID,YMID). The diagonal number
|
||||
+ If FIND_MINIMAL is true, find the minimal edit script regardless of
|
||||
+ expense. Otherwise, if the search is too expensive, use heuristics to
|
||||
+ stop the search and report a suboptimal answer.
|
||||
+
|
||||
+ Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
|
||||
XMID - YMID equals the number of inserted elements minus the number
|
||||
of deleted elements (counting only elements before the midpoint).
|
||||
|
||||
+ Set PART->lo_minimal to true iff the minimal edit script for the
|
||||
+ left half of the partition is known; similarly for PART->hi_minimal.
|
||||
+
|
||||
This function assumes that the first elements of the specified portions
|
||||
of the two vectors do not match, and likewise that the last elements do not
|
||||
match. The caller must trim matching elements from the beginning and end
|
||||
@@ -156,7 +177,7 @@ struct partition
|
||||
suboptimal diff output. It cannot cause incorrect diff output. */
|
||||
|
||||
static void
|
||||
-diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
|
||||
+diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
|
||||
struct partition *part, struct context *ctxt)
|
||||
{
|
||||
OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
|
||||
@@ -216,6 +237,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
{
|
||||
part->xmid = x;
|
||||
part->ymid = y;
|
||||
+ part->lo_minimal = part->hi_minimal = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -248,10 +270,14 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
{
|
||||
part->xmid = x;
|
||||
part->ymid = y;
|
||||
+ part->lo_minimal = part->hi_minimal = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
+ if (find_minimal)
|
||||
+ continue;
|
||||
+
|
||||
#ifdef USE_HEURISTIC
|
||||
/* Heuristic: check occasionally for a diagonal that has made lots
|
||||
of progress compared with the edit distance. If we have any
|
||||
@@ -295,7 +321,11 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
}
|
||||
}
|
||||
if (best > 0)
|
||||
- return;
|
||||
+ {
|
||||
+ part->lo_minimal = true;
|
||||
+ part->hi_minimal = false;
|
||||
+ return;
|
||||
+ }
|
||||
}
|
||||
|
||||
{
|
||||
@@ -330,10 +360,77 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
}
|
||||
}
|
||||
if (best > 0)
|
||||
- return;
|
||||
+ {
|
||||
+ part->lo_minimal = false;
|
||||
+ part->hi_minimal = true;
|
||||
+ return;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
#endif /* USE_HEURISTIC */
|
||||
+
|
||||
+ /* Heuristic: if we've gone well beyond the call of duty, give up
|
||||
+ and report halfway between our best results so far. */
|
||||
+ if (c >= ctxt->too_expensive)
|
||||
+ {
|
||||
+ OFFSET fxybest;
|
||||
+ OFFSET fxbest IF_LINT (= 0);
|
||||
+ OFFSET bxybest;
|
||||
+ OFFSET bxbest IF_LINT (= 0);
|
||||
+
|
||||
+ /* Find forward diagonal that maximizes X + Y. */
|
||||
+ fxybest = -1;
|
||||
+ for (d = fmax; d >= fmin; d -= 2)
|
||||
+ {
|
||||
+ OFFSET x = MIN (fd[d], xlim);
|
||||
+ OFFSET y = x - d;
|
||||
+ if (ylim < y)
|
||||
+ {
|
||||
+ x = ylim + d;
|
||||
+ y = ylim;
|
||||
+ }
|
||||
+ if (fxybest < x + y)
|
||||
+ {
|
||||
+ fxybest = x + y;
|
||||
+ fxbest = x;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* Find backward diagonal that minimizes X + Y. */
|
||||
+ bxybest = OFFSET_MAX;
|
||||
+ for (d = bmax; d >= bmin; d -= 2)
|
||||
+ {
|
||||
+ OFFSET x = MAX (xoff, bd[d]);
|
||||
+ OFFSET y = x - d;
|
||||
+ if (y < yoff)
|
||||
+ {
|
||||
+ x = yoff + d;
|
||||
+ y = yoff;
|
||||
+ }
|
||||
+ if (x + y < bxybest)
|
||||
+ {
|
||||
+ bxybest = x + y;
|
||||
+ bxbest = x;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* Use the better of the two diagonals. */
|
||||
+ if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
|
||||
+ {
|
||||
+ part->xmid = fxbest;
|
||||
+ part->ymid = fxybest - fxbest;
|
||||
+ part->lo_minimal = true;
|
||||
+ part->hi_minimal = false;
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ part->xmid = bxbest;
|
||||
+ part->ymid = bxybest - bxbest;
|
||||
+ part->lo_minimal = false;
|
||||
+ part->hi_minimal = true;
|
||||
+ }
|
||||
+ return;
|
||||
+ }
|
||||
}
|
||||
#undef XREF_YREF_EQUAL
|
||||
}
|
||||
@@ -347,6 +444,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
|
||||
are origin-0.
|
||||
|
||||
+ If FIND_MINIMAL, find a minimal difference no matter how
|
||||
+ expensive it is.
|
||||
+
|
||||
The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
|
||||
|
||||
Return false if terminated normally, or true if terminated through early
|
||||
@@ -354,7 +454,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y
|
||||
|
||||
static bool
|
||||
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
|
||||
- struct context *ctxt)
|
||||
+ bool find_minimal, struct context *ctxt)
|
||||
{
|
||||
#ifdef ELEMENT
|
||||
ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
|
||||
@@ -400,12 +500,12 @@ compareseq (OFFSET xoff, OFFSET xlim, OF
|
||||
struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
|
||||
|
||||
/* Find a point of correspondence in the middle of the vectors. */
|
||||
- diag (xoff, xlim, yoff, ylim, &part, ctxt);
|
||||
+ diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
|
||||
|
||||
/* Use the partitions to split this problem into subproblems. */
|
||||
- if (compareseq (xoff, part.xmid, yoff, part.ymid, ctxt))
|
||||
+ if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
|
||||
return true;
|
||||
- if (compareseq (part.xmid, xlim, part.ymid, ylim, ctxt))
|
||||
+ if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
|
||||
return true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user