diff --git a/big-file-performance.patch b/big-file-performance.patch new file mode 100644 index 0000000..9f3b8ed --- /dev/null +++ b/big-file-performance.patch @@ -0,0 +1,50 @@ +From 68b82f6f8419a815cfcf962b3061352d414dc606 Mon Sep 17 00:00:00 2001 +From: Paul Eggert +Date: Tue, 25 Oct 2016 21:57:56 -0700 +Subject: [PATCH] diff: fix big performance degradation in 3.4 + +* NEWS, doc/diffutils.texi (Overview): Document this. +* src/analyze.c (diff_2_files): Restore too_expensive heuristic, +but this time with a floor that is 16 times the old floor. This +should fix Bug#16848, by generating good-quality output for its +test case, while not introducing Bug#24715, by running nearly as +fast as diff-3.3 for that test case. +--- + NEWS | 5 +++++ + doc/diffutils.texi | 5 ++++- + src/analyze.c | 11 ++++++++++- + 3 files changed, 19 insertions(+), 2 deletions(-) + +Index: diffutils-3.5/src/analyze.c +=================================================================== +--- diffutils-3.5.orig/src/analyze.c ++++ diffutils-3.5/src/analyze.c +@@ -534,6 +534,7 @@ diff_2_files (struct comparison *cmp) + { + struct context ctxt; + lin diags; ++ lin too_expensive; + + /* Allocate vectors for the results of comparison: + a flag for each line of each file, saying whether that line +@@ -565,11 +566,19 @@ diff_2_files (struct comparison *cmp) + + ctxt.heuristic = speed_large_files; + ++ /* Set TOO_EXPENSIVE to be the approximate square root of the ++ input size, bounded below by 4096. 4096 seems to be good for ++ circa-2016 CPUs; see Bug#16848 and Bug#24715. */ ++ too_expensive = 1; ++ for (; diags != 0; diags >>= 2) ++ too_expensive <<= 1; ++ ctxt.too_expensive = MAX (4096, too_expensive); ++ + files[0] = cmp->file[0]; + files[1] = cmp->file[1]; + + compareseq (0, cmp->file[0].nondiscarded_lines, +- 0, cmp->file[1].nondiscarded_lines, &ctxt); ++ 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt); + + free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1)); + diff --git a/diffutils-3.5.tar.xz b/diffutils-3.5.tar.xz new file mode 100644 index 0000000..84acae2 --- /dev/null +++ b/diffutils-3.5.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad398ccd5b9faca6b0ab219a036453f62a602a56203ac659b43e889bec35533 +size 1360996 diff --git a/diffutils-3.5.tar.xz.sig b/diffutils-3.5.tar.xz.sig new file mode 100644 index 0000000..19ff48d --- /dev/null +++ b/diffutils-3.5.tar.xz.sig @@ -0,0 +1,16 @@ +-----BEGIN PGP SIGNATURE----- + +iQIcBAABCgAGBQJXuT2mAAoJEH/Z/MsAC+7uECsP/0wVZOh74XKI0Y4XPxqhgKou +L/uFMgqpq2P9Uwr8jnJCr3xsKWCHYSYXOjmzbQ21wkqDWynT+NbLRcH4HLvO6vU3 +EWtoBor7UG0weTDanNfRBFjVLWsHsDWJj7VGMja9OAkXjpqo0f+iYHHIbJ+oKlIY +gzNqUdjGg8RpsvNapz4XuTsoUNDrTtVOy/k9xHUZCw/h1cZBVpaAU8MEE3MReab6 +pOn660BlVqT50vMd09FKRuTLktJ2LBFZ6x+xdPBJm5LFdUFqClbiNaNv+idhlvfB +GC8qjBr4WhuCtGpJKLFADTOZ8UOxcmx2sNz0ypiQrLT1UkTUtY3B0ADnzWuMEcwx +eaPNzdJhPExY64i7MA2vc2MxlRb7omj8kI+n0rBpiFKLMI3x3ZWf6Papg/acYbJg +0NHQkqdc82gH3vsp5DX/wNn+3TNwks9ziVt7Jervk7uQqWaDLrah3waBN3q5UWsk +HLAlkhb94Ahi+cNMk1oiNKqT+en3AhZ/7O6imKzTM8bTk27Ek7q3ThLfBeUKcp2O +j1aaPbaDGEL9pZZDCeuZCSdRZDGMY+spGNYRC4pmYCL9C2LsH5jtX5ob9gQsPHt3 +XFsi5l83i5N3amwzb2OdSTEwM0xgPX8TmcN435409gvz/VVmeSmx5jhHb9m8SLSk +SrDXEuf5yKX8J8HXVMgt +=e0eV +-----END PGP SIGNATURE----- diff --git a/gnulib-diffseq.patch b/gnulib-diffseq.patch new file mode 100644 index 0000000..0361966 --- /dev/null +++ b/gnulib-diffseq.patch @@ -0,0 +1,244 @@ +From d55ed0636eefeb43ad1aa8123416c33839652646 Mon Sep 17 00:00:00 2001 +From: Paul Eggert +Date: Tue, 25 Oct 2016 14:59:29 -0700 +Subject: [PATCH] diffseq: restore TOO_EXPENSIVE heuristic + +* lib/diffseq.h: Problem with diffutils reported by Andreas Schwab +(Bug#24715). The simplest solution is to restore the +TOO_EXPENSIVE heuristic that I added to GNU diff in 1993, while +using a higher threshold to avoid Bug#16848 on smaller files. +* lib/diffseq.h (struct context): Restore member too_expensive. +(struct partition): Restore members lo_minimal, hi_minimal. +(diag, compareseq): Restore arg find_minimal. All uses changed. +(diag): Restore the TOO_EXPENSIVE heuristic that I added back in +1993 to make 'diff' run faster (but not as well) on large inputs, +but use a threshold of 4096 instead of the old 256. +* lib/fstrcmp.c (strcmp_bounded): +* lib/git-merge-changelog.c (compute_differences): +Adjust to diffseq.h changes. +--- + ChangeLog | 17 +++++++ + lib/diffseq.h | 118 ++++++++++++++++++++++++++++++++++++++++++---- + lib/fstrcmp.c | 10 +++- + lib/git-merge-changelog.c | 3 +- + 4 files changed, 137 insertions(+), 11 deletions(-) + +Index: diffutils-3.5/lib/diffseq.h +=================================================================== +--- diffutils-3.5.orig/lib/diffseq.h ++++ diffutils-3.5/lib/diffseq.h +@@ -34,7 +34,12 @@ + The basic algorithm was independently discovered as described in: + "Algorithms for Approximate String Matching", Esko Ukkonen, + Information and Control Vol. 64, 1985, pp. 100-118, +- . */ ++ . ++ ++ Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE ++ heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) ++ at the price of producing suboptimal output for large inputs with ++ many differences. */ + + /* Before including this file, you need to define: + ELEMENT The element type of the vectors being compared. +@@ -123,6 +128,9 @@ struct context + bool heuristic; + #endif + ++ /* Edit scripts longer than this are too expensive to compute. */ ++ OFFSET too_expensive; ++ + /* Snakes bigger than this are considered "big". */ + #define SNAKE_LIMIT 20 + }; +@@ -132,6 +140,12 @@ struct partition + /* Midpoints of this partition. */ + OFFSET xmid; + OFFSET ymid; ++ ++ /* True if low half will be analyzed minimally. */ ++ bool lo_minimal; ++ ++ /* Likewise for high half. */ ++ bool hi_minimal; + }; + + +@@ -143,10 +157,17 @@ struct partition + When the two searches meet, we have found the midpoint of the shortest + edit sequence. + +- Set *PART to the midpoint (XMID,YMID). The diagonal number ++ If FIND_MINIMAL is true, find the minimal edit script regardless of ++ expense. Otherwise, if the search is too expensive, use heuristics to ++ stop the search and report a suboptimal answer. ++ ++ Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number + XMID - YMID equals the number of inserted elements minus the number + of deleted elements (counting only elements before the midpoint). + ++ Set PART->lo_minimal to true iff the minimal edit script for the ++ left half of the partition is known; similarly for PART->hi_minimal. ++ + This function assumes that the first elements of the specified portions + of the two vectors do not match, and likewise that the last elements do not + match. The caller must trim matching elements from the beginning and end +@@ -156,7 +177,7 @@ struct partition + suboptimal diff output. It cannot cause incorrect diff output. */ + + static void +-diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, ++diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, + struct partition *part, struct context *ctxt) + { + OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */ +@@ -216,6 +237,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + { + part->xmid = x; + part->ymid = y; ++ part->lo_minimal = part->hi_minimal = true; + return; + } + } +@@ -248,10 +270,14 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + { + part->xmid = x; + part->ymid = y; ++ part->lo_minimal = part->hi_minimal = true; + return; + } + } + ++ if (find_minimal) ++ continue; ++ + #ifdef USE_HEURISTIC + /* Heuristic: check occasionally for a diagonal that has made lots + of progress compared with the edit distance. If we have any +@@ -295,7 +321,11 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + } + } + if (best > 0) +- return; ++ { ++ part->lo_minimal = true; ++ part->hi_minimal = false; ++ return; ++ } + } + + { +@@ -330,10 +360,77 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + } + } + if (best > 0) +- return; ++ { ++ part->lo_minimal = false; ++ part->hi_minimal = true; ++ return; ++ } + } + } + #endif /* USE_HEURISTIC */ ++ ++ /* Heuristic: if we've gone well beyond the call of duty, give up ++ and report halfway between our best results so far. */ ++ if (c >= ctxt->too_expensive) ++ { ++ OFFSET fxybest; ++ OFFSET fxbest IF_LINT (= 0); ++ OFFSET bxybest; ++ OFFSET bxbest IF_LINT (= 0); ++ ++ /* Find forward diagonal that maximizes X + Y. */ ++ fxybest = -1; ++ for (d = fmax; d >= fmin; d -= 2) ++ { ++ OFFSET x = MIN (fd[d], xlim); ++ OFFSET y = x - d; ++ if (ylim < y) ++ { ++ x = ylim + d; ++ y = ylim; ++ } ++ if (fxybest < x + y) ++ { ++ fxybest = x + y; ++ fxbest = x; ++ } ++ } ++ ++ /* Find backward diagonal that minimizes X + Y. */ ++ bxybest = OFFSET_MAX; ++ for (d = bmax; d >= bmin; d -= 2) ++ { ++ OFFSET x = MAX (xoff, bd[d]); ++ OFFSET y = x - d; ++ if (y < yoff) ++ { ++ x = yoff + d; ++ y = yoff; ++ } ++ if (x + y < bxybest) ++ { ++ bxybest = x + y; ++ bxbest = x; ++ } ++ } ++ ++ /* Use the better of the two diagonals. */ ++ if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) ++ { ++ part->xmid = fxbest; ++ part->ymid = fxybest - fxbest; ++ part->lo_minimal = true; ++ part->hi_minimal = false; ++ } ++ else ++ { ++ part->xmid = bxbest; ++ part->ymid = bxybest - bxbest; ++ part->lo_minimal = false; ++ part->hi_minimal = true; ++ } ++ return; ++ } + } + #undef XREF_YREF_EQUAL + } +@@ -347,6 +444,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + Note that XLIM, YLIM are exclusive bounds. All indices into the vectors + are origin-0. + ++ If FIND_MINIMAL, find a minimal difference no matter how ++ expensive it is. ++ + The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. + + Return false if terminated normally, or true if terminated through early +@@ -354,7 +454,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET y + + static bool + compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, +- struct context *ctxt) ++ bool find_minimal, struct context *ctxt) + { + #ifdef ELEMENT + ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */ +@@ -400,12 +500,12 @@ compareseq (OFFSET xoff, OFFSET xlim, OF + struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 }); + + /* Find a point of correspondence in the middle of the vectors. */ +- diag (xoff, xlim, yoff, ylim, &part, ctxt); ++ diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt); + + /* Use the partitions to split this problem into subproblems. */ +- if (compareseq (xoff, part.xmid, yoff, part.ymid, ctxt)) ++ if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt)) + return true; +- if (compareseq (part.xmid, xlim, part.ymid, ylim, ctxt)) ++ if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt)) + return true; + } +