View Javadoc

1   package org.catacomb.util;
2   
3   
4   /*
5    * $Log: Diff.java,v $
6    * Revision 1.2  2010-05-31 11:01:45  rcc
7    * rodrigos version
8    *
9    * Revision 1.1  2009-06-23 03:37:26  rcc
10   * first commit
11   *
12   * Revision 1.2  2007-08-31 20:01:54  rcc
13   * misc
14   *
15   * Revision 1.1  2007-08-13 13:20:39  rcc
16   * first
17   *
18   * Revision 1.6  2003/03/06 22:51:32  stuart
19   * Convert to CVS
20   *
21   * Revision 1.5  2002/07/19  19:14:40  stuart
22   * fix reverseScript, make change ctor public, update docs
23   *
24   * Revision 1.4  2002/04/09  17:53:39  stuart
25   * More flexible interface for diff() function.
26   *
27   * Revision 1.3  2000/03/03 21:58:03  stuart
28   * move discard_confusing_lines and shift_boundaries to class file_data
29   *
30   * Revision 1.2  2000/03/02  16:37:38  stuart
31   * Add GPL and copyright
32   *
33   */
34  
35  import java.util.Hashtable;
36  
37  /** A class to compare vectors of objects.  The result of comparison
38      is a list of <code>change</code> objects which form an
39      edit script.  The objects compared are traditionally lines
40      of text from two files.  Comparison options such as "ignore
41      whitespace" are implemented by modifying the <code>equals</code>
42      and <code>hashcode</code> methods for the objects compared.
43  <p>
44     The basic algorithm is described in: </br>
45     "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
46     Algorithmica Vol. 1 No. 2, 1986, p 251.
47  <p>
48     This class outputs different results from GNU diff 1.15 on some
49     inputs.  Our results are actually better (smaller change list, smaller
50     total size of changes), but it would be nice to know why.  Perhaps
51     there is a memory overwrite bug in GNU diff 1.15.
52  
53    @author Stuart D. Gathman, translated from GNU diff 1.15
54      Copyright (C) 2000  Business Management Systems, Inc.
55  <p>
56      This program is free software; you can redistribute it and/or modify
57      it under the terms of the GNU General Public License as published by
58      the Free Software Foundation; either version 1, or (at your option)
59      any later version.
60  <p>
61      This program is distributed in the hope that it will be useful,
62      but WITHOUT ANY WARRANTY; without even the implied warranty of
63      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
64      GNU General Public License for more details.
65  <p>
66      You should have received a copy of the <a href=COPYING.txt>
67      GNU General Public License</a>
68      along with this program; if not, write to the Free Software
69      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
70  
71   */
72  
73  public class Diff {
74  
75      /** Prepare to find differences between two arrays.  Each element of
76          the arrays is translated to an "equivalence number" based on
77          the result of <code>equals</code>.  The original Object arrays
78          are no longer needed for computing the differences.  They will
79          be needed again later to print the results of the comparison as
80          an edit script, if desired.
81       */
82      public Diff(Object[] a,Object[] b) {
83          Hashtable<Object, Integer> h = new Hashtable<Object, Integer>(a.length + b.length);
84          filevec[0] = new file_data(a,h);
85          filevec[1] = new file_data(b,h);
86      }
87  
88      /** 1 more than the maximum equivalence value used for this or its
89         sibling file. */
90      int equiv_max = 1;
91  
92      /** When set to true, the comparison uses a heuristic to speed it up.
93        With this heuristic, for files with a constant small density
94        of changes, the algorithm is linear in the file size.  */
95      public boolean heuristic = false;
96  
97      /** When set to true, the algorithm returns a guarranteed minimal
98          set of changes.  This makes things slower, sometimes much slower. */
99      public boolean no_discards = false;
100 
101     private int[] xvec, yvec;	/* Vectors being compared. */
102     private int[] fdiag;		/* Vector, indexed by diagonal, containing
103 				   the X coordinate of the point furthest
104 				   along the given diagonal in the forward
105 				   search of the edit matrix. */
106     private int[] bdiag;		/* Vector, indexed by diagonal, containing
107 				   the X coordinate of the point furthest
108 				   along the given diagonal in the backward
109 				   search of the edit matrix. */
110     private int fdiagoff, bdiagoff;
111     private final file_data[] filevec = new file_data[2];
112     private int cost;
113 
114     /** Find the midpoint of the shortest edit script for a specified
115        portion of the two files.
116 
117        We scan from the beginnings of the files, and simultaneously from the ends,
118        doing a breadth-first search through the space of edit-sequence.
119        When the two searches meet, we have found the midpoint of the shortest
120        edit sequence.
121 
122        The value returned is the number of the diagonal on which the midpoint lies.
123        The diagonal number equals the number of inserted lines minus the number
124        of deleted lines (counting only lines before the midpoint).
125        The edit cost is stored into COST; this is the total number of
126        lines inserted or deleted (counting only lines before the midpoint).
127 
128        This function assumes that the first lines of the specified portions
129        of the two files do not match, and likewise that the last lines do not
130        match.  The caller must trim matching lines from the beginning and end
131        of the portions it is going to specify.
132 
133        Note that if we return the "wrong" diagonal value, or if
134        the value of bdiag at that diagonal is "wrong",
135        the worst this can do is cause suboptimal diff output.
136        It cannot cause incorrect diff output.  */
137 
138     private int diag(int xoff, int xlim, int yoff, int ylim) {
139         final int[] fd = fdiag;	// Give the compiler a chance.
140         final int[] bd = bdiag;	// Additional help for the compiler.
141         final int[] xv = xvec;		// Still more help for the compiler.
142         final int[] yv = yvec;		// And more and more . . .
143         final int dmin = xoff - ylim;	// Minimum valid diagonal.
144         final int dmax = xlim - yoff;	// Maximum valid diagonal.
145         final int fmid = xoff - yoff;	// Center diagonal of top-down search.
146         final int bmid = xlim - ylim;	// Center diagonal of bottom-up search.
147         int fmin = fmid, fmax = fmid;	// Limits of top-down search.
148         int bmin = bmid, bmax = bmid;	// Limits of bottom-up search.
149         /* True if southeast corner is on an odd
150         			     diagonal with respect to the northwest. */
151         final boolean odd = (fmid - bmid & 1) != 0;
152 
153         fd[fdiagoff + fmid] = xoff;
154         bd[bdiagoff + bmid] = xlim;
155 
156         for (int c = 1;; ++c)
157         {
158             int d;			/* Active diagonal. */
159             boolean big_snake = false;
160 
161             /* Extend the top-down search by an edit step in each diagonal. */
162             if (fmin > dmin)
163                 fd[fdiagoff + --fmin - 1] = -1;
164             else
165                 ++fmin;
166             if (fmax < dmax)
167                 fd[fdiagoff + ++fmax + 1] = -1;
168             else
169                 --fmax;
170             for (d = fmax; d >= fmin; d -= 2)
171             {
172                 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
173 
174                 if (tlo >= thi)
175                     x = tlo + 1;
176                 else
177                     x = thi;
178                 oldx = x;
179                 y = x - d;
180                 while (x < xlim && y < ylim && xv[x] == yv[y]) {
181                     ++x;
182                     ++y;
183                 }
184                 if (x - oldx > 20)
185                     big_snake = true;
186                 fd[fdiagoff + d] = x;
187                 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
188                 {
189                     cost = 2 * c - 1;
190                     return d;
191                 }
192             }
193 
194             /* Similar extend the bottom-up search. */
195             if (bmin > dmin)
196                 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
197             else
198                 ++bmin;
199             if (bmax < dmax)
200                 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
201             else
202                 --bmax;
203             for (d = bmax; d >= bmin; d -= 2)
204             {
205                 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
206 
207                 if (tlo < thi)
208                     x = tlo;
209                 else
210                     x = thi - 1;
211                 oldx = x;
212                 y = x - d;
213                 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
214                     --x;
215                     --y;
216                 }
217                 if (oldx - x > 20)
218                     big_snake = true;
219                 bd[bdiagoff + d] = x;
220                 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
221                 {
222                     cost = 2 * c;
223                     return d;
224                 }
225             }
226 
227             /* Heuristic: check occasionally for a diagonal that has made
228                lots of progress compared with the edit distance.
229                If we have any such, find the one that has made the most
230                progress and return it as if it had succeeded.
231 
232                With this heuristic, for files with a constant small density
233                of changes, the algorithm is linear in the file size.  */
234 
235             if (c > 200 && big_snake && heuristic)
236             {
237                 int best = 0;
238                 int bestpos = -1;
239 
240                 for (d = fmax; d >= fmin; d -= 2)
241                 {
242                     int dd = d - fmid;
243                     if ((fd[fdiagoff + d] - xoff)*2 - dd > 12 * (c + (dd > 0 ? dd : -dd)))
244                     {
245                         if (fd[fdiagoff + d] * 2 - dd > best
246                                 && fd[fdiagoff + d] - xoff > 20
247                                 && fd[fdiagoff + d] - d - yoff > 20)
248                         {
249                             int k;
250                             int x = fd[fdiagoff + d];
251 
252                             /* We have a good enough best diagonal;
253                                now insist that it end with a significant snake.  */
254                             for (k = 1; k <= 20; k++)
255                                 if (xvec[x - k] != yvec[x - d - k])
256                                     break;
257 
258                             if (k == 21)
259                             {
260                                 best = fd[fdiagoff + d] * 2 - dd;
261                                 bestpos = d;
262                             }
263                         }
264                     }
265                 }
266                 if (best > 0)
267                 {
268                     cost = 2 * c - 1;
269                     return bestpos;
270                 }
271 
272                 best = 0;
273                 for (d = bmax; d >= bmin; d -= 2)
274                 {
275                     int dd = d - bmid;
276                     if ((xlim - bd[bdiagoff + d])*2 + dd > 12 * (c + (dd > 0 ? dd : -dd)))
277                     {
278                         if ((xlim - bd[bdiagoff + d]) * 2 + dd > best
279                                 && xlim - bd[bdiagoff + d] > 20
280                                 && ylim - (bd[bdiagoff + d] - d) > 20)
281                         {
282                             /* We have a good enough best diagonal;
283                                now insist that it end with a significant snake.  */
284                             int k;
285                             int x = bd[bdiagoff + d];
286 
287                             for (k = 0; k < 20; k++)
288                                 if (xvec[x + k] != yvec[x - d + k])
289                                     break;
290                             if (k == 20)
291                             {
292                                 best = (xlim - bd[bdiagoff + d]) * 2 + dd;
293                                 bestpos = d;
294                             }
295                         }
296                     }
297                 }
298                 if (best > 0)
299                 {
300                     cost = 2 * c - 1;
301                     return bestpos;
302                 }
303             }
304         }
305     }
306 
307     /** Compare in detail contiguous subsequences of the two files
308        which are known, as a whole, to match each other.
309 
310        The results are recorded in the vectors filevec[N].changed_flag, by
311        storing a 1 in the element for each line that is an insertion or deletion.
312 
313        The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
314 
315        Note that XLIM, YLIM are exclusive bounds.
316        All line numbers are origin-0 and discarded lines are not counted.  */
317 
318     @SuppressWarnings("all")
319     private void compareseq(int xoff, int xlim, int yoff, int ylim) {
320         /* Slide down the bottom initial diagonal. */
321         while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
322             ++xoff;
323             ++yoff;
324         }
325         /* Slide up the top initial diagonal. */
326         while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
327             --xlim;
328             --ylim;
329         }
330 
331         /* Handle simple cases. */
332         if (xoff == xlim)
333             while (yoff < ylim)
334                 filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
335         else if (yoff == ylim)
336             while (xoff < xlim)
337                 filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
338         else
339         {
340             /* Find a point of correspondence in the middle of the files.  */
341 
342             int d = diag(xoff, xlim, yoff, ylim);
343             int c = cost;
344 
345             int b = bdiag[bdiagoff + d];
346 
347             if (c == 1)
348             {
349                 /* This should be impossible, because it implies that
350                    one of the two subsequences is empty,
351                    and that case was handled above without calling `diag'.
352                    Let's verify that this is true.  */
353                 throw new IllegalArgumentException("Empty subsequence");
354             }
355 
356 
357             /* Use that point to split this problem into two subproblems.  */
358             compareseq(xoff, b, yoff, b - d);
359             /* This used to use f instead of b,
360                but that is incorrect!
361                It is not necessarily the case that diagonal d
362                has a snake from b to f.  */
363             compareseq(b, xlim, b - d, ylim);
364 
365         }
366     }
367 
368     /** Discard lines from one file that have no matches in the other file.
369      */
370 
371     private void discard_confusing_lines() {
372         filevec[0].discard_confusing_lines(filevec[1]);
373         filevec[1].discard_confusing_lines(filevec[0]);
374     }
375 
376     private boolean inhibit = false;
377 
378     /** Adjust inserts/deletes of blank lines to join changes
379        as much as possible.
380      */
381 
382     private void shift_boundaries() {
383         if (inhibit)
384             return;
385         filevec[0].shift_boundaries(filevec[1]);
386         filevec[1].shift_boundaries(filevec[0]);
387     }
388 
389     public interface ScriptBuilder {
390         /** Scan the tables of which lines are inserted and deleted,
391            producing an edit script.
392          @param changed0 true for lines in first file which do not match 2nd
393          @param len0 number of lines in first file
394          @param changed1 true for lines in 2nd file which do not match 1st
395          @param len1 number of lines in 2nd file
396          @return a linked list of changes - or null
397          */
398         public change build_script(
399             boolean[] changed0,int len0,
400             boolean[] changed1,int len1
401         );
402     }
403 
404     /** Scan the tables of which lines are inserted and deleted,
405        producing an edit script in reverse order.  */
406 
407     static class ReverseScript implements ScriptBuilder {
408         public  change build_script(
409             final boolean[] changed0,int len0,
410             final boolean[] changed1,int len1)
411         {
412             change script = null;
413             int i0 = 0, i1 = 0;
414             while (i0 < len0 || i1 < len1) {
415                 if (changed0[1+i0] || changed1[1+i1]) {
416                     int line0 = i0, line1 = i1;
417 
418                     /* Find # lines changed here in each file.  */
419                     while (changed0[1+i0]) ++i0;
420                     while (changed1[1+i1]) ++i1;
421 
422                     /* Record this change.  */
423                     script = new change(line0, line1, i0 - line0, i1 - line1, script);
424                 }
425 
426                 /* We have reached lines in the two files that match each other.  */
427                 i0++;
428                 i1++;
429             }
430 
431             return script;
432         }
433     }
434 
435     static class ForwardScript implements ScriptBuilder {
436         /** Scan the tables of which lines are inserted and deleted,
437            producing an edit script in forward order.  */
438         public change build_script(
439             final boolean[] changed0,int len0,
440             final boolean[] changed1,int len1)
441         {
442             change script = null;
443             int i0 = len0, i1 = len1;
444 
445             while (i0 >= 0 || i1 >= 0)
446             {
447                 if (changed0[i0] || changed1[i1])
448                 {
449                     int line0 = i0, line1 = i1;
450 
451                     /* Find # lines changed here in each file.  */
452                     while (changed0[i0]) --i0;
453                     while (changed1[i1]) --i1;
454 
455                     /* Record this change.  */
456                     script = new change(i0, i1, line0 - i0, line1 - i1, script);
457                 }
458 
459                 /* We have reached lines in the two files that match each other.  */
460                 i0--;
461                 i1--;
462             }
463 
464             return script;
465         }
466     }
467 
468     /** Standard ScriptBuilders. */
469     public final static ScriptBuilder
470     forwardScript = new ForwardScript(),
471     reverseScript = new ReverseScript();
472 
473     /* Report the differences of two files.  DEPTH is the current directory
474        depth. */
475     public final change diff_2(final boolean reverse) {
476         return diff(reverse ? reverseScript : forwardScript);
477     }
478 
479     /** Get the results of comparison as an edit script.  The script
480        is described by a list of changes.  The standard ScriptBuilder
481        implementations provide for forward and reverse edit scripts.
482        Alternate implementations could, for instance, list common elements
483        instead of differences.
484        @param bld	an object to build the script from change flags
485        @return the head of a list of changes
486      */
487     public change diff(final ScriptBuilder bld) {
488 
489         /* Some lines are obviously insertions or deletions
490            because they don't match anything.  Detect them now,
491            and avoid even thinking about them in the main comparison algorithm.  */
492 
493         discard_confusing_lines();
494 
495         /* Now do the main comparison algorithm, considering just the
496            undiscarded lines.  */
497 
498         xvec = filevec[0].undiscarded;
499         yvec = filevec[1].undiscarded;
500 
501         int diags =
502             filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
503         fdiag = new int[diags];
504         fdiagoff = filevec[1].nondiscarded_lines + 1;
505         bdiag = new int[diags];
506         bdiagoff = filevec[1].nondiscarded_lines + 1;
507 
508         compareseq(0, filevec[0].nondiscarded_lines,
509                    0, filevec[1].nondiscarded_lines);
510         fdiag = null;
511         bdiag = null;
512 
513         /* Modify the results slightly to make them prettier
514            in cases where that can validly be done.  */
515 
516         shift_boundaries();
517 
518         /* Get the results of comparison in the form of a chain
519            of `struct change's -- an edit script.  */
520         return bld.build_script(
521                    filevec[0].changed_flag,
522                    filevec[0].buffered_lines,
523                    filevec[1].changed_flag,
524                    filevec[1].buffered_lines
525                );
526 
527     }
528 
529     /** The result of comparison is an "edit script": a chain of change objects.
530        Each change represents one place where some lines are deleted
531        and some are inserted.
532 
533        LINE0 and LINE1 are the first affected lines in the two files (origin 0).
534        DELETED is the number of lines deleted here from file 0.
535        INSERTED is the number of lines inserted here in file 1.
536 
537        If DELETED is 0 then LINE0 is the number of the line before
538        which the insertion was done; vice versa for INSERTED and LINE1.  */
539 
540     public static class change {
541         /** Previous or next edit command. */
542         public change link;
543         /** # lines of file 1 changed here.  */
544         public final int inserted;
545         /** # lines of file 0 changed here.  */
546         public final int deleted;
547         /** Line number of 1st deleted line.  */
548         public final int line0;
549         /** Line number of 1st inserted line.  */
550         public final int line1;
551 
552         /** Cons an additional entry onto the front of an edit script OLD.
553            LINE0 and LINE1 are the first affected lines in the two files (origin 0).
554            DELETED is the number of lines deleted here from file 0.
555            INSERTED is the number of lines inserted here in file 1.
556 
557            If DELETED is 0 then LINE0 is the number of the line before
558            which the insertion was done; vice versa for INSERTED and LINE1.  */
559         public change(int line0, int line1, int deleted, int inserted, change old) {
560             this.line0 = line0;
561             this.line1 = line1;
562             this.inserted = inserted;
563             this.deleted = deleted;
564             this.link = old;
565             //System.err.println(line0+","+line1+","+inserted+","+deleted);
566         }
567     }
568 
569     /** Data on one input file being compared.
570      */
571 
572     class file_data {
573 
574         /** Allocate changed array for the results of comparison.  */
575         void clear() {
576             /* Allocate a flag for each line of each file, saying whether that line
577             is an insertion or deletion.
578              Allocate an extra element, always zero, at each end of each vector.
579                  */
580             changed_flag = new boolean[buffered_lines + 2];
581         }
582 
583         /** Return equiv_count[I] as the number of lines in this file
584            that fall in equivalence class I.
585              @return the array of equivalence class counts.
586          */
587         int[] equivCount() {
588             int[] equiv_count = new int[equiv_max];
589             for (int i = 0; i < buffered_lines; ++i)
590                 ++equiv_count[equivs[i]];
591             return equiv_count;
592         }
593 
594         /** Discard lines that have no matches in another file.
595 
596            A line which is discarded will not be considered by the actual
597            comparison algorithm; it will be as if that line were not in the file.
598            The file's `realindexes' table maps virtual line numbers
599            (which don't count the discarded lines) into real line numbers;
600            this is how the actual comparison algorithm produces results
601            that are comprehensible when the discarded lines are counted.
602         <p>
603            When we discard a line, we also mark it as a deletion or insertion
604            so that it will be printed in the output.
605           @param f the other file
606          */
607         void discard_confusing_lines(file_data f) {
608             clear();
609             /* Set up table of which lines are going to be discarded. */
610             final byte[] discarded = discardable(f.equivCount());
611 
612             /* Don't really discard the provisional lines except when they occur
613                in a run of discardables, with nonprovisionals at the beginning
614                and end.  */
615             filterDiscards(discarded);
616 
617             /* Actually discard the lines. */
618             discard(discarded);
619         }
620 
621         /** Mark to be discarded each line that matches no line of another file.
622            If a line matches many lines, mark it as provisionally discardable.
623            @see equivCount()
624            @param counts The count of each equivalence number for the other file.
625            @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
626            	for each line
627          */
628 
629         private byte[] discardable(final int[] counts) {
630             final int end = buffered_lines;
631             final byte[] discards = new byte[end];
632             //final int[] equivs = this.equivs;
633             int many = 5;
634             int tem = end / 64;
635 
636             /* Multiply MANY by approximate square root of number of lines.
637             That is the threshold for provisionally discardable lines.  */
638             while ((tem = tem >> 2) > 0)
639                 many *= 2;
640 
641             for (int i = 0; i < end; i++)
642             {
643                 int nmatch;
644                 if (equivs[i] == 0)
645                     continue;
646                 nmatch = counts[equivs[i]];
647                 if (nmatch == 0)
648                     discards[i] = 1;
649                 else if (nmatch > many)
650                     discards[i] = 2;
651             }
652             return discards;
653         }
654 
655         /** Don't really discard the provisional lines except when they occur
656            in a run of discardables, with nonprovisionals at the beginning
657            and end.  */
658 
659         private void filterDiscards(final byte[] discards) {
660             final int end = buffered_lines;
661 
662             for (int i = 0; i < end; i++)
663             {
664                 /* Cancel provisional discards not in middle of run of discards.  */
665                 if (discards[i] == 2)
666                     discards[i] = 0;
667                 else if (discards[i] != 0)
668                 {
669                     /* We have found a nonprovisional discard.  */
670                     int j;
671                     int length;
672                     int provisional = 0;
673 
674                     /* Find end of this run of discardable lines.
675                        Count how many are provisionally discardable.  */
676                     for (j = i; j < end; j++)
677                     {
678                         if (discards[j] == 0)
679                             break;
680                         if (discards[j] == 2)
681                             ++provisional;
682                     }
683 
684                     /* Cancel provisional discards at end, and shrink the run.  */
685                     while (j > i && discards[j - 1] == 2) {
686                         discards[--j] = 0;
687                         --provisional;
688                     }
689 
690                     /* Now we have the length of a run of discardable lines
691                        whose first and last are not provisional.  */
692                     length = j - i;
693 
694                     /* If 1/4 of the lines in the run are provisional,
695                        cancel discarding of all provisional lines in the run.  */
696                     if (provisional * 4 > length)
697                     {
698                         while (j > i)
699                             if (discards[--j] == 2)
700                                 discards[j] = 0;
701                     }
702                     else
703                     {
704                         int consec;
705                         int minimum = 1;
706                         int tem = length / 4;
707 
708                         /* MINIMUM is approximate square root of LENGTH/4.
709                            A subrun of two or more provisionals can stand
710                            when LENGTH is at least 16.
711                            A subrun of 4 or more can stand when LENGTH >= 64.  */
712                         while ((tem = tem >> 2) > 0)
713                             minimum *= 2;
714                         minimum++;
715 
716                         /* Cancel any subrun of MINIMUM or more provisionals
717                            within the larger run.  */
718                         for (j = 0, consec = 0; j < length; j++)
719                             if (discards[i + j] != 2)
720                                 consec = 0;
721                             else if (minimum == ++consec)
722                                 /* Back up to start of subrun, to cancel it all.  */
723                                 j -= consec;
724                             else if (minimum < consec)
725                                 discards[i + j] = 0;
726 
727                         /* Scan from beginning of run
728                            until we find 3 or more nonprovisionals in a row
729                            or until the first nonprovisional at least 8 lines in.
730                            Until that point, cancel any provisionals.  */
731                         for (j = 0, consec = 0; j < length; j++)
732                         {
733                             if (j >= 8 && discards[i + j] == 1)
734                                 break;
735                             if (discards[i + j] == 2) {
736                                 consec = 0;
737                                 discards[i + j] = 0;
738                             }
739                             else if (discards[i + j] == 0)
740                                 consec = 0;
741                             else
742                                 consec++;
743                             if (consec == 3)
744                                 break;
745                         }
746 
747                         /* I advances to the last line of the run.  */
748                         i += length - 1;
749 
750                         /* Same thing, from end.  */
751                         for (j = 0, consec = 0; j < length; j++)
752                         {
753                             if (j >= 8 && discards[i - j] == 1)
754                                 break;
755                             if (discards[i - j] == 2) {
756                                 consec = 0;
757                                 discards[i - j] = 0;
758                             }
759                             else if (discards[i - j] == 0)
760                                 consec = 0;
761                             else
762                                 consec++;
763                             if (consec == 3)
764                                 break;
765                         }
766                     }
767                 }
768             }
769         }
770 
771         /** Actually discard the lines.
772           @param discards flags lines to be discarded
773          */
774         private void discard(final byte[] discards) {
775             final int end = buffered_lines;
776             int j = 0;
777             for (int i = 0; i < end; ++i)
778                 if (no_discards || discards[i] == 0)
779                 {
780                     undiscarded[j] = equivs[i];
781                     realindexes[j++] = i;
782                 }
783                 else
784                     changed_flag[1+i] = true;
785             nondiscarded_lines = j;
786         }
787 
788         file_data(Object[] data,Hashtable<Object, Integer> h) {
789             buffered_lines = data.length;
790 
791             equivs = new int[buffered_lines];
792             undiscarded = new int[buffered_lines];
793             realindexes = new int[buffered_lines];
794 
795             for (int i = 0; i < data.length; ++i) {
796                 Integer ir = h.get(data[i]);
797                 if (ir == null)
798                     h.put(data[i], new Integer(equivs[i] = equiv_max++));
799                 else
800                     equivs[i] = ir.intValue();
801             }
802         }
803 
804         /** Adjust inserts/deletes of blank lines to join changes
805            as much as possible.
806 
807            We do something when a run of changed lines include a blank
808            line at one end and have an excluded blank line at the other.
809            We are free to choose which blank line is included.
810            `compareseq' always chooses the one at the beginning,
811            but usually it is cleaner to consider the following blank line
812            to be the "change".  The only exception is if the preceding blank line
813            would join this change to other changes.
814           @param f the file being compared against
815         */
816 
817         void shift_boundaries(file_data f) {
818             final boolean[] changed = changed_flag;
819             final boolean[] other_changed = f.changed_flag;
820             int i = 0;
821             int j = 0;
822             int i_end = buffered_lines;
823             int preceding = -1;
824             int other_preceding = -1;
825 
826             for (;;)
827             {
828                 int start, end, other_start;
829 
830                 /* Scan forwards to find beginning of another run of changes.
831                    Also keep track of the corresponding point in the other file.  */
832 
833                 while (i < i_end && !changed[1+i])
834                 {
835                     while (other_changed[1+j++])
836                         /* Non-corresponding lines in the other file
837                            will count as the preceding batch of changes.  */
838                         other_preceding = j;
839                     i++;
840                 }
841 
842                 if (i == i_end)
843                     break;
844 
845                 start = i;
846                 other_start = j;
847 
848                 for (;;)
849                 {
850                     /* Now find the end of this run of changes.  */
851 
852                     while (i < i_end && changed[1+i]) i++;
853                     end = i;
854 
855                     /* If the first changed line matches the following unchanged one,
856                     and this run does not follow right after a previous run,
857                      and there are no lines deleted from the other file here,
858                      then classify the first changed line as unchanged
859                      and the following line as changed in its place.  */
860 
861                     /* You might ask, how could this run follow right after another?
862                     Only because the previous run was shifted here.  */
863 
864                     if (end != i_end
865                             && equivs[start] == equivs[end]
866                             && !other_changed[1+j]
867                             && end != i_end
868                             && !((preceding >= 0 && start == preceding)
869                                  || (other_preceding >= 0
870                                      && other_start == other_preceding)))
871                     {
872                         changed[1+end++] = true;
873                         changed[1+start++] = false;
874                         ++i;
875                         /* Since one line-that-matches is now before this run
876                            instead of after, we must advance in the other file
877                            to keep in synch.  */
878                         ++j;
879                     }
880                     else
881                         break;
882                 }
883 
884                 preceding = i;
885                 other_preceding = j;
886             }
887         }
888 
889         /** Number of elements (lines) in this file. */
890         final int buffered_lines;
891 
892         /** Vector, indexed by line number, containing an equivalence code for
893            each line.  It is this vector that is actually compared with that
894            of another file to generate differences. */
895         private final int[]	    equivs;
896 
897         /** Vector, like the previous one except that
898            the elements for discarded lines have been squeezed out.  */
899         final int[]	   undiscarded;
900 
901         /** Vector mapping virtual line numbers (not counting discarded lines)
902            to real ones (counting those lines).  Both are origin-0.  */
903         final int[]	   realindexes;
904 
905         /** Total number of nondiscarded lines. */
906         int		    nondiscarded_lines;
907 
908         /** Array, indexed by real origin-1 line number,
909            containing true for a line that is an insertion or a deletion.
910            The results of comparison are stored here.  */
911         boolean[]	    changed_flag;
912 
913     }
914 }