rundiff revision 2:7ab458527c41
1#!/usr/bin/perl
2
3# Copyright (c) 2001 Nathan L. Binkert
4# All rights reserved.
5#
6# Permission to redistribute, use, copy, and modify this software
7# without fee is hereby granted, provided that the following
8# conditions are met:
9#
10# 1. This entire notice is included in all source code copies of any
11#    software which is or includes a copy or modification of this
12#    software.
13# 2. The name of the author may not be used to endorse or promote
14#    products derived from this software without specific prior
15#    written permission.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28#
29
30use Algorithm::Diff qw(diff);
31use vars qw ($opt_C $opt_c $opt_u $opt_U);
32
33$opt_u = "";
34$opt_c = undef;
35
36$diffsize = 2000;
37# After we've read up to a certain point in each file, the number of items
38# we've read from each file will differ by $FLD (could be 0)
39my $File_Length_Difference = 0;
40my $Context_Lines = 9;
41
42$progname = $0;
43if (scalar(@ARGV) != 2) {
44  usage();
45}
46
47my ($filename1, $filename2);
48($filename1, $start1) = parse_filearg($ARGV[0]);
49($filename2, $start2) = parse_filearg($ARGV[1]);
50
51if ($filename1 eq "-" && $filename2 eq "-") {
52  die "Only one of the inputs may be standard in\n";
53}
54
55my ($file1, $file2);
56if ($filename1 eq "-") {
57  $file1 = STDIN;
58} else {
59  open(FILE1, $filename1) || die "can't open $file1: $!\n";
60  $file1 = FILE1;
61}
62
63if ($filename2 eq "-") {
64  $file2 = STDIN;
65} else {
66  open(FILE2, $filename2) || die "can't open $file2: $!\n";
67  $file2 = FILE2;
68}
69
70my $file_offset1 = ffw($file1, $start1);
71my $file_offset2 = ffw($file2, $start2);
72
73$skip_first = 0;
74my (@buf1, @buf2, @printbuf1, @printbuf2);
75
76$Compare_Ahead = 0;
77
78while (!eof($file1) && !eof($file2)) {
79  my $line1 = <$file1>; chomp $line1;
80  my $line2 = <$file2>; chomp $line2;
81  my $printline1 = $line1;
82  my $printline2 = $line2;
83
84  push @buf1, $line1;
85  push @buf2, $line2;
86  push @printbuf1, $printline1;
87  push @printbuf2, $printline2;
88
89#  while ($Compare_Ahead < $Context_Lines) {
90#    $line1 = @buf1[$Compare_Ahead];
91#    $line2 = @buf2[$Compare_Ahead];
92#    $line2 =~ s/ *--.*$//;
93#    if ($line1 ne $line2) { last; }
94#    ++$Compare_Ahead;
95#  }
96
97  $line1 = @buf1[$Compare_Ahead];
98  $line2 = @buf2[$Compare_Ahead];
99  $line2 =~ s/ *--.*$//;
100
101  if ($line1 ne $line2) {
102    while (!eof($file1) && scalar(@buf1) < $diffsize) {
103      $line = <$file1>; chomp $line;
104      my $printline = $line;
105
106      push @printbuf1, $printline;
107      push @buf1, $line;
108    }
109    
110    while (!eof($file2) && scalar(@buf2) < $diffsize) {
111      $line = <$file2>; chomp $line;
112      my $printline = $line;
113#      $line =~ s/ *--.*$//;
114
115      push @printbuf2, $printline;
116      push @buf2, $line;
117    }
118
119    my $diffs = diff(\@buf1, \@buf2);
120
121    next unless @$diffs;
122
123    my @hunklist;
124    my ($hunk,$oldhunk);
125    # Loop over hunks. If a hunk overlaps with the last hunk, join them.
126    # Otherwise, print out the old one.
127    foreach my $piece (@$diffs) {
128      $hunk = new Hunk ($piece, $Context_Lines, scalar(@buf1));
129      next unless $oldhunk;
130
131      if ($hunk->does_overlap($oldhunk)) {
132	$hunk->prepend_hunk($oldhunk);
133      } else {
134	push @hunklist, $oldhunk;
135      }
136    } continue {
137      $oldhunk = $hunk;
138    }
139      
140    my $change = 0;
141    while (scalar(@hunklist) && !$change) {
142      $hunk = pop @hunklist;
143      $change = $hunk->{"change"};
144    }
145    push @hunklist, $hunk;
146    $last_start1 = $hunk->{"start1"};
147    $last_start2 = $hunk->{"start2"};
148    $last_end1 = $hunk->{"end1"};
149    $last_end2 = $hunk->{"end2"};
150
151    while (scalar(@hunklist)) {
152      $hunk = shift @hunklist;
153#      $hunk->output_diff(\@buf1, \@buf2);
154      $hunk->output_diff(\@printbuf1, \@printbuf2);
155    }
156
157    $last_end1 -=  $Context_Lines - 1;
158    $last_end2 -=  $Context_Lines - 1;
159    $file_offset1 += $last_end1;
160    $file_offset2 += $last_end2;
161    @printbuf1 = @printbuf1[$last_end1..$#printbuf1];
162    @printbuf2 = @printbuf2[$last_end2..$#printbuf2];
163    @buf1 = @buf1[$last_end1..$#buf1];
164    @buf2 = @buf2[$last_end2..$#buf2];
165    while (scalar(@buf1) > $Context_Lines &&
166	   scalar(@buf2) > $Context_Lines) {
167      $foo1 = @buf1[$Context_Lines];
168      $foo2 = @buf2[$Context_Lines];
169      if (scalar($foo1) != scalar($foo2) || $foo1 ne $foo2) { last; }
170      $foo1 = shift @printbuf1;
171      $foo2 = shift @printbuf2;
172      $foo1 = shift @buf1;
173      $foo2 = shift @buf2;
174      ++$file_offset1;
175      ++$file_offset2;
176    }
177  } else {
178    ++$file_offset1;
179    ++$file_offset2;
180    $foo1 = shift @printbuf1;
181    $foo2 = shift @printbuf2;
182    $foo1 = shift @buf1;
183    $foo2 = shift @buf2;
184  }
185}
186
187close $file1;
188close $file2;
189
190sub ffw() {
191  if (scalar(@_) != 2) { die "improper usage of ffw\n"; }
192
193  my $FILE = $_[0];
194  my $start = $_[1];
195  my $count = 0;
196
197  while ($start-- > 0 && !eof($FILE)) {
198    <$FILE>;
199    $count++;
200  }
201
202  if ($start > 0) {die "File too short for ffw amount\n"; }
203  return $count;
204}
205
206sub parse_filearg() {
207  $start = 0;
208  split /:/, @_[0];
209  if (scalar(@_) > 2) { usage(); }
210
211  $file = $_[0];
212  if (scalar(@_) > 1) { $start = $_[1]; }
213
214  return ($file, $start);
215}
216
217sub usage() {
218  printf "usage: $progname <file1>[:start] <file2>[:start]\n";
219  exit 1;
220}
221
222
223# Package Hunk. A Hunk is a group of Blocks which overlap because of the
224# context surrounding each block. (So if we're not using context, every
225# hunk will contain one block.)
226{
227package Hunk;
228
229sub new {
230# Arg1 is output from &LCS::diff (which corresponds to one Block)
231# Arg2 is the number of items (lines, e.g.,) of context around each block
232#
233# This subroutine changes $File_Length_Difference
234#
235# Fields in a Hunk:
236# blocks      - a list of Block objects
237# start       - index in file 1 where first block of the hunk starts
238# end         - index in file 1 where last block of the hunk ends
239#
240# Variables:
241# before_diff - how much longer file 2 is than file 1 due to all hunks
242#               until but NOT including this one
243# after_diff  - difference due to all hunks including this one
244    my ($class, $piece, $context_items, $maxlen) = @_;
245
246    my $block = new Block ($piece); # this modifies $FLD!
247
248    my $before_diff = $File_Length_Difference; # BEFORE this hunk
249    my $after_diff = $before_diff + $block->{"length_diff"};
250    $File_Length_Difference += $block->{"length_diff"};
251
252    # @remove_array and @insert_array hold the items to insert and remove
253    # Save the start & beginning of each array. If the array doesn't exist
254    # though (e.g., we're only adding items in this block), then figure
255    # out the line number based on the line number of the other file and
256    # the current difference in file lenghts
257    my @remove_array = $block->remove;
258    my @insert_array = $block->insert;
259    my ($a1, $a2, $b1, $b2, $start1, $start2, $end1, $end2, $change);
260    $a1 = @remove_array ? $remove_array[0 ]->{"item_no"} : -1;
261    $a2 = @remove_array ? $remove_array[-1]->{"item_no"} : -1;
262    $b1 = @insert_array ? $insert_array[0 ]->{"item_no"} : -1;
263    $b2 = @insert_array ? $insert_array[-1]->{"item_no"} : -1;
264
265    $start1 = $a1 == -1 ? $b1 - $before_diff : $a1;
266    $end1   = $a2 == -1 ? $b2 - $after_diff  : $a2;
267    $start2 = $b1 == -1 ? $a1 + $before_diff : $b1;
268    $end2   = $b2 == -1 ? $a2 + $after_diff  : $b2;
269    $change = scalar(@remove_array) && scalar(@insert_array);
270
271    # At first, a hunk will have just one Block in it
272    my $hunk = {
273		"start1" => $start1,
274		"start2" => $start2,
275		"end1" => $end1,
276		"end2" => $end2,
277		"maxlen" => $maxlen,
278		"change" => $change,
279		"blocks" => [$block],
280              };
281    bless $hunk, $class;
282
283    $hunk->flag_context($context_items);
284
285    return $hunk;
286}
287
288# Change the "start" and "end" fields to note that context should be added
289# to this hunk
290sub flag_context {
291    my ($hunk, $context_items) = @_;
292    return unless $context_items; # no context
293
294    # add context before
295    my $start1 = $hunk->{"start1"};
296    my $num_added = $context_items > $start1 ? $start1 : $context_items;
297    $hunk->{"start1"} -= $num_added;
298    $hunk->{"start2"} -= $num_added;
299
300    # context after
301    my $end1 = $hunk->{"end1"};
302    $num_added = ($end1+$context_items > $hunk->{"maxlen"}) ?
303                  $hunk->{"maxlen"} - $end1 :
304                  $context_items;
305    $hunk->{"end1"} += $num_added;
306    $hunk->{"end2"} += $num_added;
307}
308
309# Is there an overlap between hunk arg0 and old hunk arg1?
310# Note: if end of old hunk is one less than beginning of second, they overlap
311sub does_overlap {
312    my ($hunk, $oldhunk) = @_;
313    return "" unless $oldhunk; # first time through, $oldhunk is empty
314
315    # Do I actually need to test both?
316    return ($hunk->{"start1"} - $oldhunk->{"end1"} <= 1 ||
317            $hunk->{"start2"} - $oldhunk->{"end2"} <= 1);
318}
319
320# Prepend hunk arg1 to hunk arg0
321# Note that arg1 isn't updated! Only arg0 is.
322sub prepend_hunk {
323    my ($hunk, $oldhunk) = @_;
324
325    $hunk->{"start1"} = $oldhunk->{"start1"};
326    $hunk->{"start2"} = $oldhunk->{"start2"};
327
328    unshift (@{$hunk->{"blocks"}}, @{$oldhunk->{"blocks"}});
329}
330
331
332# DIFF OUTPUT ROUTINES. THESE ROUTINES CONTAIN DIFF FORMATTING INFO...
333sub output_diff {
334    if    (defined $main::opt_u) {&output_unified_diff(@_)}
335    elsif (defined $main::opt_c) {&output_context_diff(@_)}
336    else {die "unknown diff"}
337}
338
339sub output_unified_diff {
340    my ($hunk, $fileref1, $fileref2) = @_;
341    my @blocklist;
342
343    # Calculate item number range.
344    my $range1 = $hunk->unified_range(1, $file_offset1);
345    my $range2 = $hunk->unified_range(2, $file_offset2);
346    print "@@ -$range1 +$range2 @@\n";
347
348    # Outlist starts containing the hunk of file 1.
349    # Removing an item just means putting a '-' in front of it.
350    # Inserting an item requires getting it from file2 and splicing it in.
351    #    We splice in $num_added items. Remove blocks use $num_added because
352    # splicing changed the length of outlist.
353    #    We remove $num_removed items. Insert blocks use $num_removed because
354    # their item numbers---corresponding to positions in file *2*--- don't take
355    # removed items into account.
356    my $low = $hunk->{"start1"};
357    my $hi = $hunk->{"end1"};
358    my ($num_added, $num_removed) = (0,0);
359    my @outlist = @$fileref1[$low..$hi];
360    map {s/^/ /} @outlist; # assume it's just context
361
362    foreach my $block (@{$hunk->{"blocks"}}) {
363	foreach my $item ($block->remove) {
364	    my $op = $item->{"sign"}; # -
365	    my $offset = $item->{"item_no"} - $low + $num_added;
366	    $outlist[$offset] =~ s/^ /$op/;
367	    $num_removed++;
368	}
369	foreach my $item ($block->insert) {
370	    my $op = $item->{"sign"}; # +
371	    my $i = $item->{"item_no"};
372	    my $offset = $i - $hunk->{"start2"} + $num_removed;
373	    splice(@outlist,$offset,0,"$op$$fileref2[$i]");
374	    $num_added++;
375	}
376    }
377
378    map {s/$/\n/} @outlist; # add \n's
379    print @outlist;
380
381}
382
383sub output_context_diff {
384    my ($hunk, $fileref1, $fileref2) = @_;
385    my @blocklist;
386
387    print "***************\n";
388    # Calculate item number range.
389    my $range1 = $hunk->context_range(1, $file_offset1);
390    my $range2 = $hunk->context_range(2, $file_offset2);
391
392    # Print out file 1 part for each block in context diff format if there are
393    # any blocks that remove items
394    print "*** $range1 ****\n";
395    my $low = $hunk->{"start1"};
396    my $hi  = $hunk->{"end1"};
397    if (@blocklist = grep {$_->remove} @{$hunk->{"blocks"}}) {
398	my @outlist = @$fileref1[$low..$hi];
399	map {s/^/  /} @outlist; # assume it's just context
400	foreach my $block (@blocklist) {
401	    my $op = $block->op; # - or !
402	    foreach my $item ($block->remove) {
403		$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
404	    }
405	}
406	map {s/$/\n/} @outlist; # add \n's
407	print @outlist;
408    }
409
410    print "--- $range2 ----\n";
411    $low = $hunk->{"start2"};
412    $hi  = $hunk->{"end2"};
413    if (@blocklist = grep {$_->insert} @{$hunk->{"blocks"}}) {
414	my @outlist = @$fileref2[$low..$hi];
415	map {s/^/  /} @outlist; # assume it's just context
416	foreach my $block (@blocklist) {
417	    my $op = $block->op; # + or !
418	    foreach my $item ($block->insert) {
419		$outlist[$item->{"item_no"} - $low] =~ s/^ /$op/;
420	    }
421	}
422	map {s/$/\n/} @outlist; # add \n's
423	print @outlist;
424    }
425}
426
427sub context_range {
428# Generate a range of item numbers to print. Only print 1 number if the range
429# has only one item in it. Otherwise, it's 'start,end'
430    my ($hunk, $flag, $offset) = @_;
431    my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
432
433    # index from 1, not zero
434    $start += $offset + 1;
435    $end += $offset + 1;
436    my $range = ($start < $end) ? "$start,$end" : $end;
437    return $range;
438}
439
440sub unified_range {
441# Generate a range of item numbers to print for unified diff
442# Print number where block starts, followed by number of lines in the block
443# (don't print number of lines if it's 1)
444    my ($hunk, $flag, $offset) = @_;
445    my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
446
447    # index from 1, not zero
448    $start += $offset + 1;
449    $end += $offset + 1;
450    my $length = $end - $start + 1;
451    my $first = $length < 2 ? $end : $start; # strange, but correct...
452    my $range = $length== 1 ? $first : "$first,$length";
453    return $range;
454}
455} # end Package Hunk
456
457# Package Block. A block is an operation removing, adding, or changing
458# a group of items. Basically, this is just a list of changes, where each
459# change adds or deletes a single item.
460# (Change could be a separate class, but it didn't seem worth it)
461{
462package Block;
463sub new {
464# Input is a chunk from &Algorithm::LCS::diff
465# Fields in a block:
466# length_diff - how much longer file 2 is than file 1 due to this block
467# Each change has:
468# sign        - '+' for insert, '-' for remove
469# item_no     - number of the item in the file (e.g., line number)
470# We don't bother storing the text of the item
471#
472    my ($class,$chunk) = @_;
473    my @changes = ();
474
475# This just turns each change into a hash.
476    foreach my $item (@$chunk) {
477	my ($sign, $item_no, $text) = @$item;
478	my $hashref = {"sign" => $sign, "item_no" => $item_no};
479	push @changes, $hashref;
480    }
481
482    my $block = { "changes" => \@changes };
483    bless $block, $class;
484
485    $block->{"length_diff"} = $block->insert - $block->remove;
486    return $block;
487}
488
489
490# LOW LEVEL FUNCTIONS
491sub op {
492# what kind of block is this?
493    my $block = shift;
494    my $insert = $block->insert;
495    my $remove = $block->remove;
496
497    $remove && $insert and return '!';
498    $remove and return '-';
499    $insert and return '+';
500    warn "unknown block type";
501    return '^'; # context block
502}
503
504# Returns a list of the changes in this block that remove items
505# (or the number of removals if called in scalar context)
506sub remove { return grep {$_->{"sign"} eq '-'} @{shift->{"changes"}}; }
507
508# Returns a list of the changes in this block that insert items
509sub insert { return grep {$_->{"sign"} eq '+'} @{shift->{"changes"}}; }
510
511} # end of package Block
512