| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package HTML::Diff; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | our $VERSION = '0.59'; | 
| 4 |  |  |  |  |  |  |  | 
| 5 | 1 |  |  | 1 |  | 58994 | use 5.006; | 
|  | 1 |  |  |  |  | 3 |  | 
| 6 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 23 |  | 
| 7 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 6 |  | 
|  | 1 |  |  |  |  | 33 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 | 1 |  |  | 1 |  | 5 | use Exporter; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 110 |  | 
| 10 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 11 |  |  |  |  |  |  | our @EXPORT = qw(line_diff word_diff html_word_diff); | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | # This list of tags is taken from the XHTML spec and includes | 
| 14 |  |  |  |  |  |  | # all those for which no closing tag is expected. In addition | 
| 15 |  |  |  |  |  |  | # the pattern below matches any tag which ends with a slash / | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link | 
| 18 |  |  |  |  |  |  | col colgroup frame input isindex area | 
| 19 |  |  |  |  |  |  | embed img bgsound marquee); | 
| 20 |  |  |  |  |  |  |  | 
| 21 | 1 |  |  | 1 |  | 1111 | use Algorithm::Diff 'sdiff'; | 
|  | 1 |  |  |  |  | 6588 |  | 
|  | 1 |  |  |  |  | 1580 |  | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | sub member { | 
| 24 | 122 |  |  | 122 | 0 | 434 | my ($item, @list) = @_; | 
| 25 |  |  |  |  |  |  |  | 
| 26 | 122 |  |  |  |  | 177 | return scalar(grep {$_ eq $item} @list); | 
|  | 2196 |  |  |  |  | 3968 |  | 
| 27 |  |  |  |  |  |  | } | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | sub html_word_diff { | 
| 30 | 10 |  |  | 10 | 0 | 2765 | my ($left, $right) = @_; | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | # Split the two texts into words and tags. | 
| 33 | 10 |  |  |  |  | 194 | my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm; | 
| 34 | 10 |  |  |  |  | 180 | my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm; | 
| 35 |  |  |  |  |  |  |  | 
| 36 | 10 | 100 |  |  |  | 24 | @leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } | 
|  | 149 |  |  |  |  | 886 |  | 
| 37 |  |  |  |  |  |  | @leftchks; | 
| 38 | 10 | 100 |  |  |  | 67 | @rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } | 
|  | 132 |  |  |  |  | 764 |  | 
| 39 |  |  |  |  |  |  | @rightchks; | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | # Remove blanks; maybe the above regexes could handle this? | 
| 42 | 10 |  |  |  |  | 62 | @leftchks = grep { $_ ne '' } @leftchks; | 
|  | 588 |  |  |  |  | 1048 |  | 
| 43 | 10 |  |  |  |  | 38 | @rightchks = grep { $_ ne '' } @rightchks; | 
|  | 531 |  |  |  |  | 909 |  | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | # Now we process each segment by turning it into a pair. The first element | 
| 46 |  |  |  |  |  |  | # is the text as we want it to read in the result. The second element is | 
| 47 |  |  |  |  |  |  | # the value we will to use in comparisons. It contains an identifier | 
| 48 |  |  |  |  |  |  | # for each of the balanced tags that it lies within. | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | # This subroutine holds state in the tagstack variable | 
| 51 | 10 |  |  |  |  | 38 | my $tagstack = []; | 
| 52 |  |  |  |  |  |  | my $smear_tags = sub { | 
| 53 | 1119 | 100 |  | 1119 |  | 2173 | if ($_ =~ /^<.*>/) { | 
| 54 | 160 | 100 |  |  |  | 339 | if ($_ =~ m|^|) { | 
| 55 | 38 |  |  |  |  | 99 | my ($tag) = m|^\s*([^ \t\n\r>]*)|; | 
| 56 | 38 |  |  |  |  | 57 | $tag = lc $tag; | 
| 57 |  |  |  |  |  |  | #                print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n"; | 
| 58 |  |  |  |  |  |  | # If we found the closer for the tag on top | 
| 59 |  |  |  |  |  |  | # of the stack, pop it off. | 
| 60 | 38 | 100 | 66 |  |  | 213 | if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) { | 
| 61 | 36 |  |  |  |  | 56 | my $stacktag = pop @$tagstack; | 
| 62 |  |  |  |  |  |  | } | 
| 63 | 38 |  |  |  |  | 115 | return [$_, $tag]; | 
| 64 |  |  |  |  |  |  | } else { | 
| 65 | 122 |  |  |  |  | 332 | my ($tag) = m|^<\s*([^\s>]*)|; | 
| 66 | 122 |  |  |  |  | 179 | $tag = lc $tag; | 
| 67 |  |  |  |  |  |  | #                print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n"; | 
| 68 | 122 | 100 | 66 |  |  | 246 | if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#) | 
| 69 |  |  |  |  |  |  | {	                # (tags without correspond closer tags) | 
| 70 | 86 |  |  |  |  | 397 | return [$_, $tag]; | 
| 71 |  |  |  |  |  |  | } else { | 
| 72 | 36 |  |  |  |  | 70 | push @$tagstack, $tag; | 
| 73 |  |  |  |  |  |  | } | 
| 74 | 36 |  |  |  |  | 83 | return [$_, $_]; | 
| 75 |  |  |  |  |  |  | } | 
| 76 |  |  |  |  |  |  | } else { | 
| 77 | 959 |  |  |  |  | 2240 | my $result = [$_, (join "!!!", (@$tagstack, $_)) ]; | 
| 78 | 959 |  |  |  |  | 1697 | return $result; | 
| 79 |  |  |  |  |  |  | } | 
| 80 | 10 |  |  |  |  | 49 | }; | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | # Now do the "smear tags" operation across each of the chunk-lists | 
| 83 | 10 |  |  |  |  | 17 | $tagstack = []; | 
| 84 | 10 |  |  |  |  | 19 | @leftchks = map { &$smear_tags } @leftchks; | 
|  | 588 |  |  |  |  | 851 |  | 
| 85 |  |  |  |  |  |  | # TBD: better modularity would preclude having to reset the stack | 
| 86 | 10 |  |  |  |  | 49 | $tagstack = []; | 
| 87 | 10 |  |  |  |  | 20 | @rightchks = map { &$smear_tags } @rightchks; | 
|  | 531 |  |  |  |  | 774 |  | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | #    print STDERR Data::Dumper::Dumper(\@leftchks); | 
| 90 |  |  |  |  |  |  | #    print STDERR Data::Dumper::Dumper(\@rightchks); | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | # Now do the diff, using the "comparison" half of the pair to | 
| 93 |  |  |  |  |  |  | # compare two chuncks. | 
| 94 |  |  |  |  |  |  | my $chunks = sdiff(\@leftchks, \@rightchks, | 
| 95 | 10 |  |  | 1141 |  | 80 | sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ }); | 
|  | 1141 |  |  |  |  | 19874 |  | 
|  | 1141 |  |  |  |  | 2950 |  | 
|  | 1141 |  |  |  |  | 2609 |  | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | #    print STDERR Data::Dumper::Dumper($chunks); | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | # Finally, process the output of sdiff by concatenating | 
| 100 |  |  |  |  |  |  | # consecutive chunks that were "unchanged." | 
| 101 | 10 |  |  |  |  | 5727 | my $lastsignal = ''; | 
| 102 | 10 |  |  |  |  | 15 | my $lbuf = ""; | 
| 103 | 10 |  |  |  |  | 14 | my $rbuf = ""; | 
| 104 | 10 |  |  |  |  | 12 | my @result; | 
| 105 |  |  |  |  |  |  | my $ch; | 
| 106 | 10 |  |  |  |  | 18 | foreach $ch (@$chunks) { | 
| 107 | 594 |  |  |  |  | 1005 | my ($signal, $left, $right) = @$ch; | 
| 108 | 594 | 100 | 100 |  |  | 1551 | if ($signal ne $lastsignal && $lastsignal ne '') { | 
| 109 | 55 | 100 | 100 |  |  | 172 | if ($signal ne 'u' && $lastsignal ne 'u') { | 
| 110 | 7 |  |  |  |  | 9 | $signal = 'c'; | 
| 111 |  |  |  |  |  |  | } else { | 
| 112 | 48 |  |  |  |  | 106 | push @result, [$lastsignal, $lbuf, $rbuf]; | 
| 113 | 48 |  |  |  |  | 77 | $lbuf = ""; | 
| 114 | 48 |  |  |  |  | 62 | $rbuf = ""; | 
| 115 |  |  |  |  |  |  | } | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  | # 	if ($signal eq 'u' && $lastsignal ne 'u') { | 
| 118 |  |  |  |  |  |  | # 	    push @result, [$lastsignal, $lbuf, $rbuf] | 
| 119 |  |  |  |  |  |  | # 		unless $lastsignal eq ''; | 
| 120 |  |  |  |  |  |  | # 	    $lbuf = ""; | 
| 121 |  |  |  |  |  |  | # 	    $rbuf = ""; | 
| 122 |  |  |  |  |  |  | # 	} elsif ($signal ne 'u' && $lastsignal eq 'u') { | 
| 123 |  |  |  |  |  |  | # 	    push @result, [$lastsignal, $lbuf, $rbuf]; | 
| 124 |  |  |  |  |  |  | # 	    $lbuf = ""; | 
| 125 |  |  |  |  |  |  | # 	    $rbuf = ""; | 
| 126 |  |  |  |  |  |  | # 	} | 
| 127 | 594 |  |  |  |  | 927 | my $lelem = elem_mkp($left); | 
| 128 | 594 |  |  |  |  | 1033 | my $relem = elem_mkp($right); | 
| 129 | 594 | 100 |  |  |  | 1060 | $lbuf .= (defined $lelem ? $lelem : ''); | 
| 130 | 594 | 100 |  |  |  | 1055 | $rbuf .= (defined $relem ? $relem : ''); | 
| 131 | 594 |  |  |  |  | 1019 | $lastsignal = $signal; | 
| 132 |  |  |  |  |  |  | } | 
| 133 | 10 |  |  |  |  | 26 | push @result, [$lastsignal, $lbuf, $rbuf]; | 
| 134 | 10 |  |  |  |  | 480 | return \@result; | 
| 135 |  |  |  |  |  |  | } | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | # these are like "accessors" for the two halves of the diff-chunk pairs | 
| 138 |  |  |  |  |  |  | sub elem_mkp { | 
| 139 | 1188 |  |  | 1188 | 0 | 1309 | my ($e) = @_; | 
| 140 | 1188 | 100 |  |  |  | 2190 | return undef unless ref $e eq 'ARRAY'; | 
| 141 | 1119 |  |  |  |  | 1481 | my ($mkp, $cmp) = @$e; | 
| 142 | 1119 |  |  |  |  | 1812 | return $mkp; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  | sub elem_cmprsn { | 
| 146 | 1141 |  |  | 1141 | 0 | 1383 | my ($e) = @_; | 
| 147 | 1141 | 50 |  |  |  | 2263 | return undef unless ref $e eq 'ARRAY'; | 
| 148 | 1141 |  |  |  |  | 1665 | my ($mkp, $cmp) = @$e; | 
| 149 | 1141 |  |  |  |  | 2092 | return $cmp; | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | # Finally a couple of non-HTML diff routines | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | sub line_diff { | 
| 155 | 1 |  |  | 1 | 0 | 319 | my ($left, $right) = @_; | 
| 156 | 1 |  |  |  |  | 19 | my (@leftchks) = $left =~ m/(.*\n?)/gm; | 
| 157 | 1 |  |  |  |  | 14 | my (@rightchks) = $right =~ m/(.*\n?)/gm; | 
| 158 | 1 |  |  |  |  | 4 | my $result = sdiff(\@leftchks, \@rightchks); | 
| 159 |  |  |  |  |  |  | #    my @result = map { [ $_->[1], $_->[2] ] } @$result; | 
| 160 | 1 |  |  |  |  | 268 | return $result; | 
| 161 |  |  |  |  |  |  | } | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | sub word_diff { | 
| 164 | 0 |  |  | 0 | 0 |  | my ($left, $right) = @_; | 
| 165 | 0 |  |  |  |  |  | my (@leftchks) = $left =~ m/([^\s]*\s?)/gm; | 
| 166 | 0 |  |  |  |  |  | my (@rightchks) = $right =~ m/([^\s]*\s?)/gm; | 
| 167 |  |  |  |  |  |  |  | 
| 168 | 0 |  |  |  |  |  | my $result = sdiff(\@leftchks, \@rightchks); | 
| 169 | 0 |  |  |  |  |  | my @result = (map { [ $_->[1], $_->[2] ] } @$result); | 
|  | 0 |  |  |  |  |  |  | 
| 170 | 0 |  |  |  |  |  | return $result; | 
| 171 |  |  |  |  |  |  | } | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | 1; | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | =head1 NAME | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | HTML::Diff - compare two HTML strings and return a list of differences | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | use HTML::Diff; | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | $result = html_word_diff($left_text, $right_text); | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | This module compares two strings of HTML and returns a list of a | 
| 188 |  |  |  |  |  |  | chunks which indicate the diff between the two input strings, where | 
| 189 |  |  |  |  |  |  | changes in formatting are considered changes. | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | HTML::Diff does not strictly parse the HTML. Instead, it uses regular | 
| 192 |  |  |  |  |  |  | expressions to make a decent effort at understanding the given HTML. | 
| 193 |  |  |  |  |  |  | As a result, there are many valid HTML documents for which it will not | 
| 194 |  |  |  |  |  |  | produce the correct answer. But there may be some invalid HTML | 
| 195 |  |  |  |  |  |  | documents for which it gives you the answer you're looking for. Your | 
| 196 |  |  |  |  |  |  | mileage may vary; test it on lots of inputs from your domain before | 
| 197 |  |  |  |  |  |  | relying on it. | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | Returns a reference to a list of triples [, , ]. | 
| 200 |  |  |  |  |  |  | Each triple represents a check of the input texts. The flag tells you | 
| 201 |  |  |  |  |  |  | whether it represents a deletion, insertion, a modification, or an | 
| 202 |  |  |  |  |  |  | unchanged chunk. | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | Every character of each input text is accounted for by some triple in | 
| 205 |  |  |  |  |  |  | the output. Specifically, Concatenating all the  members from | 
| 206 |  |  |  |  |  |  | the return value should produce C<$left_text>, and likewise the | 
| 207 |  |  |  |  |  |  | members concatenate together to produce C<$right_text>. | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | The  is either C<'u'>, C<'+'>, C<'-'>, or C<'c'>, indicating | 
| 210 |  |  |  |  |  |  | whether the two chunks are the same, the $right_text contained this | 
| 211 |  |  |  |  |  |  | chunk and the left chunk didn't, or vice versa, or the two chunks are | 
| 212 |  |  |  |  |  |  | simply different. This follows the usage of Algorithm::Diff. | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | The difference is computed on a word-by-word basis, "breaking" on | 
| 215 |  |  |  |  |  |  | visible words in the HTML text. If a tag only is changed, it will not | 
| 216 |  |  |  |  |  |  | be returned as an independent chunk but will be shown as a change to | 
| 217 |  |  |  |  |  |  | one of the neighboring words. For balanced tags, such as  , it | 
| 218 |  |  |  |  |  |  | is intended that a change to the tag will be treated as a change to | 
| 219 |  |  |  |  |  |  | all words in between. | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | L provides the diff algorithm used in this module. | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | L can generate a diff between two XML files, and also | 
| 226 |  |  |  |  |  |  | patch an XML file, given a diff. | 
| 227 |  |  |  |  |  |  |  | 
| 228 |  |  |  |  |  |  | =head1 REPOSITORY | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | L | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | =head1 AUTHOR | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | Whipped up by Ezra elias kilty Cooper, Eezra@ezrakilty.netE. | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | Patch contributed by Adam Easjo@koldfront.dkE. | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | Copyright 2003-2014 by Ezra elias kilty Cooper, Eezra@ezrakilty.netE | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | This program is free software; you can redistribute it and/or | 
| 243 |  |  |  |  |  |  | modify it under the same terms as Perl itself. | 
| 244 |  |  |  |  |  |  |  | 
| 245 |  |  |  |  |  |  | =cut |