line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Diff; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.59'; |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
58994
|
use 5.006; |
|
1
|
|
|
|
|
3
|
|
6
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
7
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
33
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
110
|
|
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
11
|
|
|
|
|
|
|
our @EXPORT = qw(line_diff word_diff html_word_diff); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# This list of tags is taken from the XHTML spec and includes |
14
|
|
|
|
|
|
|
# all those for which no closing tag is expected. In addition |
15
|
|
|
|
|
|
|
# the pattern below matches any tag which ends with a slash / |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link |
18
|
|
|
|
|
|
|
col colgroup frame input isindex area |
19
|
|
|
|
|
|
|
embed img bgsound marquee); |
20
|
|
|
|
|
|
|
|
21
|
1
|
|
|
1
|
|
1111
|
use Algorithm::Diff 'sdiff'; |
|
1
|
|
|
|
|
6588
|
|
|
1
|
|
|
|
|
1580
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub member { |
24
|
122
|
|
|
122
|
0
|
434
|
my ($item, @list) = @_; |
25
|
|
|
|
|
|
|
|
26
|
122
|
|
|
|
|
177
|
return scalar(grep {$_ eq $item} @list); |
|
2196
|
|
|
|
|
3968
|
|
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub html_word_diff { |
30
|
10
|
|
|
10
|
0
|
2765
|
my ($left, $right) = @_; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Split the two texts into words and tags. |
33
|
10
|
|
|
|
|
194
|
my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm; |
34
|
10
|
|
|
|
|
180
|
my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm; |
35
|
|
|
|
|
|
|
|
36
|
10
|
100
|
|
|
|
24
|
@leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } |
|
149
|
|
|
|
|
886
|
|
37
|
|
|
|
|
|
|
@leftchks; |
38
|
10
|
100
|
|
|
|
67
|
@rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } |
|
132
|
|
|
|
|
764
|
|
39
|
|
|
|
|
|
|
@rightchks; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Remove blanks; maybe the above regexes could handle this? |
42
|
10
|
|
|
|
|
62
|
@leftchks = grep { $_ ne '' } @leftchks; |
|
588
|
|
|
|
|
1048
|
|
43
|
10
|
|
|
|
|
38
|
@rightchks = grep { $_ ne '' } @rightchks; |
|
531
|
|
|
|
|
909
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# Now we process each segment by turning it into a pair. The first element |
46
|
|
|
|
|
|
|
# is the text as we want it to read in the result. The second element is |
47
|
|
|
|
|
|
|
# the value we will to use in comparisons. It contains an identifier |
48
|
|
|
|
|
|
|
# for each of the balanced tags that it lies within. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# This subroutine holds state in the tagstack variable |
51
|
10
|
|
|
|
|
38
|
my $tagstack = []; |
52
|
|
|
|
|
|
|
my $smear_tags = sub { |
53
|
1119
|
100
|
|
1119
|
|
2173
|
if ($_ =~ /^<.*>/) { |
54
|
160
|
100
|
|
|
|
339
|
if ($_ =~ m|^|) { |
55
|
38
|
|
|
|
|
99
|
my ($tag) = m|^\s*([^ \t\n\r>]*)|; |
56
|
38
|
|
|
|
|
57
|
$tag = lc $tag; |
57
|
|
|
|
|
|
|
# print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n"; |
58
|
|
|
|
|
|
|
# If we found the closer for the tag on top |
59
|
|
|
|
|
|
|
# of the stack, pop it off. |
60
|
38
|
100
|
66
|
|
|
213
|
if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) { |
61
|
36
|
|
|
|
|
56
|
my $stacktag = pop @$tagstack; |
62
|
|
|
|
|
|
|
} |
63
|
38
|
|
|
|
|
115
|
return [$_, $tag]; |
64
|
|
|
|
|
|
|
} else { |
65
|
122
|
|
|
|
|
332
|
my ($tag) = m|^<\s*([^\s>]*)|; |
66
|
122
|
|
|
|
|
179
|
$tag = lc $tag; |
67
|
|
|
|
|
|
|
# print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n"; |
68
|
122
|
100
|
66
|
|
|
246
|
if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#) |
69
|
|
|
|
|
|
|
{ # (tags without correspond closer tags) |
70
|
86
|
|
|
|
|
397
|
return [$_, $tag]; |
71
|
|
|
|
|
|
|
} else { |
72
|
36
|
|
|
|
|
70
|
push @$tagstack, $tag; |
73
|
|
|
|
|
|
|
} |
74
|
36
|
|
|
|
|
83
|
return [$_, $_]; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
} else { |
77
|
959
|
|
|
|
|
2240
|
my $result = [$_, (join "!!!", (@$tagstack, $_)) ]; |
78
|
959
|
|
|
|
|
1697
|
return $result; |
79
|
|
|
|
|
|
|
} |
80
|
10
|
|
|
|
|
49
|
}; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Now do the "smear tags" operation across each of the chunk-lists |
83
|
10
|
|
|
|
|
17
|
$tagstack = []; |
84
|
10
|
|
|
|
|
19
|
@leftchks = map { &$smear_tags } @leftchks; |
|
588
|
|
|
|
|
851
|
|
85
|
|
|
|
|
|
|
# TBD: better modularity would preclude having to reset the stack |
86
|
10
|
|
|
|
|
49
|
$tagstack = []; |
87
|
10
|
|
|
|
|
20
|
@rightchks = map { &$smear_tags } @rightchks; |
|
531
|
|
|
|
|
774
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# print STDERR Data::Dumper::Dumper(\@leftchks); |
90
|
|
|
|
|
|
|
# print STDERR Data::Dumper::Dumper(\@rightchks); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Now do the diff, using the "comparison" half of the pair to |
93
|
|
|
|
|
|
|
# compare two chuncks. |
94
|
|
|
|
|
|
|
my $chunks = sdiff(\@leftchks, \@rightchks, |
95
|
10
|
|
|
1141
|
|
80
|
sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ }); |
|
1141
|
|
|
|
|
19874
|
|
|
1141
|
|
|
|
|
2950
|
|
|
1141
|
|
|
|
|
2609
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# print STDERR Data::Dumper::Dumper($chunks); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Finally, process the output of sdiff by concatenating |
100
|
|
|
|
|
|
|
# consecutive chunks that were "unchanged." |
101
|
10
|
|
|
|
|
5727
|
my $lastsignal = ''; |
102
|
10
|
|
|
|
|
15
|
my $lbuf = ""; |
103
|
10
|
|
|
|
|
14
|
my $rbuf = ""; |
104
|
10
|
|
|
|
|
12
|
my @result; |
105
|
|
|
|
|
|
|
my $ch; |
106
|
10
|
|
|
|
|
18
|
foreach $ch (@$chunks) { |
107
|
594
|
|
|
|
|
1005
|
my ($signal, $left, $right) = @$ch; |
108
|
594
|
100
|
100
|
|
|
1551
|
if ($signal ne $lastsignal && $lastsignal ne '') { |
109
|
55
|
100
|
100
|
|
|
172
|
if ($signal ne 'u' && $lastsignal ne 'u') { |
110
|
7
|
|
|
|
|
9
|
$signal = 'c'; |
111
|
|
|
|
|
|
|
} else { |
112
|
48
|
|
|
|
|
106
|
push @result, [$lastsignal, $lbuf, $rbuf]; |
113
|
48
|
|
|
|
|
77
|
$lbuf = ""; |
114
|
48
|
|
|
|
|
62
|
$rbuf = ""; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
# if ($signal eq 'u' && $lastsignal ne 'u') { |
118
|
|
|
|
|
|
|
# push @result, [$lastsignal, $lbuf, $rbuf] |
119
|
|
|
|
|
|
|
# unless $lastsignal eq ''; |
120
|
|
|
|
|
|
|
# $lbuf = ""; |
121
|
|
|
|
|
|
|
# $rbuf = ""; |
122
|
|
|
|
|
|
|
# } elsif ($signal ne 'u' && $lastsignal eq 'u') { |
123
|
|
|
|
|
|
|
# push @result, [$lastsignal, $lbuf, $rbuf]; |
124
|
|
|
|
|
|
|
# $lbuf = ""; |
125
|
|
|
|
|
|
|
# $rbuf = ""; |
126
|
|
|
|
|
|
|
# } |
127
|
594
|
|
|
|
|
927
|
my $lelem = elem_mkp($left); |
128
|
594
|
|
|
|
|
1033
|
my $relem = elem_mkp($right); |
129
|
594
|
100
|
|
|
|
1060
|
$lbuf .= (defined $lelem ? $lelem : ''); |
130
|
594
|
100
|
|
|
|
1055
|
$rbuf .= (defined $relem ? $relem : ''); |
131
|
594
|
|
|
|
|
1019
|
$lastsignal = $signal; |
132
|
|
|
|
|
|
|
} |
133
|
10
|
|
|
|
|
26
|
push @result, [$lastsignal, $lbuf, $rbuf]; |
134
|
10
|
|
|
|
|
480
|
return \@result; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# these are like "accessors" for the two halves of the diff-chunk pairs |
138
|
|
|
|
|
|
|
sub elem_mkp { |
139
|
1188
|
|
|
1188
|
0
|
1309
|
my ($e) = @_; |
140
|
1188
|
100
|
|
|
|
2190
|
return undef unless ref $e eq 'ARRAY'; |
141
|
1119
|
|
|
|
|
1481
|
my ($mkp, $cmp) = @$e; |
142
|
1119
|
|
|
|
|
1812
|
return $mkp; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub elem_cmprsn { |
146
|
1141
|
|
|
1141
|
0
|
1383
|
my ($e) = @_; |
147
|
1141
|
50
|
|
|
|
2263
|
return undef unless ref $e eq 'ARRAY'; |
148
|
1141
|
|
|
|
|
1665
|
my ($mkp, $cmp) = @$e; |
149
|
1141
|
|
|
|
|
2092
|
return $cmp; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Finally a couple of non-HTML diff routines |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub line_diff { |
155
|
1
|
|
|
1
|
0
|
319
|
my ($left, $right) = @_; |
156
|
1
|
|
|
|
|
19
|
my (@leftchks) = $left =~ m/(.*\n?)/gm; |
157
|
1
|
|
|
|
|
14
|
my (@rightchks) = $right =~ m/(.*\n?)/gm; |
158
|
1
|
|
|
|
|
4
|
my $result = sdiff(\@leftchks, \@rightchks); |
159
|
|
|
|
|
|
|
# my @result = map { [ $_->[1], $_->[2] ] } @$result; |
160
|
1
|
|
|
|
|
268
|
return $result; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub word_diff { |
164
|
0
|
|
|
0
|
0
|
|
my ($left, $right) = @_; |
165
|
0
|
|
|
|
|
|
my (@leftchks) = $left =~ m/([^\s]*\s?)/gm; |
166
|
0
|
|
|
|
|
|
my (@rightchks) = $right =~ m/([^\s]*\s?)/gm; |
167
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
my $result = sdiff(\@leftchks, \@rightchks); |
169
|
0
|
|
|
|
|
|
my @result = (map { [ $_->[1], $_->[2] ] } @$result); |
|
0
|
|
|
|
|
|
|
170
|
0
|
|
|
|
|
|
return $result; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
1; |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 NAME |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
HTML::Diff - compare two HTML strings and return a list of differences |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 SYNOPSIS |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
use HTML::Diff; |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
$result = html_word_diff($left_text, $right_text); |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head1 DESCRIPTION |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
This module compares two strings of HTML and returns a list of a |
188
|
|
|
|
|
|
|
chunks which indicate the diff between the two input strings, where |
189
|
|
|
|
|
|
|
changes in formatting are considered changes. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
HTML::Diff does not strictly parse the HTML. Instead, it uses regular |
192
|
|
|
|
|
|
|
expressions to make a decent effort at understanding the given HTML. |
193
|
|
|
|
|
|
|
As a result, there are many valid HTML documents for which it will not |
194
|
|
|
|
|
|
|
produce the correct answer. But there may be some invalid HTML |
195
|
|
|
|
|
|
|
documents for which it gives you the answer you're looking for. Your |
196
|
|
|
|
|
|
|
mileage may vary; test it on lots of inputs from your domain before |
197
|
|
|
|
|
|
|
relying on it. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Returns a reference to a list of triples [, , ]. |
200
|
|
|
|
|
|
|
Each triple represents a check of the input texts. The flag tells you |
201
|
|
|
|
|
|
|
whether it represents a deletion, insertion, a modification, or an |
202
|
|
|
|
|
|
|
unchanged chunk. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Every character of each input text is accounted for by some triple in |
205
|
|
|
|
|
|
|
the output. Specifically, Concatenating all the members from |
206
|
|
|
|
|
|
|
the return value should produce C<$left_text>, and likewise the |
207
|
|
|
|
|
|
|
members concatenate together to produce C<$right_text>. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
The is either C<'u'>, C<'+'>, C<'-'>, or C<'c'>, indicating |
210
|
|
|
|
|
|
|
whether the two chunks are the same, the $right_text contained this |
211
|
|
|
|
|
|
|
chunk and the left chunk didn't, or vice versa, or the two chunks are |
212
|
|
|
|
|
|
|
simply different. This follows the usage of Algorithm::Diff. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
The difference is computed on a word-by-word basis, "breaking" on |
215
|
|
|
|
|
|
|
visible words in the HTML text. If a tag only is changed, it will not |
216
|
|
|
|
|
|
|
be returned as an independent chunk but will be shown as a change to |
217
|
|
|
|
|
|
|
one of the neighboring words. For balanced tags, such as , it |
218
|
|
|
|
|
|
|
is intended that a change to the tag will be treated as a change to |
219
|
|
|
|
|
|
|
all words in between. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 SEE ALSO |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
L provides the diff algorithm used in this module. |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
L can generate a diff between two XML files, and also |
226
|
|
|
|
|
|
|
patch an XML file, given a diff. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head1 REPOSITORY |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
L |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=head1 AUTHOR |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
Whipped up by Ezra elias kilty Cooper, Eezra@ezrakilty.netE. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Patch contributed by Adam Easjo@koldfront.dkE. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
Copyright 2003-2014 by Ezra elias kilty Cooper, Eezra@ezrakilty.netE |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or |
243
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=cut |