File Coverage

blib/lib/HTML/Diff.pm

Criterion	Covered	Total	%
statement	87	94	92.5
branch	23	24	95.8
condition	10	12	83.3
subroutine	12	13	92.3
pod	0	6	0.0
total	132	149	88.5

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Diff;
2
3							our $VERSION = '0.59';
4
5	1			1		58994	use 5.006;
	1					3
6	1			1		5	use strict;
	1					2
	1					23
7	1			1		5	use warnings;
	1					6
	1					33
8
9	1			1		5	use Exporter;
	1					2
	1					110
10							our @ISA = qw(Exporter);
11							our @EXPORT = qw(line_diff word_diff html_word_diff);
12
13							# This list of tags is taken from the XHTML spec and includes
14							# all those for which no closing tag is expected. In addition
15							# the pattern below matches any tag which ends with a slash /
16
17							our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link
18							col colgroup frame input isindex area
19							embed img bgsound marquee);
20
21	1			1		1111	use Algorithm::Diff 'sdiff';
	1					6588
	1					1580
22
23							sub member {
24	122			122	0	434	my ($item, @list) = @_;
25
26	122					177	return scalar(grep {$_ eq $item} @list);
	2196					3968
27							}
28
29							sub html_word_diff {
30	10			10	0	2765	my ($left, $right) = @_;
31
32							# Split the two texts into words and tags.
33	10					194	my (@leftchks) = $left =~ m/(<[^>]>\s\|[^<]+)/gm;
34	10					180	my (@rightchks) = $right =~ m/(<[^>]>\s\|[^<]+)/gm;
35
36	10	100				24	@leftchks = map { $_ =~ /^<[^>]>$/ ? $_ : ($_ =~ m/(\S+\s)/gm) }
	149					886
37							@leftchks;
38	10	100				67	@rightchks = map { $_ =~ /^<[^>]>$/ ? $_ : ($_ =~ m/(\S+\s)/gm) }
	132					764
39							@rightchks;
40
41							# Remove blanks; maybe the above regexes could handle this?
42	10					62	@leftchks = grep { $_ ne '' } @leftchks;
	588					1048
43	10					38	@rightchks = grep { $_ ne '' } @rightchks;
	531					909
44
45							# Now we process each segment by turning it into a pair. The first element
46							# is the text as we want it to read in the result. The second element is
47							# the value we will to use in comparisons. It contains an identifier
48							# for each of the balanced tags that it lies within.
49
50							# This subroutine holds state in the tagstack variable
51	10					38	my $tagstack = [];
52							my $smear_tags = sub {
53	1119	100		1119		2173	if ($_ =~ /^<.*>/) {
54	160	100				339	if ($_ =~ m\|^
55	38					99	my ($tag) = m\|^]*)\|;
56	38					57	$tag = lc $tag;
57							# print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n";
58							# If we found the closer for the tag on top
59							# of the stack, pop it off.
60	38	100	66			213	if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) {
61	36					56	my $stacktag = pop @$tagstack;
62							}
63	38					115	return [$_, $tag];
64							} else {
65	122					332	my ($tag) = m\|^<\s([^\s>])\|;
66	122					179	$tag = lc $tag;
67							# print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n";
68	122	100	66			246	if (member($tag, @UNBALANCED_TAGS) \|\| $tag =~ m#/\s*>$#)
69							{ # (tags without correspond closer tags)
70	86					397	return [$_, $tag];
71							} else {
72	36					70	push @$tagstack, $tag;
73							}
74	36					83	return [$_, $_];
75							}
76							} else {
77	959					2240	my $result = [$_, (join "!!!", (@$tagstack, $_)) ];
78	959					1697	return $result;
79							}
80	10					49	};
81
82							# Now do the "smear tags" operation across each of the chunk-lists
83	10					17	$tagstack = [];
84	10					19	@leftchks = map { &$smear_tags } @leftchks;
	588					851
85							# TBD: better modularity would preclude having to reset the stack
86	10					49	$tagstack = [];
87	10					20	@rightchks = map { &$smear_tags } @rightchks;
	531					774
88
89							# print STDERR Data::Dumper::Dumper(\@leftchks);
90							# print STDERR Data::Dumper::Dumper(\@rightchks);
91
92							# Now do the diff, using the "comparison" half of the pair to
93							# compare two chuncks.
94							my $chunks = sdiff(\@leftchks, \@rightchks,
95	10			1141		80	sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ });
	1141					19874
	1141					2950
	1141					2609
96
97							# print STDERR Data::Dumper::Dumper($chunks);
98
99							# Finally, process the output of sdiff by concatenating
100							# consecutive chunks that were "unchanged."
101	10					5727	my $lastsignal = '';
102	10					15	my $lbuf = "";
103	10					14	my $rbuf = "";
104	10					12	my @result;
105							my $ch;
106	10					18	foreach $ch (@$chunks) {
107	594					1005	my ($signal, $left, $right) = @$ch;
108	594	100	100			1551	if ($signal ne $lastsignal && $lastsignal ne '') {
109	55	100	100			172	if ($signal ne 'u' && $lastsignal ne 'u') {
110	7					9	$signal = 'c';
111							} else {
112	48					106	push @result, [$lastsignal, $lbuf, $rbuf];
113	48					77	$lbuf = "";
114	48					62	$rbuf = "";
115							}
116							}
117							# if ($signal eq 'u' && $lastsignal ne 'u') {
118							# push @result, [$lastsignal, $lbuf, $rbuf]
119							# unless $lastsignal eq '';
120							# $lbuf = "";
121							# $rbuf = "";
122							# } elsif ($signal ne 'u' && $lastsignal eq 'u') {
123							# push @result, [$lastsignal, $lbuf, $rbuf];
124							# $lbuf = "";
125							# $rbuf = "";
126							# }
127	594					927	my $lelem = elem_mkp($left);
128	594					1033	my $relem = elem_mkp($right);
129	594	100				1060	$lbuf .= (defined $lelem ? $lelem : '');
130	594	100				1055	$rbuf .= (defined $relem ? $relem : '');
131	594					1019	$lastsignal = $signal;
132							}
133	10					26	push @result, [$lastsignal, $lbuf, $rbuf];
134	10					480	return \@result;
135							}
136
137							# these are like "accessors" for the two halves of the diff-chunk pairs
138							sub elem_mkp {
139	1188			1188	0	1309	my ($e) = @_;
140	1188	100				2190	return undef unless ref $e eq 'ARRAY';
141	1119					1481	my ($mkp, $cmp) = @$e;
142	1119					1812	return $mkp;
143							}
144
145							sub elem_cmprsn {
146	1141			1141	0	1383	my ($e) = @_;
147	1141	50				2263	return undef unless ref $e eq 'ARRAY';
148	1141					1665	my ($mkp, $cmp) = @$e;
149	1141					2092	return $cmp;
150							}
151
152							# Finally a couple of non-HTML diff routines
153
154							sub line_diff {
155	1			1	0	319	my ($left, $right) = @_;
156	1					19	my (@leftchks) = $left =~ m/(.*\n?)/gm;
157	1					14	my (@rightchks) = $right =~ m/(.*\n?)/gm;
158	1					4	my $result = sdiff(\@leftchks, \@rightchks);
159							# my @result = map { [ $_->[1], $_->[2] ] } @$result;
160	1					268	return $result;
161							}
162
163							sub word_diff {
164	0			0	0		my ($left, $right) = @_;
165	0						my (@leftchks) = $left =~ m/([^\s]*\s?)/gm;
166	0						my (@rightchks) = $right =~ m/([^\s]*\s?)/gm;
167
168	0						my $result = sdiff(\@leftchks, \@rightchks);
169	0						my @result = (map { [ $_->[1], $_->[2] ] } @$result);
	0
170	0						return $result;
171							}
172
173							1;
174
175							=head1 NAME
176
177							HTML::Diff - compare two HTML strings and return a list of differences
178
179							=head1 SYNOPSIS
180
181							use HTML::Diff;
182
183							$result = html_word_diff($left_text, $right_text);
184
185							=head1 DESCRIPTION
186
187							This module compares two strings of HTML and returns a list of a
188							chunks which indicate the diff between the two input strings, where
189							changes in formatting are considered changes.
190
191							HTML::Diff does not strictly parse the HTML. Instead, it uses regular
192							expressions to make a decent effort at understanding the given HTML.
193							As a result, there are many valid HTML documents for which it will not
194							produce the correct answer. But there may be some invalid HTML
195							documents for which it gives you the answer you're looking for. Your
196							mileage may vary; test it on lots of inputs from your domain before
197							relying on it.
198
199							Returns a reference to a list of triples [, , ].
200							Each triple represents a check of the input texts. The flag tells you
201							whether it represents a deletion, insertion, a modification, or an
202							unchanged chunk.
203
204							Every character of each input text is accounted for by some triple in
205							the output. Specifically, Concatenating all the members from
206							the return value should produce C<$left_text>, and likewise the
207							members concatenate together to produce C<$right_text>.
208
209							The is either C<'u'>, C<'+'>, C<'-'>, or C<'c'>, indicating
210							whether the two chunks are the same, the $right_text contained this
211							chunk and the left chunk didn't, or vice versa, or the two chunks are
212							simply different. This follows the usage of Algorithm::Diff.
213
214							The difference is computed on a word-by-word basis, "breaking" on
215							visible words in the HTML text. If a tag only is changed, it will not
216							be returned as an independent chunk but will be shown as a change to
217							one of the neighboring words. For balanced tags, such as , it
218							is intended that a change to the tag will be treated as a change to
219							all words in between.
220
221							=head1 SEE ALSO
222
223							L provides the diff algorithm used in this module.
224
225							L can generate a diff between two XML files, and also
226							patch an XML file, given a diff.
227
228							=head1 REPOSITORY
229
230							L
231
232							=head1 AUTHOR
233
234							Whipped up by Ezra elias kilty Cooper, Eezra@ezrakilty.netE.
235
236							Patch contributed by Adam Easjo@koldfront.dkE.
237
238							=head1 COPYRIGHT AND LICENSE
239
240							Copyright 2003-2014 by Ezra elias kilty Cooper, Eezra@ezrakilty.netE
241
242							This program is free software; you can redistribute it and/or
243							modify it under the same terms as Perl itself.
244
245							=cut