File Coverage

blib/lib/XML/SemanticDiff.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package XML::SemanticDiff;
2              
3 15     15   261012 use strict;
  15         39  
  15         565  
4 15     15   79 use warnings;
  15         32  
  15         493  
5              
6 15     15   90 use vars qw/$VERSION/;
  15         29  
  15         909  
7              
8             $VERSION = '1.0004';
9              
10 15     15   26475 use XML::Parser;
  0            
  0            
11              
12             =head1 NAME
13              
14             XML::SemanticDiff - Perl extension for comparing XML documents.
15              
16             =cut
17              
18             sub new {
19             my ($proto, %args) = @_;
20             my $class = ref($proto) || $proto;
21             my $self = \%args;
22              
23             require XML::SemanticDiff::BasicHandler unless defined $args{diffhandler};
24              
25             bless ($self, $class);
26             return $self;
27             }
28              
29             sub _is_file
30             {
31             my ($self, $specifier) = @_;
32             return $specifier !~ /\n/g && -f $specifier;
33             }
34              
35             sub _get_pathfinder_obj {
36             my $self = shift;
37              
38             return XML::SemanticDiff::PathFinder::Obj->new();
39             }
40              
41             sub read_xml {
42             my $self = shift;
43              
44             my ($xml_specifier) = @_;
45              
46             if (ref($xml_specifier) eq 'HASH')
47             {
48             return $xml_specifier;
49             }
50             else
51             {
52             $self->{path_finder_obj} = $self->_get_pathfinder_obj();
53              
54             my $p = XML::Parser->new(
55             Style => 'Stream',
56             Pkg => 'XML::SemanticDiff::PathFinder',
57             'Non-Expat-Options' => $self,
58             Namespaces => 1
59             );
60              
61             my $ret =
62             $self->_is_file($xml_specifier)
63             ? $p->parsefile($xml_specifier)
64             : $p->parse($xml_specifier)
65             ;
66            
67             $self->{path_finder_obj} = undef;
68              
69             return $ret;
70             }
71             }
72              
73             sub _same_namespace
74             {
75             my ($self, $to, $from) = @_;
76            
77             my $t_e = exists($to->{NamespaceURI});
78             my $f_e = exists($from->{NamespaceURI});
79             if (!$t_e && !$f_e)
80             {
81             return 1;
82             }
83             elsif ($t_e && $f_e)
84             {
85             return ($to->{NamespaceURI} eq $from->{NamespaceURI});
86             }
87             else
88             {
89             return 0;
90             }
91             }
92              
93             sub _match_xpath {
94             my $self = shift;
95             my ($xpath, $flat_name) = @_;
96             my @x_way = split /\//, $xpath;
97             my @f_way = split /\//, $flat_name;
98             for my $i (0..$#x_way) {
99             $x_way[$i]=~s/.*?://g;
100             }
101             for my $i (0..$#f_way) {
102             $f_way[$i]=~s/\[.*?\]$//g;
103             }
104             return 0 if $#x_way > $#f_way;
105             for my $i (0..$#x_way) {
106             if ($x_way[$i] ne $f_way[$i]) {
107             return 0;
108             }
109             }
110             return 1;
111             }
112              
113             # Okay, it's pretty basic...
114             #
115             # We flatten each doc tree to a Perl hash where the keys are "fully qualified"
116             # XPath expressions (/root[1]/element[3]) that represent the unique location
117             # of each XML element, then compare the two hashes.
118             #
119             # Just loop over all the elements of the first hash- if the same key exists
120             # in the second, you compare the text and attributes and delete it. Any
121             # keys not found in the second hash are declared 'missing', and any keys leftover
122             # in the second hash after looping through the elements in the first are 'rogues'.
123              
124             sub compare {
125             my $self = shift;
126             my ($from_xml, $to_xml) = @_;
127              
128             my $from_doc = $self->read_xml($from_xml);
129             my $to_doc = $self->read_xml($to_xml);
130              
131             my @warnings = ();
132              
133             my $handler = $self->{diffhandler} || XML::SemanticDiff::BasicHandler->new(%$self);
134              
135             # drop away nodes matching xpaths to be ignored
136             if (defined $self->{ignorexpath}) {
137             my $ignore = $self->{ignorexpath};
138             for my $path (@$ignore) {
139             for my $ref ($from_doc, $to_doc) {
140             for my $key (keys %$ref) {
141             if ($self->_match_xpath($path, $key)) {
142             delete $ref->{$key};
143             }
144             }
145             }
146             }
147             }
148              
149             # fire the init handler
150             push (@warnings, $handler->init($self)) if $handler->can('init');
151              
152             # loop the elements
153             foreach my $element (sort keys (%$from_doc)) {
154              
155             # element existence check
156             if (defined $to_doc->{$element}) {
157              
158             # element value test
159             unless ($from_doc->{$element}->{TextChecksum} eq $to_doc->{$element}->{TextChecksum}) {
160             push (@warnings, $handler->element_value($element,
161             $to_doc->{$element},
162             $from_doc->{$element}))
163             if $handler->can('element_value');
164             }
165            
166             # namespace test
167             unless ($self->_same_namespace($from_doc->{$element},$to_doc->{$element})) {
168             push (@warnings, $handler->namespace_uri($element,
169             $to_doc->{$element},
170             $from_doc->{$element}))
171             if $handler->can('namespace_uri');
172             }
173            
174             # attribute tests
175             foreach my $attr (keys(%{$from_doc->{$element}->{Attributes}})) {
176            
177             # attr existence check
178             if (defined ($to_doc->{$element}->{Attributes}->{$attr})) {
179              
180             # attr value test
181             if ($to_doc->{$element}->{Attributes}->{$attr} ne $from_doc->{$element}->{Attributes}->{$attr}){
182             push (@warnings, $handler->attribute_value($attr,
183             $element,
184             $to_doc->{$element},
185             $from_doc->{$element}))
186             if $handler->can('attribute_value');
187             }
188             delete $to_doc->{$element}->{Attributes}->{$attr};
189             }
190             else {
191             push (@warnings, $handler->missing_attribute($attr,
192             $element,
193             $to_doc->{$element},
194             $from_doc->{$element}))
195             if $handler->can('missing_attribute');
196             }
197             }
198              
199             # rogue attrs
200             foreach my $leftover (keys(%{$to_doc->{$element}->{Attributes}})) {
201             push (@warnings, $handler->rogue_attribute($leftover,
202             $element,
203             $to_doc->{$element},
204             $from_doc->{$element}))
205             if $handler->can('rogue_attribute');
206             }
207            
208             delete $to_doc->{$element};
209             }
210             else {
211             push (@warnings, $handler->missing_element($element, $from_doc->{$element}))
212             if $handler->can('missing_element');
213             }
214             }
215              
216             # rogue elements
217             foreach my $leftover ( keys (%$to_doc) ) {
218             push (@warnings, $handler->rogue_element($leftover, $to_doc->{$leftover}))
219             if $handler->can('rogue_element');
220             }
221              
222             push (@warnings, $handler->final($self)) if $handler->can('final');
223            
224             return @warnings;
225             }
226              
227             1;
228              
229             package XML::SemanticDiff::PathFinder;
230              
231             foreach my $func (qw(StartTag EndTag Text StartDocument EndDocument PI))
232             {
233             no strict 'refs';
234             *{__PACKAGE__.'::'.$func} = sub {
235             my $expat = shift;
236             return $expat->{'Non-Expat-Options'}->{path_finder_obj}->$func(
237             $expat, @_
238             );
239             };
240             }
241              
242             package XML::SemanticDiff::PathFinder::Obj;
243              
244             use strict;
245              
246             use Digest::MD5 qw(md5_base64);
247              
248             use Encode qw(encode_utf8);
249              
250             foreach my $accessor (qw(descendents char_accumulator doc
251             opts xml_context PI_position_index))
252             {
253             no strict 'refs';
254             *{__PACKAGE__.'::'.$accessor} = sub {
255             my $self = shift;
256              
257             if (@_)
258             {
259             $self->{$accessor} = shift;
260             }
261             return $self->{$accessor};
262             };
263             }
264              
265             # PI_position_index is the position index for the PI's below - the processing
266             # instructions.
267              
268             sub new {
269             my $class = shift;
270              
271             my $self = {};
272             bless $self, $class;
273              
274             $self->_init(@_);
275              
276             return $self;
277             }
278              
279             sub _init {
280             return 0;
281             }
282              
283             sub StartTag {
284             my ($self, $expat, $element) = @_;
285              
286              
287             my %attrs = %_;
288            
289             my @context = $expat->context;
290             my $context_length = scalar (@context);
291             my $parent = $context[$context_length -1];
292             push (@{$self->descendents()->{$parent}}, $element) if $parent;
293              
294             my $last_ctx_elem = $self->xml_context()->[-1] || { position_index => {}};
295              
296             push @{$self->xml_context()},
297             {
298             element => "$element",
299             'index' => ++$last_ctx_elem->{position_index}->{"$element"},
300             position_index => {},
301             };
302              
303             my $test_context;
304            
305             # if (@context){
306             # $test_context = '/' . join ('/', map { $_ . '[' . $position_index->{$_} . ']' } @context);
307             # }
308            
309             # $test_context .= '/' . $element . '[' . $position_index->{$element} . ']';
310              
311             $test_context = $self->_calc_test_context();
312              
313             $self->doc()->{$test_context} =
314             {
315             NamespaceURI => ($expat->namespace($element) || ""),
316             Attributes => \%attrs,
317             ($self->opts()->{keeplinenums}
318             ? ( TagStart => $expat->current_line)
319             : ()
320             ),
321             };
322             }
323              
324             sub _calc_test_context
325             {
326             my $self = shift;
327              
328             return
329             join("",
330             map { "/". $_->{'element'} . "[" . $_->{'index'} . "]" }
331             @{$self->xml_context()}
332             );
333             }
334              
335             sub EndTag {
336             my ($self, $expat, $element) = @_;
337            
338             my @context = $expat->context;
339              
340             # if (@context){
341             # $test_context = '/' . join ('/', map { $_ . '[' . $position_index->{$_} . ']' } @context);
342             #}
343             # $test_context .= '/' . $element . '[' . $position_index->{$element} . ']';
344              
345             my $test_context = $self->_calc_test_context();
346              
347             my $text;
348             if ( defined( $self->char_accumulator()->{$element} )) {
349             $text = $self->char_accumulator()->{$element};
350             delete $self->char_accumulator()->{$element};
351             }
352             $text ||= 'o';
353            
354             # warn "text is '$text' \n";
355             # my $ctx = Digest::MD5->new;
356             # $ctx->add("$text");
357             # $self->doc()->{"$test_context"}->{TextChecksum} = $ctx->b64digest;
358              
359             $self->doc()->{"$test_context"}->{TextChecksum} = md5_base64(encode_utf8("$text"));
360             if ($self->opts()->{keepdata}) {
361             $self->doc()->{"$test_context"}->{CData} = $text;
362             }
363            
364            
365             if (defined ( $self->descendents()->{$element})) {
366             my $seen = {};
367             foreach my $child (@{$self->descendents()->{$element}}) {
368             next if $seen->{$child};
369             $seen->{$child}++;
370             }
371             }
372            
373             $self->doc()->{"$test_context"}->{TagEnd} = $expat->current_line if $self->opts()->{keeplinenums};
374              
375             pop(@{$self->xml_context()});
376             }
377              
378             sub Text {
379             my $self = shift;
380             my $expat = shift;
381            
382             my $element = $expat->current_element;
383             my $char = $_;
384            
385             $char =~ s/^\s*//;
386             $char =~ s/\s*$//;
387             $char =~ s/\s+/ /g;
388             $self->char_accumulator()->{$element} .= $char if $char;
389            
390             }
391            
392             sub StartDocument {
393             my $self = shift;
394             my $expat = shift;
395             $self->doc({});
396             $self->descendents({});
397             $self->char_accumulator({});
398             $self->opts($expat->{'Non-Expat-Options'});
399             $self->xml_context([]);
400             $self->PI_position_index({});
401             }
402            
403             sub EndDocument {
404             my $self = shift;
405              
406             return $self->doc();
407             }
408              
409              
410             sub PI {
411             my ($self, $expat, $target, $data) = @_;
412             my $attrs = {};
413             $self->PI_position_index()->{$target}++;
414              
415             foreach my $pair (split /\s+/, $data) {
416             $attrs->{$1} = $2 if $pair =~ /^(.+?)=["'](.+?)["']$/;
417             }
418              
419             my $slug = '?' . $target . '[' . $self->PI_position_index()->{$target} . ']';
420              
421             $self->doc()->{$slug} =
422             {
423             Attributes => ($attrs || {}),
424             TextChecksum => "1",
425             NamespaceURI => "",
426             ( $self->opts()->{keeplinenums}
427             ? (
428             TagStart => $expat->current_line(),
429             TagEnd => $expat->current_line(),
430             )
431             : ()
432             ),
433             };
434             }
435              
436             1;
437              
438              
439             =head1 SYNOPSIS
440              
441             use XML::SemanticDiff;
442             my $diff = XML::SemanticDiff->new();
443              
444             foreach my $change ($diff->compare($file, $file2)) {
445             print "$change->{message} in context $change->{context}\n";
446             }
447              
448             # or, if you want line numbers:
449              
450             my $diff = XML::SemanticDiff->new(keeplinenums => 1);
451              
452             foreach my $change ($diff->compare($file, $file2)) {
453             print "$change->{message} (between lines $change->{startline} and $change->{endline})\n";
454             }
455              
456             =head1 DESCRIPTION
457              
458             XML::SematicDiff provides a way to compare the contents and structure of two XML documents. By default, it returns a list of
459             hashrefs where each hashref describes a single difference between the two docs.
460              
461             =head1 METHODS
462              
463             =head2 $obj->new([%options])
464              
465             Ye olde object constructor.
466              
467             The new() method recognizes the following options:
468              
469             =over 4
470              
471             =item * keeplinenums
472              
473             When this option is enabled XML::SemanticDiff will add the 'startline' and 'endline' properties (containing the line numbers
474             for the reported element's start tag and end tag) to each warning. For attribute events these numbers reflect the start and
475             end tags of the element which contains that attribute.
476              
477             =item * keepdata
478              
479             When this option is enabled XML::SemanticDiff will add the 'old_value' and 'new_value' properties to each warning. These
480             properties contain, surprisingly, the old and new values for the element or attribute being reported.
481              
482             In the case of missing elements or attributes (those in the first document, not in the second) only the 'old_value' property
483             will be defined. Similarly, in the case of rogue elements or attributes (those in the second document but not in the
484             first) only the 'new_value' property will be defined.
485              
486             Note that using this option will greatly increase the amount of memory used by your application.
487              
488             =item * diffhandler
489              
490             Taking a blessed object as it's sole argument, this option provides a way to hook the basic semantic diff engine into your own
491             custom handler class.
492              
493             Please see the section on 'CUSTOM HANDLERS' below.
494              
495             =item * ignorexpath
496              
497             This option takes array of strings as argument. Strings are interpreted as simple xpath expressions. Nodes matching these expressions are ignored during comparison. All xpath expressions should be absolute (start with '/').
498              
499             Current implementation ignores namespaces during comparison.
500              
501             =back
502              
503             =head2 @results = $differ->compare($xml1, $xml2)
504              
505             Compares the XMLs $xml1 and $xml2 . $xml1 and $xml2 can be:
506              
507             =over 4
508              
509             =item * filenames
510              
511             This will be considered if it is a string that does not contain newlines and
512             exists in the filesystem.
513              
514             =item * the XML text itself.
515              
516             This will be considered if it's any kind of string.
517              
518             =item * the results of read_xml(). (see below)
519              
520             This will be considered if it's a hash reference.
521              
522             =back
523              
524             =head2 my $doc = read_xml($xml_location)
525              
526             This will read the XML, process it for comparison and return it. See compare()
527             for how it is determined.
528              
529             =head1 CUSTOM HANDLERS
530              
531             Internally, XML::SemanticDiff uses an event-based model somewhat reminiscent of SAX where the various 'semantic diff events'
532             are handed off to a separate handler class to cope with the details. For most general cases where the user only cares about
533             reporting the differences between two docs, the default handler, XML::SemanticDiff::BasicHandler, will probably
534             suffice. However, it is often desirable to add side-effects to the diff process (updating datastores, widget callbacks,
535             etc.) and a custom handler allows you to be creative with what to do about differences between two XML documents and how
536             those differences are reported back to the application through the compare() method.
537              
538             =head1 HANDLER METHODS
539              
540             The following is a list of handler methods that can be used for your custom diff-handler class.
541              
542             =head2 init($self, $diff_obj)
543              
544             The C method is called immediately before the the two document HASHes are compared. The blessed XML::SemanticDiff object
545             is passed as the sole argument, so any values that you wish to pass from your application to your custom handler can safely
546             be added to the call to XML::SemanticDiff's constructor method.
547              
548             =head2 rogue_element($self, $element_name, $todoc_element_properties)
549              
550             The C method handles those cases where a given element exists in the to-file but not in the from-file.
551              
552             =head2 missing_element($self, $element_name, $fromdoc_element_properties)
553              
554             The C method handles those cases where a given element exists in the from-file but not in the to-file.
555              
556             =head2 element_value($self, $element, $to_element_properties, $fromdoc_element_properties)
557              
558             The C method handles those cases where the text data differs between two elements that have the same name,
559             namespace URI, and are at the same location in the document tree. Note that all whitespace is normalized and the text from
560             mixed-content elements (those containing both text and child elements mixed together) is aggregated down to a single value.
561              
562             =head2 namespace_uri($self, $element, $todoc_element_properties, $fromdoc_element_properties)
563              
564             The C method handles case where the XML namespace URI differs between a given element in the two
565             documents. Note that the namespace URI is checked, not the element prefixes since and
566             are all considered equivalent as long as they are bound to the same namespace URI.
567            
568              
569             =head2 rogue_attribute($self, $attr_name, $element, $todoc_element_properties)
570              
571             The C method handles those cases where an attribute exists in a given element the to-file but not in the
572             from-file.
573              
574             =head2 missing_attribute($self, $attr_name, $element, $todoc_element_properties, $fromdoc_element_properties)
575              
576             The C method handles those cases where an attribute exists in a given element exists in the from-file but
577             not in the to-file.
578              
579             =head2 attribute_value($self, $attr_name, $element, $todoc_element_properties, $fromdoc_element_properties)
580              
581             The C method handles those cases where the value of an attribute varies between the same element in both
582             documents.
583              
584             =head2 final($self, $diff_obj)
585              
586             The C method is called immediately after the two document HASHes are compared. Like the C handler, it is passed a
587             copy of the XML::SemanticDiff object as it's sole argument.
588              
589             Note that if a given method is not implemented in your custom handler class, XML::SemanticDiff will not complain; but it means
590             that all of those events will be silently ignored. Consider yourself warned.
591              
592             =head1 AUTHOR
593              
594             Originally by Kip Hampton, khampton@totalcinema.com .
595              
596             Further Maintained by Shlomi Fish, L .
597              
598             =head1 COPYRIGHT
599              
600             Copyright (c) 2000 Kip Hampton. All rights reserved. This program is
601             free software; you can redistribute it and/or modify it under the same terms
602             as Perl itself.
603              
604             Shlomi Fish hereby disclaims any implicit or explicit copyrights on this
605             software.
606              
607             =head1 LICENSE
608              
609             This program is free software; you can redistribute it and/or modify it under
610             the same terms as Perl itself.
611              
612             =head1 SEE ALSO
613              
614             perl(1).
615              
616             =cut