File Coverage

blib/lib/MARC/File/XML.pm
Criterion Covered Total %
statement 157 178 88.2
branch 38 66 57.5
condition 31 69 44.9
subroutine 27 30 90.0
pod 9 14 64.2
total 262 357 73.3


line stmt bran cond sub pod time code
1             package MARC::File::XML;
2              
3 7     7   32902 use warnings;
  7         15  
  7         242  
4 7     7   35 use strict;
  7         13  
  7         163  
5 7     7   42 use vars qw( $VERSION %_load_args );
  7         14  
  7         382  
6 7     7   41 use base qw( MARC::File );
  7         17  
  7         3058  
7 7     7   8993 use MARC::Record;
  7         30915  
  7         287  
8 7     7   48 use MARC::Field;
  7         53  
  7         114  
9 7     7   2656 use MARC::File::SAX;
  7         23  
  7         253  
10 7     7   45 use XML::SAX qw(Namespaces Validation);
  7         15  
  7         373  
11              
12 7     7   41 use MARC::Charset qw( marc8_to_utf8 utf8_to_marc8 );
  7         16  
  7         250  
13 7     7   3065 use IO::File;
  7         38853  
  7         766  
14 7     7   50 use Carp qw( croak );
  7         15  
  7         238  
15 7     7   33 use Encode ();
  7         15  
  7         10863  
16              
17             $VERSION = '0.93';
18              
19             my $factory = XML::SAX::ParserFactory->new();
20             $factory->require_feature(Namespaces);
21              
22             sub import {
23 5     5   52 my $class = shift;
24 5         15 %_load_args = @_;
25 5   50     39 $_load_args{ DefaultEncoding } ||= 'UTF-8';
26 5   50     1382 $_load_args{ RecordFormat } ||= 'USMARC';
27             }
28              
29             =head1 NAME
30              
31             MARC::File::XML - Work with MARC data encoded as XML
32              
33             =head1 SYNOPSIS
34              
35             ## Loading with USE options
36             use MARC::File::XML ( BinaryEncoding => 'utf8', RecordFormat => 'UNIMARC' );
37              
38             ## Setting the record format without USE options
39             MARC::File::XML->default_record_format('USMARC');
40            
41             ## reading with MARC::Batch
42             my $batch = MARC::Batch->new( 'XML', $filename );
43             my $record = $batch->next();
44              
45             ## or reading with MARC::File::XML explicitly
46             my $file = MARC::File::XML->in( $filename );
47             my $record = $file->next();
48              
49             ## serialize a single MARC::Record object as XML
50             print $record->as_xml();
51              
52             ## write a bunch of records to a file
53             my $file = MARC::File::XML->out( 'myfile.xml' );
54             $file->write( $record1 );
55             $file->write( $record2 );
56             $file->write( $record3 );
57             $file->close();
58              
59             ## instead of writing to disk, get the xml directly
60             my $xml = join( "\n",
61             MARC::File::XML::header(),
62             MARC::File::XML::record( $record1 ),
63             MARC::File::XML::record( $record2 ),
64             MARC::File::XML::footer()
65             );
66              
67             =head1 DESCRIPTION
68              
69             The MARC-XML distribution is an extension to the MARC-Record distribution for
70             working with MARC21 data that is encoded as XML. The XML encoding used is the
71             MARC21slim schema supplied by the Library of Congress. More information may
72             be obtained here: http://www.loc.gov/standards/marcxml/
73              
74             You must have MARC::Record installed to use MARC::File::XML. In fact
75             once you install the MARC-XML distribution you will most likely not use it
76             directly, but will have an additional file format available to you when you
77             use MARC::Batch.
78              
79             This version of MARC-XML supersedes an the versions ending with 0.25 which
80             were used with the MARC.pm framework. MARC-XML now uses MARC::Record
81             exclusively.
82              
83             If you have any questions or would like to contribute to this module please
84             sign on to the perl4lib list. More information about perl4lib is available
85             at L.
86              
87             =head1 METHODS
88              
89             When you use MARC::File::XML your MARC::Record objects will have two new
90             additional methods available to them:
91              
92             =head2 MARC::File::XML->default_record_format([$format])
93              
94             Sets or returns the default record format used by MARC::File::XML. Valid
95             formats are B, B, B and B.
96              
97             MARC::File::XML->default_record_format('UNIMARC');
98              
99             =cut
100              
101             sub default_record_format {
102 0     0 1 0 my $self = shift;
103 0         0 my $format = shift;
104              
105 0 0       0 $_load_args{RecordFormat} = $format if ($format);
106              
107 0         0 return $_load_args{RecordFormat};
108             }
109              
110              
111             =head2 as_xml()
112              
113             Returns a MARC::Record object serialized in XML. You can pass an optional format
114             parameter to tell MARC::File::XML what type of record (USMARC, UNIMARC, UNIMARCAUTH) you are
115             serializing.
116              
117             print $record->as_xml([$format]);
118              
119             =cut
120              
121             sub MARC::Record::as_xml {
122 3     3 0 1107191 my $record = shift;
123 3   33     22 my $format = shift || $_load_args{RecordFormat};
124 3         13 return( MARC::File::XML::encode( $record, $format ) );
125             }
126              
127             =head2 as_xml_record([$format])
128              
129             Returns a MARC::Record object serialized in XML without a collection wrapper.
130             You can pass an optional format parameter to tell MARC::File::XML what type of
131             record (USMARC, UNIMARC, UNIMARCAUTH) you are serializing.
132              
133             print $record->as_xml_record('UNIMARC');
134              
135             =cut
136              
137             sub MARC::Record::as_xml_record {
138 0     0 0 0 my $record = shift;
139 0   0     0 my $format = shift || $_load_args{RecordFormat};
140 0         0 return( MARC::File::XML::encode( $record, $format, 1 ) );
141             }
142              
143             =head2 new_from_xml([$encoding, $format])
144              
145             If you have a chunk of XML and you want a record object for it you can use
146             this method to generate a MARC::Record object. You can pass an optional
147             encoding parameter to specify which encoding (UTF-8 or MARC-8) you would like
148             the resulting record to be in. You can also pass a format parameter to specify
149             the source record type, such as UNIMARC, UNIMARCAUTH, USMARC or MARC21.
150              
151             my $record = MARC::Record->new_from_xml( $xml, $encoding, $format );
152              
153             Note: only works for single record XML chunks.
154              
155             =cut
156              
157             sub MARC::Record::new_from_xml {
158 4     4 0 342099 my $xml = shift;
159             ## to allow calling as MARC::Record::new_from_xml()
160             ## or MARC::Record->new_from_xml()
161 4 100 66     56 $xml = shift if ( ref($xml) || ($xml eq "MARC::Record") );
162              
163 4   66     21 my $enc = shift || $_load_args{BinaryEncoding};
164 4   33     24 my $format = shift || $_load_args{RecordFormat};
165 4         21 return( MARC::File::XML::decode( $xml, $enc, $format ) );
166             }
167              
168             =pod
169              
170             If you want to write records as XML to a file you can use out() with write()
171             to serialize more than one record as XML.
172              
173             =head2 out()
174              
175             A constructor for creating a MARC::File::XML object that can write XML to a
176             file. You must pass in the name of a file to write XML to. If the $encoding
177             parameter or the DefaultEncoding (see above) is set to UTF-8 then the binmode
178             of the output file will be set appropriately.
179              
180             my $file = MARC::File::XML->out( $filename [, $encoding] );
181              
182             =cut
183              
184             sub out {
185 1     1 1 787 my ( $class, $filename, $enc ) = @_;
186 1 50       11 my $fh = IO::File->new( ">$filename" ) or croak( $! );
187 1   33     214 $enc ||= $_load_args{DefaultEncoding};
188              
189 1 50       9 if ($enc =~ /^utf-?8$/oi) {
190 1         4 $fh->binmode(':utf8');
191             } else {
192 0         0 $fh->binmode(':raw');
193             }
194              
195 1         20 my %self = (
196             filename => $filename,
197             fh => $fh,
198             header => 0,
199             encoding => $enc
200             );
201 1   33     9 return( bless \%self, ref( $class ) || $class );
202             }
203              
204             =head2 write()
205              
206             Used in tandem with out() to write records to a file.
207              
208             my $file = MARC::File::XML->out( $filename );
209             $file->write( $record1 );
210             $file->write( $record2 );
211              
212             =cut
213              
214             sub write {
215 1     1 1 497 my ( $self, $record, $enc ) = @_;
216 1 50       9 if ( ! $self->{ fh } ) {
217 0         0 croak( "MARC::File::XML object not open for writing" );
218             }
219 1 50       5 if ( ! $record ) {
220 0         0 croak( "must pass write() a MARC::Record object" );
221             }
222             ## print the XML header if we haven't already
223 1 50       4 if ( ! $self->{ header } ) {
224 1   33     10 $enc ||= $self->{ encoding } || $_load_args{DefaultEncoding};
      33        
225 1         3 $self->{ fh }->print( header( $enc ) );
226 1         19 $self->{ header } = 1;
227             }
228             ## print out the record
229 1 50       4 $self->{ fh }->print( record( $record ) ) || croak( $! );
230 1         21 return( 1 );
231             }
232              
233             =head2 close()
234              
235             When writing records to disk the filehandle is automatically closed when you
236             the MARC::File::XML object goes out of scope. If you want to close it explicitly
237             use the close() method.
238              
239             =cut
240              
241             sub close {
242 7     7 1 18 my $self = shift;
243 7 100       30 if ( $self->{ fh } ) {
244 6 100       65 $self->{ fh }->print( footer() ) if $self->{ header };
245 6         22 $self->{ fh } = undef;
246 6         256 $self->{ filename } = undef;
247 6         38 $self->{ header } = undef;
248             }
249 7         183 return( 1 );
250             }
251              
252             ## makes sure that the XML file is closed off
253              
254             sub DESTROY {
255 6     6   3842 shift->close();
256             }
257              
258             =pod
259              
260             If you want to generate batches of records as XML, but don't want to write to
261             disk you'll have to use header(), record() and footer() to generate the
262             different portions.
263              
264             $xml = join( "\n",
265             MARC::File::XML::header(),
266             MARC::File::XML::record( $record1 ),
267             MARC::File::XML::record( $record2 ),
268             MARC::File::XML::record( $record3 ),
269             MARC::File::XML::footer()
270             );
271              
272             =head2 header()
273              
274             Returns a string of XML to use as the header to your XML file.
275              
276             =cut
277              
278             sub header {
279 5     5 1 1157 my $enc = shift;
280 5 50 33     35 $enc = shift if ( $enc && (ref($enc) || ($enc eq "MARC::File::XML")) );
      66        
281 5   100     19 $enc ||= 'UTF-8';
282 5         30 return( <
283            
284            
285             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
286             xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
287             xmlns="http://www.loc.gov/MARC21/slim">
288             MARC_XML_HEADER
289             }
290              
291             =head2 footer()
292              
293             Returns a string of XML to use at the end of your XML file.
294              
295             =cut
296              
297             sub footer {
298 5     5 1 17 return( "" );
299             }
300              
301             =head2 record()
302              
303             Returns a chunk of XML suitable for placement between the header and the footer.
304              
305             =cut
306              
307             sub record {
308 5     5 1 12 my $record = shift;
309 5         8 my $format = shift;
310 5         11 my $include_full_record_header = shift;
311 5         8 my $enc = shift;
312              
313 5   66     19 $format ||= $_load_args{RecordFormat};
314              
315 5         9 my $_transcode = 0;
316 5         17 my $ldr = $record->leader;
317 5         43 my $original_encoding = substr($ldr,9,1);
318              
319             # Does the record think it is already Unicode?
320 5 50 33     34 if ($original_encoding ne 'a' && lc($format) !~ /^unimarc/o) {
321             # If not, we'll make it so
322 5         13 $_transcode++;
323 5         15 substr($ldr,9,1,'a');
324 5         13 $record->leader( $ldr );
325             }
326              
327 5         68 my @xml = ();
328              
329 5 50       16 if ($include_full_record_header) {
330 0         0 push @xml, <
331            
332            
333             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
334             xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
335             xmlns="http://www.loc.gov/MARC21/slim">
336             HEADER
337              
338             } else {
339 5         10 push( @xml, "" );
340             }
341              
342 5         17 push( @xml, " " . escape( $record->leader ) . "" );
343              
344 5         23 foreach my $field ( $record->fields() ) {
345 38         164 my ($tag) = escape( $field->tag() );
346 38 100       110 if ( $field->is_control_field() ) {
347 7         49 my $data = $field->data;
348 7 50       114 push( @xml, qq( ) .
349             escape( ($_transcode ? marc8_to_utf8($data) : $data) ). qq() );
350             } else {
351 31         186 my ($i1) = escape( $field->indicator( 1 ) );
352 31         79 my ($i2) = escape( $field->indicator( 2 ) );
353 31         111 push( @xml, qq( ) );
354 31         87 foreach my $subfield ( $field->subfields() ) {
355 59         730 my ( $code, $data ) = ( escape( $$subfield[0] ), $$subfield[1] );
356 59 50       228 push( @xml, qq( ).
357             escape( ($_transcode ? marc8_to_utf8($data) : $data) ).qq() );
358             }
359 31         120 push( @xml, " " );
360             }
361             }
362 5         15 push( @xml, "\n" );
363              
364 5 50       17 if ($_transcode) {
365 5         15 substr($ldr,9,1,$original_encoding);
366 5         26 $record->leader( $ldr );
367             }
368              
369 5         128 return( join( "\n", @xml ) );
370             }
371              
372             my %ESCAPES = (
373             '&' => '&',
374             '<' => '<',
375             '>' => '>',
376             );
377             my $ESCAPE_REGEX =
378             eval 'qr/' .
379             join( '|', map { $_ = "\Q$_\E" } keys %ESCAPES ) .
380             '/;'
381             ;
382              
383             sub escape {
384 233     233 0 53874 my $string = shift;
385 233 50 33     948 return '' if ! defined $string or $string eq '';
386 233         679 $string =~ s/($ESCAPE_REGEX)/$ESCAPES{$1}/oge;
  16         59  
387 233         658 return( $string );
388             }
389              
390             sub _next {
391 25     25   34888 my $self = shift;
392 25         57 my $fh = $self->{ fh };
393              
394             ## return undef at the end of the file
395 25 50       2109403 return if eof($fh);
396              
397             ## get a chunk of xml for a record
398 25         198 local $/ = 'record>';
399 25         185 my $xml = <$fh>;
400              
401             ## do we have enough?
402 25 100       386726 $xml .= <$fh> if $xml !~ m!$!;
403             ## trim stuff before the start record element
404 25         580 $xml =~ s/.*?<(([^:]+:){0,1})record.*?>/<$1record>/s;
405              
406             ## return undef if there isn't a good chunk of xml
407 25 100       401 return if ( $xml !~ m|<(([^:]+:){0,1})record>.*|s );
408              
409             ## if we have a namespace prefix, restore the declaration
410 23 100       139 if ($xml =~ /<([^:]+:)record>/) {
411 10         87 $xml =~ s!<([^:]+):record>!<$1:record xmlns:$1="http://www.loc.gov/MARC21/slim">!;
412             }
413              
414             ## return the chunk of xml
415 23         156 return( $xml );
416             }
417              
418             =head2 decode()
419              
420             You probably don't ever want to call this method directly. If you do
421             you should pass in a chunk of XML as the argument.
422              
423             It is normally invoked by a call to next(), see L or L.
424              
425             =cut
426              
427             sub decode {
428 27     27 1 204 my $text;
429 27         62 my $location = '';
430 27         63 my $self = shift;
431              
432             ## see MARC::File::USMARC::decode for explanation of what's going on
433             ## here
434 27 100       140 if ( ref($self) =~ /^MARC::File/ ) {
435 23         83 $location = 'in record '.$self->{recnum};
436 23         47 $text = shift;
437             } else {
438 4         27 $location = 'in record 1';
439 4 50       52 $text = $self=~/MARC::File/ ? shift : $self;
440             }
441              
442 27   66     144 my $enc = shift || $_load_args{BinaryEncoding};
443 27   66     138 my $format = shift || $_load_args{RecordFormat};
444              
445 27         212 my $handler = MARC::File::SAX->new();
446             my $parser = $factory->parser(
447             Handler => $handler,
448             ProtocolEncoding => $_load_args{DefaultEncoding}
449 27         212 );
450 27         439342 $parser->{ Handler }{ toMARC8 } = decideMARC8Binary($format,$enc);
451              
452 27         193 $parser->parse_string( $text );
453              
454 26         3386 return( $handler->record() );
455             }
456              
457             sub decideMARC8Binary {
458 27     27 0 72 my $format = shift;
459 27         59 my $enc = shift;
460              
461 27 50 66     206 return 0 if (defined($format) && lc($format) =~ /^unimarc/o);
462 27 100 66     119 return 0 if (defined($enc) && lc($enc) =~ /^utf-?8/o);
463 26         95 return 1;
464             }
465              
466              
467             =head2 encode()
468              
469             You probably want to use the as_xml() method on your MARC::Record object
470             instead of calling this directly. But if you want to you just need to
471             pass in the MARC::Record object you wish to encode as XML, and you will be
472             returned the XML as a scalar.
473              
474             =cut
475              
476             sub encode {
477 3     3 1 8 my $record = shift;
478 3   33     12 my $format = shift || $_load_args{RecordFormat};
479 3         6 my $without_collection_header = shift;
480 3   33     15 my $enc = shift || $_load_args{DefaultEncoding};
481              
482 3 50       13 if (lc($format) =~ /^unimarc/o) {
483 0         0 $enc = _unimarc_encoding( $format => $record );
484             }
485              
486 3         8 my @xml = ();
487 3 50       15 push( @xml, header( $enc ) ) unless ($without_collection_header);
488             # verbose, but naming the header output flags this way to avoid
489             # the potential confusion identified in CPAN bug #34082
490             # http://rt.cpan.org/Public/Bug/Display.html?id=34082
491 3 50       11 my $include_full_record_header = ($without_collection_header) ? 1 : 0;
492 3         22 push( @xml, record( $record, $format, $include_full_record_header, $enc ) );
493 3 50       16 push( @xml, footer() ) unless ($without_collection_header);
494              
495 3         31 return( join( "\n", @xml ) );
496             }
497              
498             sub _unimarc_encoding {
499 0     0     my $f = shift;
500 0           my $r = shift;
501              
502 0           my $pos = 26;
503 0 0         $pos = 13 if (lc($f) eq 'unimarcauth');
504              
505 0           my $enc = substr( $r->subfield(100 => 'a'), $pos, 2 );
506              
507 0 0 0       if ($enc eq '01' || $enc eq '03') {
    0          
508 0           return 'ISO-8859-1';
509             } elsif ($enc eq '50') {
510 0           return 'UTF-8';
511             } else {
512 0           die "Unsupported UNIMARC character encoding [$enc] for XML output for $f; 100\$a -> " . $r->subfield(100 => 'a');
513             }
514             }
515              
516             =head1 TODO
517              
518             =over 4
519              
520             =item * Support for callback filters in decode().
521              
522             =back
523              
524             =head1 SEE ALSO
525              
526             =over 4
527              
528             =item L
529              
530             =item L
531              
532             =item L
533              
534             =item L
535              
536             =back
537              
538             =head1 AUTHORS
539              
540             =over 4
541              
542             =item * Ed Summers
543              
544             =back
545              
546             =cut
547              
548             1;