| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Net::OAI::Harvester; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 16 |  |  | 16 |  | 80606 | use strict; | 
|  | 16 |  |  |  |  | 731 |  | 
|  | 16 |  |  |  |  | 668 |  | 
| 4 | 14 |  |  | 14 |  | 80 | use warnings; | 
|  | 14 |  |  |  |  | 32 |  | 
|  | 14 |  |  |  |  | 473 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 14 |  |  | 14 |  | 35489 | use URI; | 
|  | 14 |  |  |  |  | 97317 |  | 
|  | 14 |  |  |  |  | 631 |  | 
| 7 | 14 |  |  | 14 |  | 17969 | use LWP::UserAgent; | 
|  | 14 |  |  |  |  | 1914621 |  | 
|  | 14 |  |  |  |  | 561 |  | 
| 8 | 14 |  |  | 14 |  | 14272 | use XML::SAX qw( Namespaces Validation ); | 
|  | 14 |  |  |  |  | 101093 |  | 
|  | 14 |  |  |  |  | 987 |  | 
| 9 | 14 |  |  | 14 |  | 18828 | use File::Temp qw( tempfile ); | 
|  | 14 |  |  |  |  | 307495 |  | 
|  | 14 |  |  |  |  | 1163 |  | 
| 10 | 14 |  |  | 14 |  | 142 | use Carp qw( croak ); | 
|  | 14 |  |  |  |  | 31 |  | 
|  | 14 |  |  |  |  | 665 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 | 14 |  |  | 14 |  | 15418 | use Net::OAI::Error; | 
|  | 14 |  |  |  |  | 57 |  | 
|  | 14 |  |  |  |  | 926 |  | 
| 13 | 14 |  |  | 14 |  | 10561 | use Net::OAI::ResumptionToken; | 
|  | 14 |  |  |  |  | 37 |  | 
|  | 14 |  |  |  |  | 347 |  | 
| 14 | 14 |  |  | 14 |  | 9833 | use Net::OAI::Identify; | 
|  | 14 |  |  |  |  | 48 |  | 
|  | 14 |  |  |  |  | 694 |  | 
| 15 | 14 |  |  | 14 |  | 8799 | use Net::OAI::ListMetadataFormats; | 
|  | 14 |  |  |  |  | 57 |  | 
|  | 14 |  |  |  |  | 424 |  | 
| 16 | 14 |  |  | 14 |  | 8558 | use Net::OAI::ListIdentifiers; | 
|  | 14 |  |  |  |  | 41 |  | 
|  | 14 |  |  |  |  | 504 |  | 
| 17 | 14 |  |  | 14 |  | 8798 | use Net::OAI::ListRecords; | 
|  | 14 |  |  |  |  | 45 |  | 
|  | 14 |  |  |  |  | 407 |  | 
| 18 | 14 |  |  | 14 |  | 8042 | use Net::OAI::GetRecord; | 
|  | 14 |  |  |  |  | 77 |  | 
|  | 14 |  |  |  |  | 433 |  | 
| 19 | 14 |  |  | 14 |  | 92 | use Net::OAI::ListRecords; | 
|  | 14 |  |  |  |  | 27 |  | 
|  | 14 |  |  |  |  | 270 |  | 
| 20 | 14 |  |  | 14 |  | 8226 | use Net::OAI::ListSets; | 
|  | 14 |  |  |  |  | 37 |  | 
|  | 14 |  |  |  |  | 383 |  | 
| 21 | 14 |  |  | 14 |  | 90 | use Net::OAI::Record::Header; | 
|  | 14 |  |  |  |  | 30 |  | 
|  | 14 |  |  |  |  | 365 |  | 
| 22 | 14 |  |  | 14 |  | 8928 | use Net::OAI::Record::OAI_DC; | 
|  | 14 |  |  |  |  | 459 |  | 
|  | 14 |  |  |  |  | 42652 |  | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | our $VERSION = '1.15'; | 
| 25 |  |  |  |  |  |  | our $DEBUG = 0; | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | =head1 NAME | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | Net::OAI::Harvester - A package for harvesting metadata using OAI-PMH | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | ## create a harvester for the Library of Congress | 
| 34 |  |  |  |  |  |  | my $harvester = Net::OAI::Harvester->new( | 
| 35 |  |  |  |  |  |  | 'baseURL' => 'http://memory.loc.gov/cgi-bin/oai2_0' | 
| 36 |  |  |  |  |  |  | ); | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | ## list all the records in a repository | 
| 39 |  |  |  |  |  |  | my $records = $harvester->listRecords( | 
| 40 |  |  |  |  |  |  | 'metadataPrefix'    => 'oai_dc' | 
| 41 |  |  |  |  |  |  | ); | 
| 42 |  |  |  |  |  |  | while ( my $record = $records->next() ) { | 
| 43 |  |  |  |  |  |  | my $header = $record->header(); | 
| 44 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 45 |  |  |  |  |  |  | print "identifier: ", $header->identifier(), "\n"; | 
| 46 |  |  |  |  |  |  | print "title: ", $metadata->title(), "\n"; | 
| 47 |  |  |  |  |  |  | } | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | ## find out the name for a repository | 
| 50 |  |  |  |  |  |  | my $identity = $harvester->identify(); | 
| 51 |  |  |  |  |  |  | print "name: ",$identity->repositoryName(),"\n"; | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | ## get a list of identifiers | 
| 54 |  |  |  |  |  |  | my $identifiers = $harvester->listIdentifiers( | 
| 55 |  |  |  |  |  |  | 'metadataPrefix'    => 'oai_dc' | 
| 56 |  |  |  |  |  |  | ); | 
| 57 |  |  |  |  |  |  | while ( my $header = $identifiers->next() ) { | 
| 58 |  |  |  |  |  |  | print "identifier: ",$header->identifier(), "\n"; | 
| 59 |  |  |  |  |  |  | } | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | ## list all the records in a repository | 
| 62 |  |  |  |  |  |  | my $records = $harvester->listRecords( | 
| 63 |  |  |  |  |  |  | 'metadataPrefix'    => 'oai_dc' | 
| 64 |  |  |  |  |  |  | ); | 
| 65 |  |  |  |  |  |  | while ( my $record = $records->next() ) { | 
| 66 |  |  |  |  |  |  | my $header = $record->header(); | 
| 67 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 68 |  |  |  |  |  |  | print "identifier: ", $header->identifier(), "\n"; | 
| 69 |  |  |  |  |  |  | print "title: ", $metadata->title(), "\n"; | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | ## GetRecord, ListSets, ListMetadataFormats also supported | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | Net::OAI::Harvester is a Perl extension for easily querying OAI-PMH | 
| 77 |  |  |  |  |  |  | repositories. OAI-PMH is the Open Archives Initiative Protocol for Metadata | 
| 78 |  |  |  |  |  |  | Harvesting.  OAI-PMH allows data repositories to share metadata about their | 
| 79 |  |  |  |  |  |  | digital assets.  Net::OAI::Harvester is a OAI-PMH client, so it does for | 
| 80 |  |  |  |  |  |  | OAI-PMH what LWP::UserAgent does for HTTP. | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | You create a Net::OAI::Harvester object which you can then use to | 
| 83 |  |  |  |  |  |  | retrieve metadata from a selected repository. Net::OAI::Harvester tries to keep | 
| 84 |  |  |  |  |  |  | things simple by providing an API to get at the data you want; but it also has | 
| 85 |  |  |  |  |  |  | a framework which is easy to extend should you need to get more fancy. | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | The guiding principle behind OAI-PMH is to allow metadata about online | 
| 88 |  |  |  |  |  |  | resources to be shared by data providers, so that the metadata can be harvested | 
| 89 |  |  |  |  |  |  | by interested parties. The protocol is essentially XML over HTTP (much like | 
| 90 |  |  |  |  |  |  | XMLRPC or SOAP). Net::OAI::Harvester does XML parsing for you | 
| 91 |  |  |  |  |  |  | (using XML::SAX internally), but you can get at the raw XML if you want to do | 
| 92 |  |  |  |  |  |  | your own XML processing, and you can drop in your own XML::SAX handler if you | 
| 93 |  |  |  |  |  |  | would like to do your own parsing of metadata elements. | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | A OAI-PMH repository supports 6 verbs: GetRecord, Identify, ListIdentifiers, | 
| 96 |  |  |  |  |  |  | ListMetadataFormats, ListRecords, and ListSets. The verbs translate directly | 
| 97 |  |  |  |  |  |  | into methods that you can call on a Net::OAI::Harvester object. More details | 
| 98 |  |  |  |  |  |  | about these methods are supplied below, however for the real story please | 
| 99 |  |  |  |  |  |  | consult the spec at http://www.openarchives.org. | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | Net::OAI::Harvester has a few features that are worth mentioning: | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | =over 4 | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | =item 1 | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | Since the OAI-PMH results can be arbitrarily large, a stream based (XML::SAX) | 
| 108 |  |  |  |  |  |  | parser is used. As the document is parsed corresponding Perl objects are | 
| 109 |  |  |  |  |  |  | created (records, headers, etc), which are then serialized on disk (using | 
| 110 |  |  |  |  |  |  | Storable if you are curious). The serialized objects on disk can then be | 
| 111 |  |  |  |  |  |  | iterated over one at a time. The benefit of this is a lower memory footprint | 
| 112 |  |  |  |  |  |  | when (for example) a ListRecords verb is exercised on a repository that | 
| 113 |  |  |  |  |  |  | returns 100,000 records. | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | =item 2 | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | XML::SAX filters are used which will allow interested developers to write | 
| 118 |  |  |  |  |  |  | their own metadata parsing packages, and drop them into place. This is useful | 
| 119 |  |  |  |  |  |  | because OAI-PMH is itself metadata schema agnostic, so you can use OAI-PMH | 
| 120 |  |  |  |  |  |  | to distribute all kinds of metadata (Dublin Core, MARC, EAD, or your favorite | 
| 121 |  |  |  |  |  |  | metadata schema). OAI-PMH does require that a repository at least provides | 
| 122 |  |  |  |  |  |  | Dublin Core metadata as a baseline. Net::OAI::Harvester has built in support for | 
| 123 |  |  |  |  |  |  | unqualified Dublin Core, and has a framework for dropping in your own parser | 
| 124 |  |  |  |  |  |  | for other kinds of metadata. If you create a XML::Handler that you would like | 
| 125 |  |  |  |  |  |  | to contribute back into the Net::OAI::Harvester project please get in touch! | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | =back | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | =head1 METHODS | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | All the Net::OAI::Harvester methods return other objects. As you would expect | 
| 132 |  |  |  |  |  |  | new() returns an Net::OAI::Harvester object; similarly getRecord() returns an | 
| 133 |  |  |  |  |  |  | Net::OAI::Record object, listIdentifiers() returns a Net::OAI::ListIdentifiers | 
| 134 |  |  |  |  |  |  | object, identify() returns an Net::OAI::Identify object, and so on. So when | 
| 135 |  |  |  |  |  |  | you use one of these methods you'll probably want to check out the docs for | 
| 136 |  |  |  |  |  |  | the object that gets returned so you can see what to do with it. Many | 
| 137 |  |  |  |  |  |  | of these classes inherit from Net::OAI::Base which provides some base | 
| 138 |  |  |  |  |  |  | functionality for retrieving errors, getting the raw XML, and the | 
| 139 |  |  |  |  |  |  | temporary file where the XML is stored (see Net::OAI::Base documentation for | 
| 140 |  |  |  |  |  |  | more details). | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | =head2 new() | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | The constructor which returns an Net::OAI::Harvester object. You must supply the | 
| 145 |  |  |  |  |  |  | baseURL parameter, to tell Net::OAI::Harvester what data repository you are | 
| 146 |  |  |  |  |  |  | going to be harvesting. For a list of data providers check out the directory | 
| 147 |  |  |  |  |  |  | available on the Open Archives Initiative homepage. | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | my $harvester = Net::OAI::Harvester->new( | 
| 150 |  |  |  |  |  |  | baseURL => 'http://memory.loc.gov/cgi-bin/oai2_0' | 
| 151 |  |  |  |  |  |  | ); | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | If you want to pull down all the XML files and keep them in a directory, rather | 
| 154 |  |  |  |  |  |  | than having the stored as transient temp files pass in the dumpDir parameter. | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | my $harvester = Net::OAI::Harvester->new( | 
| 157 |  |  |  |  |  |  | baseUrl => 'http://memory.loc.gov/cgi-bin/oai2_0', | 
| 158 |  |  |  |  |  |  | dumpDir => 'american-memory' | 
| 159 |  |  |  |  |  |  | ); | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | Also if you would like to fine tune the HTTP client used by Net::OAI::Harvester | 
| 162 |  |  |  |  |  |  | you can pass in a configured object. For example this can be handy if you | 
| 163 |  |  |  |  |  |  | want to adjust the client timeout: | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | my $ua = LWP::UserAgent->new(); | 
| 166 |  |  |  |  |  |  | $ua->timeout(20); ## set timeout to 20 seconds | 
| 167 |  |  |  |  |  |  | my $harvester = Net::OAI::Harvester->new( | 
| 168 |  |  |  |  |  |  | baseURL     => 'http://memory.loc.gov/cgi-bin/oai2_0', | 
| 169 |  |  |  |  |  |  | userAgent   => $ua | 
| 170 |  |  |  |  |  |  | ); | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | =cut | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | sub new { | 
| 175 | 17 |  |  | 17 | 1 | 17125 | my ( $class, %opts ) = @_; | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | ## uppercase options | 
| 178 | 17 |  |  |  |  | 67 | my %normalOpts = map { ( uc($_), $opts{$_} ) } keys( %opts ); | 
|  | 19 |  |  |  |  | 129 |  | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | ## we must be told a baseURL | 
| 181 | 17 | 50 |  |  |  | 101 | croak( "new() needs the baseUrl parameter" ) if !$normalOpts{ BASEURL }; | 
| 182 | 17 |  |  |  |  | 156 | my $baseURL = URI->new( $normalOpts{ BASEURL } ); | 
| 183 |  |  |  |  |  |  |  | 
| 184 | 17 |  | 33 |  |  | 5082610 | my $self = bless( { baseURL => $baseURL }, ref( $class ) || $class ); | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | ## set the user agent | 
| 187 | 17 | 100 |  |  |  | 99 | if ( $normalOpts{ USERAGENT } ) { | 
| 188 | 1 |  |  |  |  | 6 | $self->userAgent( $normalOpts{ USERAGENT } ); | 
| 189 |  |  |  |  |  |  | } else { | 
| 190 | 16 |  |  |  |  | 253 | my $ua = LWP::UserAgent->new(); | 
| 191 | 16 |  |  |  |  | 47969 | $ua->agent( $class ); | 
| 192 | 16 |  |  |  |  | 1109 | $self->userAgent( $ua ); | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | # set up some stuff if we are dumping xml to a directory | 
| 196 | 17 | 100 |  |  |  | 114 | if ($normalOpts{ DUMPDIR }) { | 
| 197 | 1 |  |  |  |  | 2 | my $dir = $normalOpts{ DUMPDIR }; | 
| 198 | 1 | 50 |  |  |  | 18 | croak "no such directory '$dir'" unless -d $dir; | 
| 199 | 1 |  |  |  |  | 3 | $self->{ dumpDir } = $dir; | 
| 200 | 1 |  |  |  |  | 3 | $self->{ lastDump } = 0; | 
| 201 |  |  |  |  |  |  | } | 
| 202 |  |  |  |  |  |  |  | 
| 203 | 17 |  |  |  |  | 105 | return( $self ); | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | =head2 identify() | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | identify() is the OAI verb that tells a metadata repository to provide a | 
| 209 |  |  |  |  |  |  | description of itself. A call to identify() returns a Net::OAI::Identify object | 
| 210 |  |  |  |  |  |  | which you can then call methods on to retrieve the information you are | 
| 211 |  |  |  |  |  |  | intersted in. For example: | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | my $identity = $harvester->identify(); | 
| 214 |  |  |  |  |  |  | print "repository name: ",$identity->repositoryName(),"\n"; | 
| 215 |  |  |  |  |  |  | print "protocol version: ",$identity->protocolVersion(),"\n"; | 
| 216 |  |  |  |  |  |  | print "earliest date stamp: ",$identity->earliestDatestamp(),"\n"; | 
| 217 |  |  |  |  |  |  | print "admin email(s): ", join( ", ", $identity->adminEmail() ), "\n"; | 
| 218 |  |  |  |  |  |  | ... | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | For more details see the Net::OAI::Identify documentation. | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | =cut | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  | sub identify { | 
| 225 | 4 |  |  | 4 | 1 | 1589 | my $self = shift; | 
| 226 | 4 |  |  |  |  | 12 | my $uri = $self->{ baseURL }; | 
| 227 | 4 |  |  |  |  | 43 | $uri->query_form( 'verb' => 'Identify' ); | 
| 228 |  |  |  |  |  |  |  | 
| 229 | 4 |  |  |  |  | 677 | my $identity = Net::OAI::Identify->new( $self->_get( $uri ) ); | 
| 230 | 4 | 100 |  |  |  | 35 | return $identity if $identity->errorCode(); | 
| 231 |  |  |  |  |  |  |  | 
| 232 | 2 |  |  |  |  | 23 | my $token = Net::OAI::ResumptionToken->new( Handler => $identity ); | 
| 233 | 2 |  |  |  |  | 19 | my $error = Net::OAI::Error->new( Handler => $token ); | 
| 234 | 2 |  |  |  |  | 8 | my $parser = _parser( $error ); | 
| 235 | 2 |  |  |  |  | 20 | debug( "parsing Identify response " .  $identity->file() ); | 
| 236 | 2 |  |  |  |  | 4 | eval { $parser->parse_uri( $identity->file() ) }; | 
|  | 2 |  |  |  |  | 7 |  | 
| 237 | 2 | 50 |  |  |  | 664 | if ( $@ ) {_xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 238 | 2 | 50 |  |  |  | 11 | $identity->{ token } = $token->token() ? $token : undef; | 
| 239 | 2 |  |  |  |  | 9 | $identity->{ error } = $error; | 
| 240 | 2 |  |  |  |  | 69 | return( $identity ); | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | =head2 listMetadataFormats() | 
| 244 |  |  |  |  |  |  |  | 
| 245 |  |  |  |  |  |  | listMetadataFormats() asks the repository to return a list of metadata formats | 
| 246 |  |  |  |  |  |  | that it supports. A call to listMetadataFormats() returns an | 
| 247 |  |  |  |  |  |  | Net::OAI::ListMetadataFormats object. | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | my $list = $harvester->listMetadataFormats(); | 
| 250 |  |  |  |  |  |  | print "archive supports metadata prefixes: ", | 
| 251 |  |  |  |  |  |  | join( ',', $list->prefixes() ),"\n"; | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | If you are interested in the metadata formats available for | 
| 254 |  |  |  |  |  |  | a particular resource identifier then you can pass in that identifier. | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | my $list = $harvester->listMetadataFormats( identifier => '1234567' ); | 
| 257 |  |  |  |  |  |  | print "record identifier 1234567 can be retrieved as ", | 
| 258 |  |  |  |  |  |  | join( ',', $list->prefixes() ),"\n"; | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | See documentation for Net::OAI::ListMetadataFormats for more details. | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | =cut | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  | sub listMetadataFormats { | 
| 265 | 2 |  |  | 2 | 1 | 452 | my ( $self, %opts ) = @_; | 
| 266 | 2 |  |  |  |  | 8 | my $uri = $self->{ baseURL }; | 
| 267 | 2 |  |  |  |  | 8 | my %pairs = ( verb => 'ListMetadataFormats' ); | 
| 268 | 2 | 100 |  |  |  | 8 | if ( $opts{ identifier } ) { | 
| 269 | 1 |  |  |  |  | 4 | $pairs{ identifier } = $opts{ identifier }; | 
| 270 |  |  |  |  |  |  | } | 
| 271 | 2 |  |  |  |  | 26 | $uri->query_form( %pairs ); | 
| 272 |  |  |  |  |  |  |  | 
| 273 | 2 |  |  |  |  | 348 | my $list = Net::OAI::ListMetadataFormats->new( $self->_get( $uri ) ); | 
| 274 | 2 | 50 |  |  |  | 21 | return $list if $list->errorCode(); | 
| 275 |  |  |  |  |  |  |  | 
| 276 | 2 |  |  |  |  | 24 | my $token = Net::OAI::ResumptionToken->new( Handler => $list ); | 
| 277 | 2 |  |  |  |  | 23 | my $error = Net::OAI::Error->new( Handler => $token ); | 
| 278 | 2 |  |  |  |  | 9 | my $parser = _parser( $error ); | 
| 279 | 2 |  |  |  |  | 24 | debug( "parsing ListMetadataFormats response: ".$list->file() ); | 
| 280 | 2 |  |  |  |  | 6 | eval{ $parser->parse_uri( $list->file() ) }; | 
|  | 2 |  |  |  |  | 11 |  | 
| 281 | 2 | 50 |  |  |  | 9530 | if ( $@ ) { _xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 282 | 2 | 50 |  |  |  | 16 | $list->{ token } = $token->token() ? $token : undef; | 
| 283 | 2 |  |  |  |  | 7 | $list->{ error } = $error; | 
| 284 | 2 |  |  |  |  | 100 | return( $list ); | 
| 285 |  |  |  |  |  |  | } | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | =head2 getRecord() | 
| 288 |  |  |  |  |  |  |  | 
| 289 |  |  |  |  |  |  | getRecord() is used to retrieve a single record from a repository. You must pass | 
| 290 |  |  |  |  |  |  | in the C and an optional C parameters to identify | 
| 291 |  |  |  |  |  |  | the record, and the flavor of metadata you would like. Net::OAI::Harvester | 
| 292 |  |  |  |  |  |  | includes a parser for OAI DublinCore, so if you do not specifiy a | 
| 293 |  |  |  |  |  |  | metadataPrefix 'oai_dc' will be assumed. If you would like to drop in you own | 
| 294 |  |  |  |  |  |  | XML::Handler for another type of metadata use the C parameter. | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | my $record = $harvester->getRecord( | 
| 297 |  |  |  |  |  |  | identifier	=> 'abc123', | 
| 298 |  |  |  |  |  |  | ); | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | ## get the Net::OAI::Record::Header object | 
| 301 |  |  |  |  |  |  | my $header = $record->header(); | 
| 302 |  |  |  |  |  |  |  | 
| 303 |  |  |  |  |  |  | ## get the metadata object | 
| 304 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | ## or if you would rather use your own XML::Handler | 
| 307 |  |  |  |  |  |  | ## pass in the package name for the object you would like to create | 
| 308 |  |  |  |  |  |  | my $record = $harvester->getRecord( | 
| 309 |  |  |  |  |  |  | identifier		=> 'abc123', | 
| 310 |  |  |  |  |  |  | metadataHandler		=> 'MyHandler' | 
| 311 |  |  |  |  |  |  | ); | 
| 312 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | =cut | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | sub getRecord { | 
| 317 | 3 |  |  | 3 | 1 | 1323 | my ( $self, %opts ) = @_; | 
| 318 |  |  |  |  |  |  |  | 
| 319 | 3 | 50 |  |  |  | 15 | croak( "you must pass the identifier parameter to getRecord()" ) | 
| 320 |  |  |  |  |  |  | if ( ! exists( $opts{ 'identifier' } ) ); | 
| 321 | 3 | 50 |  |  |  | 59 | croak( "you must pass the metadataPrefix parameter to getRecord()" ) | 
| 322 |  |  |  |  |  |  | if ( ! exists( $opts{ 'metadataPrefix' } ) ); | 
| 323 |  |  |  |  |  |  |  | 
| 324 | 3 |  |  |  |  | 6 | my $metadataHandler; | 
| 325 | 3 | 100 |  |  |  | 13 | if ( exists( $opts{ metadataHandler } ) ) { | 
| 326 | 1 |  |  |  |  | 4 | my $package = $opts{ metadataHandler }; | 
| 327 | 1 |  |  |  |  | 5 | _verifyMetadataHandler( $package ); | 
| 328 | 1 |  |  |  |  | 9 | $metadataHandler = $package->new(); | 
| 329 |  |  |  |  |  |  | } else { | 
| 330 | 2 |  |  |  |  | 30 | $metadataHandler = Net::OAI::Record::OAI_DC->new(); | 
| 331 |  |  |  |  |  |  | } | 
| 332 |  |  |  |  |  |  |  | 
| 333 | 3 |  |  |  |  | 91 | my $uri = $self->{ baseURL }; | 
| 334 | 3 |  |  |  |  | 41 | $uri->query_form( | 
| 335 |  |  |  |  |  |  | verb		=> 'GetRecord', | 
| 336 |  |  |  |  |  |  | identifier	=> $opts{ 'identifier' }, | 
| 337 |  |  |  |  |  |  | metadataPrefix	=> $opts{ 'metadataPrefix' } | 
| 338 |  |  |  |  |  |  | ); | 
| 339 |  |  |  |  |  |  |  | 
| 340 | 3 |  |  |  |  | 644 | my $record = Net::OAI::GetRecord->new( $self->_get( $uri ) ); | 
| 341 | 3 | 50 |  |  |  | 31 | return $record if $record->errorCode(); | 
| 342 |  |  |  |  |  |  |  | 
| 343 | 3 |  |  |  |  | 37 | my $header = Net::OAI::Record::Header->new( Handler => $metadataHandler ); | 
| 344 | 3 |  |  |  |  | 36 | my $error = Net::OAI::Error->new( Handler => $header ); | 
| 345 | 3 |  |  |  |  | 12 | my $parser = _parser( $error ); | 
| 346 | 3 |  |  |  |  | 69 | debug( "parsing GetRecord response " . $record->file() ); | 
| 347 | 3 |  |  |  |  | 13 | $parser->parse_uri( $record->file() ); | 
| 348 | 3 | 50 |  |  |  | 1262 | if ( $@ ) { _xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 349 |  |  |  |  |  |  |  | 
| 350 | 3 |  |  |  |  | 16 | $record->{ error } = $error; | 
| 351 | 3 |  |  |  |  | 10 | $record->{ metadata } = $metadataHandler; | 
| 352 | 3 |  |  |  |  | 11 | $record->{ header } = $header; | 
| 353 | 3 |  |  |  |  | 126 | return( $record ); | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | } | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  |  | 
| 358 |  |  |  |  |  |  | =head2 listRecords() | 
| 359 |  |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  | listRecords() allows you to retrieve all the records in a data repository. | 
| 361 |  |  |  |  |  |  | You must supply the C parameter to tell your Net::OAI::Harvester | 
| 362 |  |  |  |  |  |  | which type of records you are interested in. listRecords() returns an | 
| 363 |  |  |  |  |  |  | Net::OAI::ListRecords object. There are four other optional parameters C, | 
| 364 |  |  |  |  |  |  | C, C, and C which are better described in the | 
| 365 |  |  |  |  |  |  | OAI-PMH spec. | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | my $records = $harvester->listRecords( | 
| 368 |  |  |  |  |  |  | metadataPrefix	=> 'oai_dc' | 
| 369 |  |  |  |  |  |  | ); | 
| 370 |  |  |  |  |  |  |  | 
| 371 |  |  |  |  |  |  | ## iterate through the results with next() | 
| 372 |  |  |  |  |  |  | while ( my $record = $records->next() ) { | 
| 373 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 374 |  |  |  |  |  |  | ... | 
| 375 |  |  |  |  |  |  | } | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | If you would like to use your own metadata handler then you can specify | 
| 378 |  |  |  |  |  |  | the package name of the handler. | 
| 379 |  |  |  |  |  |  |  | 
| 380 |  |  |  |  |  |  | my $records = $harvester->listRecords( | 
| 381 |  |  |  |  |  |  | metadataPrefix	=> 'mods', | 
| 382 |  |  |  |  |  |  | metadataHandler	=> 'MODS::Handler' | 
| 383 |  |  |  |  |  |  | ); | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | while ( my $record = $records->next() ) { | 
| 386 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 387 |  |  |  |  |  |  | # $metadata will be a MODS::Handler object | 
| 388 |  |  |  |  |  |  | } | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | If you want to automatically handle resumption tokens you can with | 
| 391 |  |  |  |  |  |  | the listAllRecords() method. | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  | If you prefer you can handle resumption tokens yourself with a | 
| 394 |  |  |  |  |  |  | loop, and the resumptionToken() method. You might want to do this | 
| 395 |  |  |  |  |  |  | if you are working with a repository that wants you to wait between | 
| 396 |  |  |  |  |  |  | requests. | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | my $records = $harvester->listRecords( metadataPrefix => 'oai_dc' ); | 
| 399 |  |  |  |  |  |  | my $finished = 0; | 
| 400 |  |  |  |  |  |  |  | 
| 401 |  |  |  |  |  |  | while ( ! $finished ) { | 
| 402 |  |  |  |  |  |  |  | 
| 403 |  |  |  |  |  |  | while ( my $record = $records->next() ) { | 
| 404 |  |  |  |  |  |  | my $metadata = $record->metadata(); | 
| 405 |  |  |  |  |  |  | # do interesting stuff here | 
| 406 |  |  |  |  |  |  | } | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | my $rToken = $records->resumptionToken(); | 
| 409 |  |  |  |  |  |  | if ( $rToken ) { | 
| 410 |  |  |  |  |  |  | $records = $harvester->listRecords( | 
| 411 |  |  |  |  |  |  | resumptionToken => $rToken->token() | 
| 412 |  |  |  |  |  |  | ); | 
| 413 |  |  |  |  |  |  | } else { | 
| 414 |  |  |  |  |  |  | $finished = 1; | 
| 415 |  |  |  |  |  |  | } | 
| 416 |  |  |  |  |  |  |  | 
| 417 |  |  |  |  |  |  | } | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | =cut | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | sub listRecords { | 
| 422 | 5 |  |  | 5 | 1 | 1121 | my ( $self, %opts ) = @_; | 
| 423 |  |  |  |  |  |  |  | 
| 424 | 5 | 50 | 66 |  |  | 34 | croak( "you must pass the metadataPrefix parameter to listRecords()" ) | 
| 425 |  |  |  |  |  |  | if ( ! exists( $opts{ 'metadataPrefix' } ) | 
| 426 |  |  |  |  |  |  | and ! exists( $opts{ 'resumptionToken' } ) ); | 
| 427 | 5 |  |  |  |  | 17 | my %pairs = ( | 
| 428 |  |  |  |  |  |  | verb		  => 'ListRecords', | 
| 429 |  |  |  |  |  |  | ); | 
| 430 |  |  |  |  |  |  |  | 
| 431 | 5 |  |  |  |  | 16 | foreach ( qw( metadataPrefix from until set resumptionToken ) ) { | 
| 432 | 25 | 100 |  |  |  | 68 | if ( exists( $opts{ $_ } ) ) { | 
| 433 | 8 |  |  |  |  | 22 | $pairs{ $_ } = $opts{ $_ }; | 
| 434 |  |  |  |  |  |  | } | 
| 435 |  |  |  |  |  |  | } | 
| 436 | 5 |  |  |  |  | 26 | my $uri = $self->{ baseURL }; | 
| 437 | 5 |  |  |  |  | 62 | $uri->query_form( %pairs ); | 
| 438 |  |  |  |  |  |  |  | 
| 439 |  |  |  |  |  |  | my $list = Net::OAI::ListRecords->new( $self->_get( $uri ), | 
| 440 | 5 |  |  |  |  | 903 | metadataHandler => $opts{ metadataHandler } ); | 
| 441 | 5 | 50 |  |  |  | 64 | return $list if $list->errorCode(); | 
| 442 |  |  |  |  |  |  |  | 
| 443 | 5 |  |  |  |  | 53 | my $token = Net::OAI::ResumptionToken->new( Handler => $list ); | 
| 444 | 5 |  |  |  |  | 53 | my $error = Net::OAI::Error->new( Handler => $token ); | 
| 445 | 5 |  |  |  |  | 24 | my $parser = _parser( $error ); | 
| 446 | 5 |  |  |  |  | 64 | debug( "parsing ListRecords response " . $list->file() ); | 
| 447 | 5 |  |  |  |  | 13 | eval { $parser->parse_uri( $list->file() ) }; | 
|  | 5 |  |  |  |  | 19 |  | 
| 448 | 5 | 50 |  |  |  | 1724 | if ( $@ ) { _xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 449 |  |  |  |  |  |  |  | 
| 450 | 5 |  |  |  |  | 19 | $list->{ error } = $error; | 
| 451 | 5 | 100 |  |  |  | 28 | $list->{ token } = $token->token() ? $token : undef; | 
| 452 | 5 |  |  |  |  | 196 | return( $list ); | 
| 453 |  |  |  |  |  |  | } | 
| 454 |  |  |  |  |  |  |  | 
| 455 |  |  |  |  |  |  | =head2 listAllRecords() | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | Does exactly what listRecords() does except it will automatically | 
| 458 |  |  |  |  |  |  | submit resumption tokens as needed. | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | =cut | 
| 461 |  |  |  |  |  |  |  | 
| 462 |  |  |  |  |  |  | sub listAllRecords { | 
| 463 | 1 |  |  | 1 | 1 | 426 | my $self = shift; | 
| 464 | 1 |  |  |  |  | 6 | debug( "calling listRecords() as part of listAllRecords request" ); | 
| 465 | 1 |  |  |  |  | 4 | my $list = listRecords( $self, @_ ); | 
| 466 | 1 |  |  |  |  | 4 | $list->{ harvester } = $self; | 
| 467 | 1 |  |  |  |  | 5 | return( $list ); | 
| 468 |  |  |  |  |  |  | } | 
| 469 |  |  |  |  |  |  |  | 
| 470 |  |  |  |  |  |  | =head2 listIdentifiers() | 
| 471 |  |  |  |  |  |  |  | 
| 472 |  |  |  |  |  |  | listIdentifiers() takes the same parameters that listRecords() takes, but it | 
| 473 |  |  |  |  |  |  | returns only the record headers, allowing you to quickly retrieve all the | 
| 474 |  |  |  |  |  |  | record identifiers for a particular repository. The object returned is a | 
| 475 |  |  |  |  |  |  | Net::OAI::ListIdentifiers object. | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  | my $headers = $harvester->listIdentifiers( | 
| 478 |  |  |  |  |  |  | metadataPrefix	=> 'oai_dc' | 
| 479 |  |  |  |  |  |  | ); | 
| 480 |  |  |  |  |  |  |  | 
| 481 |  |  |  |  |  |  | ## iterate through the results with next() | 
| 482 |  |  |  |  |  |  | while ( my $header = $identifiers->next() ) { | 
| 483 |  |  |  |  |  |  | print "identifier: ", $header->identifier(), "\n"; | 
| 484 |  |  |  |  |  |  | } | 
| 485 |  |  |  |  |  |  |  | 
| 486 |  |  |  |  |  |  | If you want to automtically handle resumption tokens use listAllIdentifiers(). | 
| 487 |  |  |  |  |  |  | If you are working with a repository that encourages pauses between requests | 
| 488 |  |  |  |  |  |  | you can handle the tokens yourself using the technique described above | 
| 489 |  |  |  |  |  |  | in listRecords(). | 
| 490 |  |  |  |  |  |  |  | 
| 491 |  |  |  |  |  |  | =cut | 
| 492 |  |  |  |  |  |  |  | 
| 493 |  |  |  |  |  |  | sub listIdentifiers { | 
| 494 | 7 |  |  | 7 | 1 | 12425 | my ( $self, %opts ) = @_; | 
| 495 |  |  |  |  |  |  | croak( "listIdentifiers(): metadataPrefix is a required parameter" ) | 
| 496 |  |  |  |  |  |  | if ( ! exists( $opts{ metadataPrefix } ) | 
| 497 | 7 | 50 | 66 |  |  | 44 | and ! exists( $opts{ resumptionToken } ) ); | 
| 498 | 7 |  |  |  |  | 24 | my $uri = $self->{ baseURL }; | 
| 499 | 7 |  |  |  |  | 31 | my %pairs = ( | 
| 500 |  |  |  |  |  |  | verb		=> 'ListIdentifiers', | 
| 501 |  |  |  |  |  |  | ); | 
| 502 | 7 |  |  |  |  | 29 | foreach ( qw( metadataPrefix from until set resumptionToken ) ) { | 
| 503 | 35 | 100 |  |  |  | 100 | if ( exists( $opts{ $_ } ) ) { | 
| 504 | 10 |  |  |  |  | 30 | $pairs{ $_ } = $opts{ $_ }; | 
| 505 |  |  |  |  |  |  | } | 
| 506 |  |  |  |  |  |  | } | 
| 507 | 7 |  |  |  |  | 128 | $uri->query_form( %pairs ); | 
| 508 |  |  |  |  |  |  |  | 
| 509 | 7 |  |  |  |  | 1506 | my $list = Net::OAI::ListIdentifiers->new( $self->_get( $uri ) ); | 
| 510 | 7 | 50 |  |  |  | 264 | return( $list ) if $list->errorCode(); | 
| 511 |  |  |  |  |  |  |  | 
| 512 | 7 |  |  |  |  | 83 | my $token = Net::OAI::ResumptionToken->new( Handler => $list ); | 
| 513 | 7 |  |  |  |  | 84 | my $error = Net::OAI::Error->new( Handler => $token ); | 
| 514 | 7 |  |  |  |  | 34 | my $parser = _parser( $error ); | 
| 515 | 7 |  |  |  |  | 85 | debug( "parsing ListIdentifiers response " . $list->file() ); | 
| 516 | 7 |  |  |  |  | 16 | eval { $parser->parse_uri( $list->file() ) }; | 
|  | 7 |  |  |  |  | 32 |  | 
| 517 | 7 | 50 |  |  |  | 2693 | if ( $@ ) { _xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 518 | 7 | 100 |  |  |  | 41 | $list->{ token } = $token->token() ? $token : undef; | 
| 519 | 7 |  |  |  |  | 25 | $list->{ error } = $error; | 
| 520 | 7 |  |  |  |  | 320 | return( $list ); | 
| 521 |  |  |  |  |  |  | } | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | =head2 listAllIdentifiers() | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | Does exactly what listIdentifiers() does except it will automatically | 
| 526 |  |  |  |  |  |  | submit resumption tokens as needed. | 
| 527 |  |  |  |  |  |  |  | 
| 528 |  |  |  |  |  |  | =cut | 
| 529 |  |  |  |  |  |  |  | 
| 530 |  |  |  |  |  |  | sub listAllIdentifiers { | 
| 531 | 1 |  |  | 1 | 1 | 735 | my $self = shift; | 
| 532 | 1 |  |  |  |  | 10 | debug( "calling listIdentifiers() as part of listAllIdentifiers() call" ); | 
| 533 | 1 |  |  |  |  | 7 | my $list = listIdentifiers( $self, @_ ); | 
| 534 | 1 |  |  |  |  | 4 | $list->{ harvester } = $self; | 
| 535 | 1 |  |  |  |  | 4 | return( $list ); | 
| 536 |  |  |  |  |  |  | } | 
| 537 |  |  |  |  |  |  |  | 
| 538 |  |  |  |  |  |  | =head2 listSets() | 
| 539 |  |  |  |  |  |  |  | 
| 540 |  |  |  |  |  |  | listSets() takes an optional C parameter, and returns a | 
| 541 |  |  |  |  |  |  | Net::OAI::ListSets object. listSets() allows you to harvest a subset of a | 
| 542 |  |  |  |  |  |  | particular repository with listRecords(). For more information see the OAI-PMH | 
| 543 |  |  |  |  |  |  | spec and the Net::OAI::ListSets docs. | 
| 544 |  |  |  |  |  |  |  | 
| 545 |  |  |  |  |  |  | my $sets = $harvester->listSets(); | 
| 546 |  |  |  |  |  |  | foreach ( $sets->setSpecs() ) { | 
| 547 |  |  |  |  |  |  | print "set spec: $_ ; set name: ", $sets->setName( $_ ), "\n"; | 
| 548 |  |  |  |  |  |  | } | 
| 549 |  |  |  |  |  |  |  | 
| 550 |  |  |  |  |  |  | =cut | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | sub listSets { | 
| 553 | 1 |  |  | 1 | 1 | 880 | my ( $self, %opts ) = @_; | 
| 554 | 1 |  |  |  |  | 7 | my %pairs = ( verb => 'ListSets' ); | 
| 555 | 1 | 50 |  |  |  | 6 | if ( exists( $opts{ resumptionToken } ) ) { | 
| 556 | 0 |  |  |  |  | 0 | $pairs{ resumptionToken } = $opts{ resumptionToken }; | 
| 557 |  |  |  |  |  |  | } | 
| 558 | 1 |  |  |  |  | 4 | my $uri = $self->{ baseURL }; | 
| 559 | 1 |  |  |  |  | 21 | $uri->query_form( %pairs ); | 
| 560 |  |  |  |  |  |  |  | 
| 561 | 1 |  |  |  |  | 264 | my $list = Net::OAI::ListSets->new( $self->_get( $uri ) ); | 
| 562 | 1 | 50 |  |  |  | 11 | return( $list ) if $list->errorCode(); | 
| 563 |  |  |  |  |  |  |  | 
| 564 | 1 |  |  |  |  | 8 | my $token = Net::OAI::ResumptionToken->new( Handler => $list ); | 
| 565 | 1 |  |  |  |  | 10 | my $error = Net::OAI::Error->new( Handler => $token ); | 
| 566 | 1 |  |  |  |  | 6 | my $parser = _parser( $error ); | 
| 567 | 1 |  |  |  |  | 18 | debug( "parsing ListSets response " . $list->file() ); | 
| 568 | 1 |  |  |  |  | 2 | eval{ $parser->parse_uri( $list->file() ) }; | 
|  | 1 |  |  |  |  | 6 |  | 
| 569 | 1 | 50 |  |  |  | 368 | if ( $@ ) { _xmlError( $error ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 570 | 1 |  |  |  |  | 6 | $list->{ error } = $error; | 
| 571 | 1 | 50 |  |  |  | 7 | $list->{ token } = $token->token() ? $token : undef; | 
| 572 | 1 |  |  |  |  | 55 | return( $list ); | 
| 573 |  |  |  |  |  |  | } | 
| 574 |  |  |  |  |  |  |  | 
| 575 |  |  |  |  |  |  | =head2 baseURL() | 
| 576 |  |  |  |  |  |  |  | 
| 577 |  |  |  |  |  |  | Gets or sets the base URL for the repository being harvested. | 
| 578 |  |  |  |  |  |  |  | 
| 579 |  |  |  |  |  |  | $harvester->baseURL( 'http://memory.loc.gov/cgi-bin/oai2_0' ); | 
| 580 |  |  |  |  |  |  |  | 
| 581 |  |  |  |  |  |  | Or if you want to know what the current baseURL is | 
| 582 |  |  |  |  |  |  |  | 
| 583 |  |  |  |  |  |  | $baseURL = $harvester->baseURL(); | 
| 584 |  |  |  |  |  |  |  | 
| 585 |  |  |  |  |  |  | =cut | 
| 586 |  |  |  |  |  |  |  | 
| 587 |  |  |  |  |  |  | sub baseURL { | 
| 588 | 0 |  |  | 0 | 1 | 0 | my ( $self, $url ) = @_; | 
| 589 | 0 | 0 |  |  |  | 0 | if ( $url ) { $self->{ baseURL } = URI->new( $url ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 590 |  |  |  |  |  |  | # The HTTP UserAgent modifies its URI object upon execution, | 
| 591 |  |  |  |  |  |  | # therefore we have to reconstruct: trim the query part ... | 
| 592 | 0 |  |  |  |  | 0 | my $c = $self->{ baseURL }->canonical(); | 
| 593 | 0 | 0 | 0 |  |  | 0 | if ( $c && ($c =~ /^([^\?]*)\?/) ) {  # $c might be undefined | 
| 594 | 0 |  |  |  |  | 0 | return $1}; | 
| 595 | 0 |  |  |  |  | 0 | return $c; | 
| 596 |  |  |  |  |  |  | } | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | =head2 userAgent() | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | Gets or sets the LWP::UserAgent object being used to perform the HTTP | 
| 601 |  |  |  |  |  |  | transactions. This method could be useful if you wanted to change the | 
| 602 |  |  |  |  |  |  | agent string, timeout, or some other feature. | 
| 603 |  |  |  |  |  |  |  | 
| 604 |  |  |  |  |  |  | =cut | 
| 605 |  |  |  |  |  |  |  | 
| 606 |  |  |  |  |  |  | sub userAgent { | 
| 607 | 18 |  |  | 18 | 1 | 49 | my ( $self, $ua ) = @_; | 
| 608 | 18 | 100 |  |  |  | 120 | if ( $ua ) { | 
| 609 | 17 | 50 |  |  |  | 105 | croak( "userAgent() needs a valid LWP::UserAgent" ) | 
| 610 |  |  |  |  |  |  | if ref( $ua ) ne 'LWP::UserAgent'; | 
| 611 | 17 |  |  |  |  | 110 | $self->{ userAgent } = $ua; | 
| 612 |  |  |  |  |  |  | } | 
| 613 | 18 |  |  |  |  | 56 | return( $self->{ userAgent } ); | 
| 614 |  |  |  |  |  |  | } | 
| 615 |  |  |  |  |  |  |  | 
| 616 |  |  |  |  |  |  | ## internal stuff | 
| 617 |  |  |  |  |  |  |  | 
| 618 |  |  |  |  |  |  | sub _get { | 
| 619 | 22 |  |  | 22 |  | 55 | my ($self,$uri) = @_; | 
| 620 | 22 |  |  |  |  | 67 | my $ua = $self->{ userAgent }; | 
| 621 |  |  |  |  |  |  |  | 
| 622 | 22 |  |  |  |  | 64 | my ($fh, $file); | 
| 623 |  |  |  |  |  |  |  | 
| 624 | 22 | 100 |  |  |  | 89 | if ( $self->{ dumpDir } ) { | 
| 625 | 2 |  |  |  |  | 6 | my $filePrefix = $self->{lastDump}++; | 
| 626 | 2 |  |  |  |  | 14 | $file = sprintf("%s/%08d.xml", $self->{dumpDir}, $filePrefix); | 
| 627 | 2 |  |  |  |  | 23 | $fh = IO::File->new($file, 'w'); | 
| 628 |  |  |  |  |  |  | } | 
| 629 |  |  |  |  |  |  |  | 
| 630 |  |  |  |  |  |  | else { | 
| 631 | 20 |  |  |  |  | 162 | ( $fh, $file ) = tempfile(UNLINK => 1); | 
| 632 |  |  |  |  |  |  | } | 
| 633 |  |  |  |  |  |  |  | 
| 634 | 22 |  |  |  |  | 15741 | debug( "fetching ".$uri->as_string() ); | 
| 635 | 22 |  |  |  |  | 94 | debug( "writing to file: $file" ); | 
| 636 | 22 |  |  |  |  | 89 | my $request = HTTP::Request->new( GET => $uri->as_string() ); | 
| 637 | 22 |  |  | 1588 |  | 4023 | my $response = $ua->request( $request, sub { print $fh shift; }, 4096 ); | 
|  | 1588 |  |  |  |  | 117661344 |  | 
| 638 | 22 |  |  |  |  | 1020756 | close( $fh ); | 
| 639 |  |  |  |  |  |  |  | 
| 640 | 22 | 100 |  |  |  | 172 | if ( $response->is_error() ) { | 
| 641 |  |  |  |  |  |  | # HTTP::Request does not provide a file in case of HTTP level errors, | 
| 642 |  |  |  |  |  |  | # therefore we do not return the name of the non-existant file but | 
| 643 |  |  |  |  |  |  | # rather the original HTTP::Response object | 
| 644 | 2 |  |  |  |  | 33 | debug( "caught HTTP level error" . $response->message() ); | 
| 645 | 2 |  |  |  |  | 13 | my $error = Net::OAI::Error->new( | 
| 646 |  |  |  |  |  |  | errorString     => 'HTTP Level Error: ' . $response->message(), | 
| 647 |  |  |  |  |  |  | errorCode       => $response->code(), | 
| 648 |  |  |  |  |  |  | HTTPError       => $response, | 
| 649 |  |  |  |  |  |  | ); | 
| 650 |  |  |  |  |  |  | return( | 
| 651 |  |  |  |  |  |  | #	    file	    => $file, | 
| 652 | 2 |  |  |  |  | 26 | error           => $error | 
| 653 |  |  |  |  |  |  | ); | 
| 654 |  |  |  |  |  |  | } | 
| 655 |  |  |  |  |  |  |  | 
| 656 |  |  |  |  |  |  | return( | 
| 657 | 20 |  |  |  |  | 1452 | file	    => $file, | 
| 658 |  |  |  |  |  |  | ); | 
| 659 |  |  |  |  |  |  |  | 
| 660 |  |  |  |  |  |  | } | 
| 661 |  |  |  |  |  |  |  | 
| 662 |  |  |  |  |  |  | sub _parser { | 
| 663 | 22 |  |  | 22 |  | 1328 | my $handler = shift; | 
| 664 | 22 |  |  |  |  | 247 | my $factory = XML::SAX::ParserFactory->new(); | 
| 665 | 22 |  |  |  |  | 4000 | my $parser; | 
| 666 | 22 |  |  |  |  | 125 | $factory->require_feature(Namespaces); | 
| 667 | 22 |  |  |  |  | 231 | eval { $parser = $factory->parser( Handler => $handler ) }; | 
|  | 22 |  |  |  |  | 115 |  | 
| 668 | 22 | 100 |  |  |  | 1937 | warn ref($factory)." threw an exception:\n\t$@" if $@; | 
| 669 |  |  |  |  |  |  |  | 
| 670 | 22 | 100 | 66 |  |  | 3807 | if ( $parser && ref($parser) ) { | 
| 671 | 1 |  |  |  |  | 12 | debug( "using SAX parser " . ref($parser) . " " . $parser->VERSION ); | 
| 672 | 1 |  |  |  |  | 5 | return $parser; | 
| 673 |  |  |  |  |  |  | }; | 
| 674 |  |  |  |  |  |  |  | 
| 675 | 21 |  |  |  |  | 1603 | warn "!!! Please check your setup of XML::SAX, especially ParserDetails.ini !!!\n"; | 
| 676 | 21 |  |  |  |  | 93 | local($XML::SAX::ParserPackage) = "XML::SAX::PurePerl"; | 
| 677 | 21 |  |  |  |  | 62 | eval { $parser = $factory->parser( Handler => $handler ) }; | 
|  | 21 |  |  |  |  | 210 |  | 
| 678 | 21 | 50 |  |  |  | 10575526 | warn ref($factory)." threw an exception again:\n\t$@" if $@; | 
| 679 | 21 | 50 | 33 |  |  | 209 | if ( $parser && ref($parser) ) { | 
| 680 | 21 |  |  |  |  | 2431 | warn "Successfuly forced assignment of a parser: " . ref($parser) . " " . $parser->VERSION ."\n"; | 
| 681 | 21 |  |  |  |  | 322 | return $parser; | 
| 682 |  |  |  |  |  |  | }; | 
| 683 |  |  |  |  |  |  |  | 
| 684 | 0 |  |  |  |  | 0 | croak( ref($factory)." on request did not even give us the default XML::SAX::PurePerl parser.\nGiving up." ); | 
| 685 |  |  |  |  |  |  | } | 
| 686 |  |  |  |  |  |  |  | 
| 687 |  |  |  |  |  |  | sub _xmlError { | 
| 688 | 0 |  |  | 0 |  | 0 | my $e = shift; | 
| 689 | 0 |  |  |  |  | 0 | warn "caught xml parsing error: $@"; | 
| 690 | 0 |  |  |  |  | 0 | $e->errorString( "XML parsing error: $@" ); | 
| 691 | 0 |  |  |  |  | 0 | $e->errorCode( 'xmlParseError' ); | 
| 692 |  |  |  |  |  |  | } | 
| 693 |  |  |  |  |  |  |  | 
| 694 |  |  |  |  |  |  |  | 
| 695 |  |  |  |  |  |  | sub _verifyMetadataHandler { | 
| 696 | 6 |  |  | 6 |  | 85 | my $package = shift; | 
| 697 | 6 |  |  | 4 |  | 991 | eval( "use $package" ); | 
|  | 4 |  |  |  |  | 409 |  | 
|  | 4 |  |  |  |  | 291 |  | 
|  | 4 |  |  |  |  | 72 |  | 
| 698 | 6 | 50 |  |  |  | 36 | _fatal( "unable to locate metadataHandler $package in: " . | 
| 699 |  |  |  |  |  |  | join( "\n\t", @INC ) ) if $@; | 
| 700 | 6 |  |  |  |  | 33 | _fatal( "metadataHandler $package must inherit from XML::SAX::Base\n" ) | 
| 701 | 6 | 50 |  |  |  | 296 | if ( ! grep { 'XML::SAX::Base' } eval( '@' . $package . '::ISA' ) ); | 
| 702 | 6 |  |  |  |  | 26 | return( 1 ); | 
| 703 |  |  |  |  |  |  | } | 
| 704 |  |  |  |  |  |  |  | 
| 705 |  |  |  |  |  |  |  | 
| 706 |  |  |  |  |  |  | sub debug { | 
| 707 | 6683 |  |  | 6683 | 0 | 14369 | my $msg = shift; | 
| 708 | 6683 | 50 |  |  |  | 28477 | if ( $Net::OAI::Harvester::DEBUG ) { | 
| 709 | 0 |  |  |  |  | 0 | print STDERR "oai-harvester: " . localtime() . ": $msg\n"; | 
| 710 |  |  |  |  |  |  | } | 
| 711 |  |  |  |  |  |  | } | 
| 712 |  |  |  |  |  |  |  | 
| 713 |  |  |  |  |  |  | sub _fatal { | 
| 714 | 0 |  |  | 0 |  | 0 | my $msg = shift; | 
| 715 | 0 |  |  |  |  | 0 | print STDERR "fatal: $msg\n\n"; | 
| 716 | 0 |  |  |  |  | 0 | exit( 1 ); | 
| 717 |  |  |  |  |  |  | } | 
| 718 |  |  |  |  |  |  |  | 
| 719 |  |  |  |  |  |  | =head1 DIAGNOSTICS | 
| 720 |  |  |  |  |  |  |  | 
| 721 |  |  |  |  |  |  | If you would like to see diagnostic information when harvesting is running | 
| 722 |  |  |  |  |  |  | then set Net::OAI::Harvester::DEBUG to a true value. | 
| 723 |  |  |  |  |  |  |  | 
| 724 |  |  |  |  |  |  | $Net::OAI::Harvester::DEBUG = 1; | 
| 725 |  |  |  |  |  |  |  | 
| 726 |  |  |  |  |  |  | =head1 PERFORMANCE | 
| 727 |  |  |  |  |  |  |  | 
| 728 |  |  |  |  |  |  | XML::SAX is used for parsing, but it presents a generalized interface to many | 
| 729 |  |  |  |  |  |  | parsers. It comes with XML::Parser::PurePerl by default, which is nice since | 
| 730 |  |  |  |  |  |  | you don't have to worry about getting the right libraries installed. However | 
| 731 |  |  |  |  |  |  | XML::Parser::PurePerl is rather slow compared to XML::LibXML. If you | 
| 732 |  |  |  |  |  |  | are a speed freak install XML::LibXML from CPAN today. | 
| 733 |  |  |  |  |  |  |  | 
| 734 |  |  |  |  |  |  | If you have a particular parser you want to use you can set the | 
| 735 |  |  |  |  |  |  | $XML::SAX::ParserPackage variable appropriately. See XML::SAX::ParserFactory | 
| 736 |  |  |  |  |  |  | documentation for details. | 
| 737 |  |  |  |  |  |  |  | 
| 738 |  |  |  |  |  |  | =head1 TODO | 
| 739 |  |  |  |  |  |  |  | 
| 740 |  |  |  |  |  |  | =over 4 | 
| 741 |  |  |  |  |  |  |  | 
| 742 |  |  |  |  |  |  | =item * | 
| 743 |  |  |  |  |  |  |  | 
| 744 |  |  |  |  |  |  | Allow Net::OAI::ListMetadataFormats to store more than just the metadata | 
| 745 |  |  |  |  |  |  | prefixes. | 
| 746 |  |  |  |  |  |  |  | 
| 747 |  |  |  |  |  |  | =item * | 
| 748 |  |  |  |  |  |  |  | 
| 749 |  |  |  |  |  |  | Implement Net::OAI::Set for iterator access to Net::OAI::ListSets. | 
| 750 |  |  |  |  |  |  |  | 
| 751 |  |  |  |  |  |  | =item * | 
| 752 |  |  |  |  |  |  |  | 
| 753 |  |  |  |  |  |  | Implement Net::OAI::Harvester::listAllSets(). | 
| 754 |  |  |  |  |  |  |  | 
| 755 |  |  |  |  |  |  | =item * | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | More documentation of other classes. | 
| 758 |  |  |  |  |  |  |  | 
| 759 |  |  |  |  |  |  | =item * | 
| 760 |  |  |  |  |  |  |  | 
| 761 |  |  |  |  |  |  | Document custom XML::Handler creation. | 
| 762 |  |  |  |  |  |  |  | 
| 763 |  |  |  |  |  |  | =item * | 
| 764 |  |  |  |  |  |  |  | 
| 765 |  |  |  |  |  |  | Handle optional compression. | 
| 766 |  |  |  |  |  |  |  | 
| 767 |  |  |  |  |  |  | =item * | 
| 768 |  |  |  |  |  |  |  | 
| 769 |  |  |  |  |  |  | Create common handlers for other metadata formats (MARC, qualified DC, etc). | 
| 770 |  |  |  |  |  |  |  | 
| 771 |  |  |  |  |  |  | =item * | 
| 772 |  |  |  |  |  |  |  | 
| 773 |  |  |  |  |  |  | Selectively load Net::OAI::* classes as needed, rather than getting all of them | 
| 774 |  |  |  |  |  |  | at once at the beginning of Net::OAI::Harvester. | 
| 775 |  |  |  |  |  |  |  | 
| 776 |  |  |  |  |  |  | =back | 
| 777 |  |  |  |  |  |  |  | 
| 778 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 779 |  |  |  |  |  |  |  | 
| 780 |  |  |  |  |  |  | =over 4 | 
| 781 |  |  |  |  |  |  |  | 
| 782 |  |  |  |  |  |  | =item * | 
| 783 |  |  |  |  |  |  |  | 
| 784 |  |  |  |  |  |  | OAI-PMH Specification at L | 
| 785 |  |  |  |  |  |  |  | 
| 786 |  |  |  |  |  |  | =item * | 
| 787 |  |  |  |  |  |  |  | 
| 788 |  |  |  |  |  |  | L | 
| 789 |  |  |  |  |  |  |  | 
| 790 |  |  |  |  |  |  | =item * | 
| 791 |  |  |  |  |  |  |  | 
| 792 |  |  |  |  |  |  | L | 
| 793 |  |  |  |  |  |  |  | 
| 794 |  |  |  |  |  |  | =item * | 
| 795 |  |  |  |  |  |  |  | 
| 796 |  |  |  |  |  |  | L | 
| 797 |  |  |  |  |  |  |  | 
| 798 |  |  |  |  |  |  | =item * | 
| 799 |  |  |  |  |  |  |  | 
| 800 |  |  |  |  |  |  | L | 
| 801 |  |  |  |  |  |  |  | 
| 802 |  |  |  |  |  |  | =item * | 
| 803 |  |  |  |  |  |  |  | 
| 804 |  |  |  |  |  |  | L | 
| 805 |  |  |  |  |  |  |  | 
| 806 |  |  |  |  |  |  | =item * | 
| 807 |  |  |  |  |  |  |  | 
| 808 |  |  |  |  |  |  | L | 
| 809 |  |  |  |  |  |  |  | 
| 810 |  |  |  |  |  |  | =item * | 
| 811 |  |  |  |  |  |  |  | 
| 812 |  |  |  |  |  |  | L | 
| 813 |  |  |  |  |  |  |  | 
| 814 |  |  |  |  |  |  | =item * | 
| 815 |  |  |  |  |  |  |  | 
| 816 |  |  |  |  |  |  | L | 
| 817 |  |  |  |  |  |  |  | 
| 818 |  |  |  |  |  |  | =item * | 
| 819 |  |  |  |  |  |  |  | 
| 820 |  |  |  |  |  |  | L | 
| 821 |  |  |  |  |  |  |  | 
| 822 |  |  |  |  |  |  | =item * | 
| 823 |  |  |  |  |  |  |  | 
| 824 |  |  |  |  |  |  | L | 
| 825 |  |  |  |  |  |  |  | 
| 826 |  |  |  |  |  |  | =item * | 
| 827 |  |  |  |  |  |  |  | 
| 828 |  |  |  |  |  |  | L | 
| 829 |  |  |  |  |  |  |  | 
| 830 |  |  |  |  |  |  | =item * | 
| 831 |  |  |  |  |  |  |  | 
| 832 |  |  |  |  |  |  | L | 
| 833 |  |  |  |  |  |  |  | 
| 834 |  |  |  |  |  |  | =item * | 
| 835 |  |  |  |  |  |  |  | 
| 836 |  |  |  |  |  |  | L | 
| 837 |  |  |  |  |  |  |  | 
| 838 |  |  |  |  |  |  | =back | 
| 839 |  |  |  |  |  |  |  | 
| 840 |  |  |  |  |  |  | =head1 AUTHORS | 
| 841 |  |  |  |  |  |  |  | 
| 842 |  |  |  |  |  |  | =over 4 | 
| 843 |  |  |  |  |  |  |  | 
| 844 |  |  |  |  |  |  | =item * | 
| 845 |  |  |  |  |  |  |  | 
| 846 |  |  |  |  |  |  | Ed Summers | 
| 847 |  |  |  |  |  |  |  | 
| 848 |  |  |  |  |  |  | =item * | 
| 849 |  |  |  |  |  |  |  | 
| 850 |  |  |  |  |  |  | Martin Emmerich | 
| 851 |  |  |  |  |  |  |  | 
| 852 |  |  |  |  |  |  | =back | 
| 853 |  |  |  |  |  |  |  | 
| 854 |  |  |  |  |  |  | =cut | 
| 855 |  |  |  |  |  |  |  | 
| 856 |  |  |  |  |  |  | 1; |