File Coverage

blib/lib/Museum/Rijksmuseum/Object/Harvester.pm
Criterion Covered Total %
statement 27 59 45.7
branch 0 26 0.0
condition 0 17 0.0
subroutine 9 10 90.0
pod 1 1 100.0
total 37 113 32.7


line stmt bran cond sub pod time code
1             package Museum::Rijksmuseum::Object::Harvester;
2              
3 1     1   1244 use strictures 2;
  1         8  
  1         35  
4              
5 1     1   372 use Carp;
  1         3  
  1         129  
6 1     1   561 use HTTP::OAI;
  1         140432  
  1         32  
7 1     1   8 use Moo;
  1         1  
  1         20  
8 1     1   420 use Time::HiRes qw( sleep );
  1         2  
  1         9  
9 1     1   55 use URI;
  1         2  
  1         16  
10 1     1   3 use URI::QueryParam;
  1         1  
  1         16  
11              
12             =head1 NAME
13              
14             Museum::Rijksmuseum::Object::Harvester - Bulk-fetching of Rijksmuseum data via the OAI-PMH interface
15              
16             =head1 VERSION
17              
18             See L
19              
20             =cut
21              
22 1     1   4 use Museum::Rijksmuseum::Object; our $VERSION = $Museum::Rijksmuseum::Object::VERSION;
  1         1  
  1         25  
23 1     1   3 use namespace::clean;
  1         2  
  1         10  
24              
25             =head1 SYNOPSIS
26              
27             Does a bulk fetch of the Rijksmuseum collection database using the OAI-PMH
28             interface. For each record a callback will be called with the data. Note that
29             the format of this data won't necessarily be the same as returned by the
30             L calls, as it's coming from a different endpoint.
31              
32             use Museum::Rijksmuseum::Object::Harvester;
33              
34             my $h = Museum::Rijksmuseum::Object::Harvester->new();
35             my $status = $h->harvest(
36             set => 'subject:PublicDomainImages',
37             from => '2023-01-01',
38             type => 'identifiers',
39             callback => \&process_record,
40             );
41             if ( $status->{error} ) {
42             die "Error: $status->{error}\nLast resumption token: $status->{resumptionToken}\n";
43             }
44             if ( $status->{resumptionToken} ) {
45             print "Finished, token: $status->{resumptionToken}\n";
46             }
47              
48             =head1 SUBROUTINES/METHODS
49              
50             =head2 new
51              
52             my $h = Museum::Rijksmuseum::Object::Harvester->new();
53              
54             Create a new instance of the harvester.
55              
56             =cut
57              
58             =head2 harvest
59              
60             my $status = $h->harvest(
61             set => 'subject:PublicDomainImages',
62             from => '2023-01-01',
63             to => '2023-01-31',
64             resumptionToken => $last_token_you_saw,
65             delay => 1_000, # 1 second
66             type => 'identifiers',
67             callback => \&process_record,
68             );
69            
70             Begins harvesting the records from the Rijksmuseum. The only required fields
71             are C and C, but the default delay is 10 seconds so you
72             probably want to think about putting something sensible in there (or leave it
73             at 10 seconds if you don't mind being very polite.) If you have a resumption
74             token, perhaps you're recovering from a previous failure, you can supply that.
75             C and C are not defined in the API documentation, so it's uncertain
76             what they refer to. Latest update time maybe?
77              
78             C can in theory be C or C (mapping to
79             C and C internally), but C is currently
80             unsupported as at writing time I don't need it and it's a fair bit of work to
81             do right.
82              
83             C will be called for every identifier or record, in the case of
84             identifers it'll eceive a hashref containing C and C.
85             If the callback returns a non-false value (i.e. any value), we quietly shut down.
86             Due to the way resumption tokens work (i.e. they can be the same for subsequent
87             requests), even if you request a shutdown, you'll still be fed the rest of the
88             batch. This helps avoid missing records.
89              
90             The return value is a hashref that contains C if something went wrong,
91             and possibly a C to let you know how to pick up again.
92              
93             There is some basic retry logic with exponential backoff that'll hopefully help
94             seamlessly recover from transient network or service issues.
95              
96             =cut
97              
98             sub harvest {
99 0     0 1   my ( $self, %args ) = @_;
100              
101             my $params = {
102             $args{set} ? ( set => $args{set} ) : (),
103             $args{from} ? ( from => $args{from} ) : (),
104             $args{until} ? ( until => $args{until} ) : (),
105 0 0         $args{resumptionToken} ? ( resumptionToken => $args{resumptionToken} ) : (),
    0          
    0          
    0          
106             };
107              
108 0   0       my $delay = $args{delay} // 10_000;
109 0 0 0       if ( !$args{type} || $args{type} ne 'identifiers' ) {
110 0           croak 'Only type "identifiers" is currently supported, but you still have to say it';
111             }
112 0           my $verb = 'ListIdentifiers';
113 0           my $callback = $args{callback};
114 0 0         croak 'A "callback" parameter is required.' unless $callback;
115              
116 0           my $harv = HTTP::OAI::Harvester->new( baseURL => 'https://data.rijksmuseum.nl/oai' );
117             # We'll handle resume ourselves, because I think the default way
118             # wants to load _everything_ all in one go, or something. It's weird
119             # and not useful anyway.
120 0           $harv->resume(0);
121              
122 0           $params->{metadataPrefix} = 'oai_dc';
123 0           my $last_resumption_token = undef;
124 0           my ( $li, $shutdown );
125 0   0       do {
      0        
126 0           $li = $harv->ListIdentifiers(%$params);
127 0           my $retries = 10;
128 0           my $backoff_delay = 1;
129 0           while (ref($li) ne 'HTTP::OAI::Response') {
130             # TODO it'd be nice to put some proper logging in here.
131 0           sleep($backoff_delay);
132 0 0         if (--$retries <= 0) {
133 0           die "Error connecting to API server, all retries used up: " . $li->status_line . "\n";
134             }
135 0           $backoff_delay *= 1.5; # poor man's exponential backoff
136 0           $li = $harv->ListIdentifiers(%$params);
137             }
138              
139 0           while ( my $rec = $li->next ) {
140 0           my $sd = $callback->($rec);
141 0   0       $shutdown ||= $sd;
142             }
143 0 0         if ( $li->is_error ) {
    0          
144             return {
145 0 0         $last_resumption_token ? ( resumptionToken => $last_resumption_token ) : (),
146             error => $li->message,
147             };
148             } elsif ( !$shutdown ) {
149 0           $last_resumption_token = $li->resumptionToken;
150 0 0         $params = { resumptionToken => $last_resumption_token->resumptionToken }
151             if $last_resumption_token;
152 0 0 0       sleep( $delay / 1000.0 ) unless !$delay || !$last_resumption_token;
153             }
154             } while ( !$shutdown && $li->is_success && $last_resumption_token );
155              
156             return {
157 0 0         resumptionToken => $last_resumption_token ? $last_resumption_token->resumptionToken : undef,
158             shutdownRequested => $shutdown,
159             };
160             }
161              
162             =head1 AUTHOR
163              
164             Robin Sheat, C<< >>
165              
166             =head1 TODO
167              
168             =over 4
169              
170             =item Handle the ListRecords verb
171              
172             This'll require writing a parser for EDM-DC or similar.
173              
174             =item Implement logging
175              
176             A proper logging system would allow recording of transient failures to see if
177             they are becoming a problem. It would also allow the option for more
178             fine-grained progress information to be displayed.
179              
180             =back
181              
182             =cut
183              
184             =head1 BUGS
185              
186             Please report any bugs or feature requests to C, or through
187             the web interface at L. I will be notified, and then you'll
188             automatically be notified of progress on your bug as I make changes.
189              
190             Alternately, use the tracker on the repository page at L.
191              
192              
193             =head1 SUPPORT
194              
195             You can find documentation for this module with the perldoc command.
196              
197             perldoc Museum::Rijksmuseum::Object::Harvester
198              
199              
200             You can also look for information at:
201              
202             =over 4
203              
204             =item * Repository page (report bugs here)
205              
206             L
207              
208             =item * RT: CPAN's request tracker (or here)
209              
210             L
211              
212             =item * Search CPAN
213              
214             L
215              
216              
217             =back
218              
219              
220             =head1 ACKNOWLEDGEMENTS
221              
222              
223             =head1 LICENSE AND COPYRIGHT
224              
225             This software is Copyright (c) 2023-2026 by Robin Sheat.
226              
227             This is free software, licensed under:
228              
229             The Artistic License 2.0 (GPL Compatible)
230              
231              
232             =cut
233              
234             1;