line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Museum::Rijksmuseum::Object::Harvester; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
1316
|
use strictures 2; |
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
63
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
232
|
use Carp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
52
|
|
6
|
1
|
|
|
1
|
|
480
|
use HTTP::OAI; |
|
1
|
|
|
|
|
120649
|
|
|
1
|
|
|
|
|
39
|
|
7
|
1
|
|
|
1
|
|
9
|
use Moo; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9
|
|
8
|
1
|
|
|
1
|
|
404
|
use Time::HiRes qw( sleep ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
9
|
1
|
|
|
1
|
|
107
|
use URI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
22
|
|
10
|
1
|
|
|
1
|
|
5
|
use URI::QueryParam; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
21
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 NAME |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Museum::Rijksmuseum::Object::Harvester - Bulk-fetching of Rijksmuseum data via the OAI-PMH interface |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 VERSION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
See L<Museum::Rijksmuseum::Object> |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=cut |
21
|
|
|
|
|
|
|
|
22
|
1
|
|
|
1
|
|
5
|
use Museum::Rijksmuseum::Object; our $VERSION = $Museum::Rijksmuseum::Object::VERSION; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
23
|
1
|
|
|
1
|
|
5
|
use namespace::clean; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
11
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 SYNOPSIS |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Does a bulk fetch of the Rijksmuseum collection database using the OAI-PMH |
28
|
|
|
|
|
|
|
interface. For each record a callback will be called with the data. Note that |
29
|
|
|
|
|
|
|
the format of this data won't necessarily be the same as returned by the |
30
|
|
|
|
|
|
|
L<Museum::Rijksmuseum::Object> calls, as it's coming from a different endpoint. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
use Museum::Rijksmuseum::Object::Harvester; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $h = Museum::Rijksmuseum::Object::Harvester->new( key => 'abc123xyz' ); |
35
|
|
|
|
|
|
|
my $status = $h->harvest( |
36
|
|
|
|
|
|
|
set => 'subject:PublicDomainImages', |
37
|
|
|
|
|
|
|
from => '2023-01-01', |
38
|
|
|
|
|
|
|
type => 'identifiers', |
39
|
|
|
|
|
|
|
callback => \&process_record, |
40
|
|
|
|
|
|
|
); |
41
|
|
|
|
|
|
|
if ( $status->{error} ) { |
42
|
|
|
|
|
|
|
die "Error: $status->{error}\nLast resumption token: $status->{resumptionToken}\n"; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
if ( $status->{resumptionToken} ) { |
45
|
|
|
|
|
|
|
print "Finished, token: $status->{resumptionToken}\n"; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 new |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my $h = Museum::Rijksmuseum::Object::Harvester->new( key => 'abc123xyz' ); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Create a new instance of the harvester. C<key> is required. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=cut |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head2 harvest |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
my $status = $h->harvest( |
61
|
|
|
|
|
|
|
set => 'subject:PublicDomainImages', |
62
|
|
|
|
|
|
|
from => '2023-01-01', |
63
|
|
|
|
|
|
|
to => '2023-01-31', |
64
|
|
|
|
|
|
|
resumptionToken => $last_token_you_saw, |
65
|
|
|
|
|
|
|
delay => 1_000, # 1 second |
66
|
|
|
|
|
|
|
type => 'identifiers', |
67
|
|
|
|
|
|
|
callback => \&process_record, |
68
|
|
|
|
|
|
|
); |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Begins harvesting the records from the Rijksmuseum. The only required fields |
71
|
|
|
|
|
|
|
are C<callback> and C<type>, but the default delay is 10 seconds so you |
72
|
|
|
|
|
|
|
probably want to think about putting something sensible in there (or leave it |
73
|
|
|
|
|
|
|
at 10 seconds if you don't mind being very polite.) If you have a resumption |
74
|
|
|
|
|
|
|
token, perhaps you're recovering from a previous failure, you can supply that. |
75
|
|
|
|
|
|
|
C<from> and C<to> are not defined in the API documentation, so it's uncertain |
76
|
|
|
|
|
|
|
what they refer to. Latest update time maybe? |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
C<type> can in theory be C<identifiers> or C<records> (mapping to |
79
|
|
|
|
|
|
|
C<ListIdenifiers> and C<ListRecords> internally), but C<records> is currently |
80
|
|
|
|
|
|
|
unsupported as at writing time I don't need it and it's a fair bit of work to |
81
|
|
|
|
|
|
|
do right. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
C<callback> will be called for every identifier or record, in the case of |
84
|
|
|
|
|
|
|
identifers it'll eceive a hashref containing C<identifier> and C<datestamp>. |
85
|
|
|
|
|
|
|
If the callback returns a non-false value (i.e. any value), we quietly shut down. |
86
|
|
|
|
|
|
|
Due to the way resumption tokens work (i.e. they can be the same for subsequent |
87
|
|
|
|
|
|
|
requests), even if you request a shutdown, you'll still be fed the rest of the |
88
|
|
|
|
|
|
|
batch. This helps avoid missing records. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The return value is a hashref that contains C<error> if something went wrong, |
91
|
|
|
|
|
|
|
and possibly a C<resumptionToken> to let you know how to pick up again. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
There is some basic retry logic with exponential backoff that'll hopefully help |
94
|
|
|
|
|
|
|
seamlessly recover from transient network or service issues. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=cut |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub harvest { |
99
|
0
|
|
|
0
|
1
|
|
my ( $self, %args ) = @_; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
my $params = { |
102
|
|
|
|
|
|
|
$args{set} ? ( set => $args{set} ) : (), |
103
|
|
|
|
|
|
|
$args{from} ? ( from => $args{from} ) : (), |
104
|
|
|
|
|
|
|
$args{until} ? ( until => $args{until} ) : (), |
105
|
0
|
0
|
|
|
|
|
$args{resumptionToken} ? ( resumptionToken => $args{resumptionToken} ) : (), |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
106
|
|
|
|
|
|
|
}; |
107
|
|
|
|
|
|
|
|
108
|
0
|
|
0
|
|
|
|
my $delay = $args{delay} // 10_000; |
109
|
0
|
0
|
0
|
|
|
|
if ( !$args{type} || $args{type} ne 'identifiers' ) { |
110
|
0
|
|
|
|
|
|
croak 'Only type "identifiers" is currently supported, but you still have to say it'; |
111
|
|
|
|
|
|
|
} |
112
|
0
|
|
|
|
|
|
my $verb = 'ListIdentifiers'; |
113
|
0
|
|
|
|
|
|
my $callback = $args{callback}; |
114
|
0
|
0
|
|
|
|
|
croak 'A "callback" parameter is required.' unless $callback; |
115
|
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
my $url = sprintf( 'https://www.rijksmuseum.nl/api/oai/%s', $self->key ); |
117
|
0
|
|
|
|
|
|
my $harv = HTTP::OAI::Harvester->new( baseURL => $url ); |
118
|
|
|
|
|
|
|
# We'll handle resume ourselves, because I think the default way |
119
|
|
|
|
|
|
|
# wants to load _everything_ all in one go, or something. It's weird |
120
|
|
|
|
|
|
|
# and not useful anyway. |
121
|
0
|
|
|
|
|
|
$harv->resume(0); |
122
|
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
|
$params->{metadataPrefix} = 'edm_dc'; |
124
|
0
|
|
|
|
|
|
my $last_resumption_token = undef; |
125
|
0
|
|
|
|
|
|
my ( $li, $shutdown ); |
126
|
0
|
|
0
|
|
|
|
do { |
|
|
|
0
|
|
|
|
|
127
|
0
|
|
|
|
|
|
$li = $harv->ListIdentifiers(%$params); |
128
|
0
|
|
|
|
|
|
my $retries = 10; |
129
|
0
|
|
|
|
|
|
my $backoff_delay = 1; |
130
|
0
|
|
|
|
|
|
while (ref($li) ne 'HTTP::OAI::Response') { |
131
|
|
|
|
|
|
|
# TODO it'd be nice to put some proper logging in here. |
132
|
0
|
|
|
|
|
|
sleep($backoff_delay); |
133
|
0
|
0
|
|
|
|
|
if (--$retries <= 0) { |
134
|
0
|
|
|
|
|
|
die "Error connecting to API server, all retries used up: " . $li->status_line . "\n"; |
135
|
|
|
|
|
|
|
} |
136
|
0
|
|
|
|
|
|
$backoff_delay *= 1.5; # poor man's exponential backoff |
137
|
0
|
|
|
|
|
|
$li = $harv->ListIdentifiers(%$params); |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
0
|
|
|
|
|
|
while ( my $rec = $li->next ) { |
141
|
0
|
|
|
|
|
|
my $sd = $callback->($rec); |
142
|
0
|
|
0
|
|
|
|
$shutdown ||= $sd; |
143
|
|
|
|
|
|
|
} |
144
|
0
|
0
|
|
|
|
|
if ( $li->is_error ) { |
|
|
0
|
|
|
|
|
|
145
|
|
|
|
|
|
|
return { |
146
|
0
|
0
|
|
|
|
|
$last_resumption_token ? ( resumptionToken => $last_resumption_token ) : (), |
147
|
|
|
|
|
|
|
error => $li->message, |
148
|
|
|
|
|
|
|
}; |
149
|
|
|
|
|
|
|
} elsif ( !$shutdown ) { |
150
|
0
|
|
|
|
|
|
$last_resumption_token = $li->resumptionToken; |
151
|
0
|
|
|
|
|
|
$params = { resumptionToken => $last_resumption_token->resumptionToken }; |
152
|
0
|
0
|
0
|
|
|
|
sleep( $delay / 1000.0 ) unless !$delay || !$last_resumption_token; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} while ( !$shutdown && $li->is_success && $last_resumption_token ); |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
return { |
157
|
0
|
0
|
|
|
|
|
resumptionToken => $last_resumption_token ? $last_resumption_token->resumptionToken : undef, |
158
|
|
|
|
|
|
|
shutdownRequested => $shutdown, |
159
|
|
|
|
|
|
|
}; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head2 key |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
The API key provided by the Rijksmuseum. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=cut |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
has key => ( |
171
|
|
|
|
|
|
|
is => 'rw', |
172
|
|
|
|
|
|
|
required => 1, |
173
|
|
|
|
|
|
|
); |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 AUTHOR |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Robin Sheat, C<< <rsheat at cpan.org> >> |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 TODO |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=over 4 |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item Handle the ListRecords verb |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
This'll require writing a parser for EDM-DC or similar. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item Implement logging |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
A proper logging system would allow recording of transient failures to see if |
190
|
|
|
|
|
|
|
they are becoming a problem. It would also allow the option for more |
191
|
|
|
|
|
|
|
fine-grained progress information to be displayed. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=back |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=cut |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 BUGS |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-museum-rijksmuseum-object at rt.cpan.org>, or through |
200
|
|
|
|
|
|
|
the web interface at L<https://rt.cpan.org/NoAuth/ReportBug.html?Queue=Museum-Rijksmuseum-Object>. I will be notified, and then you'll |
201
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Alternately, use the tracker on the repository page at L<https://gitlab.com/eythian/museum-rijksmuseum-object>. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head1 SUPPORT |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
perldoc Museum::Rijksmuseum::Object::Harvester |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
You can also look for information at: |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=over 4 |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=item * Repository page (report bugs here) |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
L<https://gitlab.com/eythian/museum-rijksmuseum-object> |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (or here) |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=Museum-Rijksmuseum-Object> |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item * CPAN Ratings |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
L<https://cpanratings.perl.org/d/Museum-Rijksmuseum-Object> |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=item * Search CPAN |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
L<https://metacpan.org/release/Museum-Rijksmuseum-Object> |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=back |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Robin Sheat. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
This is free software, licensed under: |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
The Artistic License 2.0 (GPL Compatible) |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=cut |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
1; |