line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Genealogy::ChroniclingAmerica; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# https://chroniclingamerica.loc.gov/search/pages/results/?state=Indiana&andtext=james=serjeant&date1=1894&date2=1896&format=json |
4
|
4
|
|
|
4
|
|
375740
|
use warnings; |
|
4
|
|
|
|
|
28
|
|
|
4
|
|
|
|
|
140
|
|
5
|
4
|
|
|
4
|
|
23
|
use strict; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
78
|
|
6
|
4
|
|
|
4
|
|
2779
|
use LWP::UserAgent; |
|
4
|
|
|
|
|
187802
|
|
|
4
|
|
|
|
|
154
|
|
7
|
4
|
|
|
4
|
|
2784
|
use JSON; |
|
4
|
|
|
|
|
40603
|
|
|
4
|
|
|
|
|
24
|
|
8
|
4
|
|
|
4
|
|
629
|
use URI; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
104
|
|
9
|
4
|
|
|
4
|
|
22
|
use Carp; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
3251
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Genealogy::ChroniclingAmerica - Find URLs for a given person on the Library of Congress Newspaper Records |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 VERSION |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.03 |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.03'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
use HTTP::Cache::Transparent; # be nice |
26
|
|
|
|
|
|
|
use Genealogy::ChroniclingAmerica; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
HTTP::Cache::Transparent::init({ |
29
|
|
|
|
|
|
|
BasePath => '/tmp/cache' |
30
|
|
|
|
|
|
|
}); |
31
|
|
|
|
|
|
|
my $loc = Genealogy::ChroniclingAmerica->new({ |
32
|
|
|
|
|
|
|
firstname => 'John', |
33
|
|
|
|
|
|
|
lastname => 'Smith', |
34
|
|
|
|
|
|
|
state => 'Indiana', |
35
|
|
|
|
|
|
|
date_of_death => 1862 |
36
|
|
|
|
|
|
|
}); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
while(my $url = $loc->get_next_entry()) { |
39
|
|
|
|
|
|
|
print "$url\n"; |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head2 new |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
Creates a Genealogy::ChroniclingAmerica object. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
It takes three mandatory arguments state, firstname and lastname. |
49
|
|
|
|
|
|
|
State must be the full name, not an abbreviation. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
There are four optional arguments: middlename, date_of_birth, date_of_death, ua and host: |
52
|
|
|
|
|
|
|
host is the domain of the site to search, the default is chroniclingamerica.loc.gov. |
53
|
|
|
|
|
|
|
ua is a pointer to an object that understands get and env_proxy messages, such |
54
|
|
|
|
|
|
|
as L. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=cut |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub new { |
59
|
6
|
|
|
6
|
1
|
1604
|
my $proto = shift; |
60
|
6
|
|
66
|
|
|
48
|
my $class = ref($proto) || $proto; |
61
|
|
|
|
|
|
|
|
62
|
6
|
100
|
|
|
|
28
|
return unless(defined($class)); |
63
|
|
|
|
|
|
|
|
64
|
5
|
|
|
|
|
13
|
my %args; |
65
|
5
|
100
|
33
|
|
|
33
|
if(ref($_[0]) eq 'HASH') { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
66
|
4
|
|
|
|
|
10
|
%args = %{$_[0]}; |
|
4
|
|
|
|
|
24
|
|
67
|
|
|
|
|
|
|
} elsif(ref($_[0]) || !defined($_[0])) { |
68
|
0
|
|
|
|
|
0
|
Carp::croak('Usage: ', __PACKAGE__, '->new(%args)'); |
69
|
|
|
|
|
|
|
} elsif(@_ % 2 == 0) { |
70
|
1
|
|
|
|
|
9
|
%args = @_; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
|
73
|
5
|
50
|
|
|
|
23
|
unless($args{'firstname'}) { |
74
|
0
|
|
|
|
|
0
|
Carp::croak('First name is not optional'); |
75
|
0
|
|
|
|
|
0
|
return; # Don't know why this is needed, but it is |
76
|
|
|
|
|
|
|
} |
77
|
5
|
50
|
|
|
|
20
|
unless(defined($args{'lastname'})) { |
78
|
0
|
|
|
|
|
0
|
Carp::croak('Last name is not optional'); |
79
|
0
|
|
|
|
|
0
|
return; |
80
|
|
|
|
|
|
|
} |
81
|
5
|
50
|
|
|
|
17
|
unless($args{'state'}) { |
82
|
0
|
|
|
|
|
0
|
Carp::croak('State is not optional'); |
83
|
0
|
|
|
|
|
0
|
return; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
5
|
50
|
|
|
|
21
|
Carp::croak('State needs to be the full name') if(length($args{'state'}) == 2); |
87
|
|
|
|
|
|
|
|
88
|
5
|
|
33
|
|
|
61
|
my $ua = $args{'ua'} || LWP::UserAgent->new(agent => __PACKAGE__ . "/$VERSION"); |
89
|
5
|
50
|
|
|
|
6922
|
$ua->env_proxy(1) unless(delete $args{'ua'}); |
90
|
|
|
|
|
|
|
|
91
|
5
|
|
|
|
|
29334
|
my $rc = { ua => $ua }; |
92
|
5
|
|
50
|
|
|
40
|
$rc->{'host'} = $args{'host'} || 'chroniclingamerica.loc.gov'; |
93
|
|
|
|
|
|
|
|
94
|
5
|
|
|
|
|
35
|
my %query_parameters = ( 'format' => 'json', 'state' => ucfirst(lc($args{'state'})) ); |
95
|
5
|
100
|
|
|
|
50
|
if($query_parameters{'state'} eq 'District of columbia') { |
96
|
1
|
|
|
|
|
4
|
$query_parameters{'state'} = 'District of Columbia'; |
97
|
|
|
|
|
|
|
} |
98
|
5
|
|
|
|
|
28
|
my $name = $args{'firstname'}; |
99
|
5
|
100
|
|
|
|
20
|
if($args{'middlename'}) { |
100
|
1
|
|
|
|
|
6
|
$rc->{'name'} = "$name $args{middlename} $args{lastname}"; |
101
|
1
|
|
|
|
|
3
|
$name .= "=$args{middlename}"; |
102
|
|
|
|
|
|
|
} else { |
103
|
4
|
|
|
|
|
20
|
$rc->{'name'} = "$name $args{lastname}"; |
104
|
|
|
|
|
|
|
} |
105
|
5
|
|
|
|
|
34
|
$name .= "=$args{lastname}"; |
106
|
|
|
|
|
|
|
|
107
|
5
|
|
|
|
|
19
|
$query_parameters{'andtext'} = $name; |
108
|
5
|
50
|
|
|
|
18
|
if($args{'date_of_birth'}) { |
109
|
5
|
|
|
|
|
14
|
$query_parameters{'date1'} = $args{'date_of_birth'}; |
110
|
|
|
|
|
|
|
} |
111
|
5
|
100
|
|
|
|
18
|
if($args{'date_of_death'}) { |
112
|
3
|
|
|
|
|
9
|
$query_parameters{'date2'} = $args{'date_of_death'}; |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
|
115
|
5
|
|
|
|
|
43
|
my $uri = URI->new("https://$rc->{host}/search/pages/results/"); |
116
|
5
|
|
|
|
|
16881
|
$uri->query_form(%query_parameters); |
117
|
5
|
|
|
|
|
963
|
my $url = $uri->as_string(); |
118
|
|
|
|
|
|
|
# ::diag(">>>>$url = ", $rc->{'name'}); |
119
|
|
|
|
|
|
|
# print ">>>>$url = ", $rc->{'name'}, "\n"; |
120
|
|
|
|
|
|
|
|
121
|
5
|
|
|
|
|
47
|
my $resp = $ua->get($url); |
122
|
|
|
|
|
|
|
|
123
|
5
|
50
|
|
|
|
2837994
|
if($resp->is_error()) { |
124
|
0
|
|
|
|
|
0
|
Carp::carp("API returned error on $url: ", $resp->status_line()); |
125
|
0
|
|
|
|
|
0
|
return; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
5
|
50
|
|
|
|
91
|
unless($resp->is_success()) { |
129
|
0
|
|
|
|
|
0
|
die $resp->status_line(); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
5
|
|
|
|
|
141
|
$rc->{'json'} = JSON->new(); |
133
|
5
|
|
|
|
|
33
|
my $data = $rc->{'json'}->decode($resp->content()); |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# ::diag(Data::Dumper->new([$data])->Dump()); |
136
|
|
|
|
|
|
|
|
137
|
5
|
|
|
|
|
1811
|
my $matches = $data->{'totalItems'}; |
138
|
5
|
50
|
|
|
|
28
|
if($data->{'itemsPerPage'} < $matches) { |
139
|
0
|
|
|
|
|
0
|
$matches = $data->{'itemsPerPage'}; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
5
|
|
|
|
|
17
|
$rc->{'matches'} = $matches; |
143
|
5
|
100
|
|
|
|
21
|
if($matches) { |
144
|
3
|
|
|
|
|
21
|
$rc->{'query_parameters'} = \%query_parameters; |
145
|
3
|
|
|
|
|
10
|
$rc->{'items'} = $data->{'items'}; |
146
|
3
|
|
|
|
|
16
|
$rc->{'index'} = 0; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
5
|
|
|
|
|
356
|
return bless $rc, $class; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=head2 get_next_entry |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Returns the next match as a URL. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=cut |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub get_next_entry |
159
|
|
|
|
|
|
|
{ |
160
|
8
|
|
|
8
|
1
|
6381
|
my $self = shift; |
161
|
|
|
|
|
|
|
|
162
|
8
|
100
|
|
|
|
43
|
return if($self->{'matches'} == 0); |
163
|
|
|
|
|
|
|
|
164
|
6
|
100
|
|
|
|
25
|
if($self->{'index'} >= $self->{'matches'}) { |
165
|
4
|
|
|
|
|
17
|
return; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
2
|
|
|
|
|
7
|
my $entry = @{$self->{'items'}}[$self->{'index'}]; |
|
2
|
|
|
|
|
9
|
|
169
|
2
|
|
|
|
|
6
|
$self->{'index'}++; |
170
|
|
|
|
|
|
|
|
171
|
2
|
50
|
|
|
|
8
|
if(!defined($entry->{'url'})) { |
172
|
0
|
|
|
|
|
0
|
return $self->get_next_entry(); |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
2
|
|
|
|
|
19
|
my $text = $entry->{'ocr_eng'}; |
176
|
|
|
|
|
|
|
|
177
|
2
|
50
|
|
|
|
8
|
if(!defined($text)) { |
178
|
0
|
|
|
|
|
0
|
return $self->get_next_entry(); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
2
|
|
|
|
|
839
|
$text =~ s/[\r\n]/ /g; |
182
|
2
|
50
|
|
|
|
114
|
if($text !~ /$self->{'name'}/ims) { |
183
|
0
|
|
|
|
|
0
|
return $self->get_next_entry(); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# ::diag(Data::Dumper->new([$entry])->Dump()); |
187
|
|
|
|
|
|
|
|
188
|
2
|
|
|
|
|
15
|
my $resp = $self->{'ua'}->get($entry->{'url'}); |
189
|
|
|
|
|
|
|
|
190
|
2
|
50
|
|
|
|
988057
|
if($resp->is_error()) { |
191
|
|
|
|
|
|
|
# print 'got: ', $resp->content(), "\n"; |
192
|
0
|
|
|
|
|
0
|
Carp::carp("get_next_entry: API returned error on $entry->{url}: ", $resp->status_line()); |
193
|
0
|
|
|
|
|
0
|
return; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
2
|
50
|
|
|
|
34
|
unless($resp->is_success()) { |
197
|
0
|
|
|
|
|
0
|
die $resp->status_line(); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
2
|
|
|
|
|
39
|
return $self->{'json'}->decode($resp->content())->{'pdf'}; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 AUTHOR |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Nigel Horne, C<< >> |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head1 BUGS |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
If a middle name is given and no match is found, |
210
|
|
|
|
|
|
|
it should search again without the middle name. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, |
213
|
|
|
|
|
|
|
or through the web interface at |
214
|
|
|
|
|
|
|
L. |
215
|
|
|
|
|
|
|
I will be notified, and then you'll |
216
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=head1 SEE ALSO |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
L |
221
|
|
|
|
|
|
|
L |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head1 SUPPORT |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
perldoc Genealogy::ChroniclingAmerica |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
You can also look for information at: |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=over 4 |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
L |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=item * CPAN Ratings |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
L |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item * Search CPAN |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
L |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=back |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
Copyright 2018,2019 Nigel Horne. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
This program is released under the following licence: GPL2 |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=cut |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
1; # End of Genealogy::ChroniclingAmerica |