| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Scrape::FindaGrave; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 4 |  |  | 4 |  | 160648 | use warnings; | 
|  | 4 |  |  |  |  | 6 |  | 
|  | 4 |  |  |  |  | 100 |  | 
| 4 | 4 |  |  | 4 |  | 12 | use strict; | 
|  | 4 |  |  |  |  | 4 |  | 
|  | 4 |  |  |  |  | 56 |  | 
| 5 | 4 |  |  | 4 |  | 1589 | use WWW::Mechanize::GZip; | 
|  | 4 |  |  |  |  | 389135 |  | 
|  | 4 |  |  |  |  | 113 |  | 
| 6 | 4 |  |  | 4 |  | 23 | use LWP::UserAgent; | 
|  | 4 |  |  |  |  | 5 |  | 
|  | 4 |  |  |  |  | 52 |  | 
| 7 | 4 |  |  | 4 |  | 1586 | use HTML::SimpleLinkExtor; | 
|  | 4 |  |  |  |  | 17587 |  | 
|  | 4 |  |  |  |  | 2392 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | =head1 NAME | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | WWW::Scrape::FindaGrave - Scrape the FindaGrave site | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | =head1 VERSION | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | Version 0.02 | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | =cut | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | our $VERSION = '0.02'; | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | use HTTP::Cache::Transparent;  # be nice | 
| 24 |  |  |  |  |  |  | use WWW::Scrape::FindaGrave; | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | HTTP::Cache::Transparent::init({ | 
| 27 |  |  |  |  |  |  | BasePath => '/var/cache/findagrave' | 
| 28 |  |  |  |  |  |  | }); | 
| 29 |  |  |  |  |  |  | my $f = WWW::Scrape::FindaGrave->new({ | 
| 30 |  |  |  |  |  |  | firstname => 'John', | 
| 31 |  |  |  |  |  |  | lastname => 'Smith', | 
| 32 |  |  |  |  |  |  | country => 'England', | 
| 33 |  |  |  |  |  |  | date_of_death => 1862 | 
| 34 |  |  |  |  |  |  | }); | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | while(my $url = $f->get_next_entry()) { | 
| 37 |  |  |  |  |  |  | print "$url\n"; | 
| 38 |  |  |  |  |  |  | } | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | =head1 SUBROUTINES/METHODS | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | =head2 new | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | Creates a WWW::Scrape::FindaGrave object. | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | It takes two mandatory arguments: firstname and lastname. | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | In addition, at least one of date_of_birth or date_of_death must be given. | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | There are three optional arguments: middlename, country and mech.  mech is a | 
| 52 |  |  |  |  |  |  | reference to an object such as L<WWW::Mechanize>.  If not given, a L<WWW::Mechanize::GZip> object is created. | 
| 53 |  |  |  |  |  |  | =cut | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | sub new { | 
| 56 | 3 |  |  | 3 | 1 | 87827 | my $proto = shift; | 
| 57 | 3 |  | 33 |  |  | 18 | my $class = ref($proto) || $proto; | 
| 58 |  |  |  |  |  |  |  | 
| 59 | 3 | 50 |  |  |  | 10 | return unless(defined($class)); | 
| 60 |  |  |  |  |  |  |  | 
| 61 | 3 | 50 |  |  |  | 12 | my %args = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_; | 
|  | 3 |  |  |  |  | 12 |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 | 3 | 50 |  |  |  | 11 | die "First name is not optional" unless($args{'firstname'}); | 
| 64 | 3 | 50 |  |  |  | 8 | die "Last name is not optional" unless($args{'lastname'}); | 
| 65 |  |  |  |  |  |  | die "You must give one of the date of birth or death" | 
| 66 | 3 | 50 | 66 |  |  | 10 | unless($args{'date_of_death'} || $args{'date_of_birth'}); | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | my $rc = { | 
| 69 |  |  |  |  |  |  | mech => $args{'mech'} || WWW::Mechanize::GZip->new(), | 
| 70 |  |  |  |  |  |  | date_of_birth => $args{'date_of_birth'}, | 
| 71 |  |  |  |  |  |  | date_of_death => $args{'date_of_death'}, | 
| 72 |  |  |  |  |  |  | country => $args{'country'}, | 
| 73 |  |  |  |  |  |  | firstname => $args{'firstname'}, | 
| 74 |  |  |  |  |  |  | middlename => $args{'middlename'}, | 
| 75 | 3 |  | 33 |  |  | 99 | lastname => $args{'lastname'}, | 
| 76 |  |  |  |  |  |  | }; | 
| 77 |  |  |  |  |  |  |  | 
| 78 | 3 |  |  |  |  | 20218 | my $resp = $rc->{'mech'}->get('http://www.findagrave.com/cgi-bin/fg.cgi'); | 
| 79 | 3 | 50 |  |  |  | 1138196 | unless($resp->is_success()) { | 
| 80 | 0 |  |  |  |  | 0 | die $resp->status_line; | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | my %fields = ( | 
| 84 |  |  |  |  |  |  | GSfn => $rc->{'firstname'}, | 
| 85 | 3 |  |  |  |  | 50 | GSln => $rc->{'lastname'}, | 
| 86 |  |  |  |  |  |  | GSiman => 0, | 
| 87 |  |  |  |  |  |  | GSpartial => 0, | 
| 88 |  |  |  |  |  |  | ); | 
| 89 |  |  |  |  |  |  |  | 
| 90 | 3 | 100 |  |  |  | 13 | if($rc->{date_of_death}) { | 
|  |  | 50 |  |  |  |  |  | 
| 91 | 2 |  |  |  |  | 4 | $fields{GSdy} = $rc->{date_of_death}; | 
| 92 | 2 |  |  |  |  | 5 | $fields{GSdyrel} = 'in'; | 
| 93 |  |  |  |  |  |  | } elsif($rc->{'date_of_birth'}) { | 
| 94 | 1 |  |  |  |  | 2 | $fields{GSby} = $rc->{date_of_birth}; | 
| 95 | 1 |  |  |  |  | 2 | $fields{GSbyrel} = 'in'; | 
| 96 |  |  |  |  |  |  | } | 
| 97 |  |  |  |  |  |  |  | 
| 98 | 3 | 50 |  |  |  | 12 | if($rc->{'middlename'}) { | 
| 99 | 0 |  |  |  |  | 0 | $fields{GSmn} = $rc->{'middlename'}; | 
| 100 |  |  |  |  |  |  | } | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | # Don't enable this.  If we know the date of birth but findagrave | 
| 103 |  |  |  |  |  |  | # doesn't, findagrave will miss the match. Of course, the downside | 
| 104 |  |  |  |  |  |  | # of not doing this is that you will get false positives.  It's really | 
| 105 |  |  |  |  |  |  | # a problem with findagrave. | 
| 106 |  |  |  |  |  |  | # if($date_of_birth) { | 
| 107 |  |  |  |  |  |  | # $fields{GSby} = $date_of_birth; | 
| 108 |  |  |  |  |  |  | # $fields{GSbyrel} = 'in'; | 
| 109 |  |  |  |  |  |  | # } | 
| 110 |  |  |  |  |  |  |  | 
| 111 | 3 | 100 |  |  |  | 9 | if($rc->{'country'}) { | 
| 112 | 2 | 50 |  |  |  | 7 | if($rc->{'country'} eq 'United States') { | 
| 113 | 0 |  |  |  |  | 0 | $fields{GScntry} = 'The United States'; | 
| 114 |  |  |  |  |  |  | } else { | 
| 115 | 2 |  |  |  |  | 5 | $fields{GScntry} = $rc->{'country'}; | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  | } | 
| 118 |  |  |  |  |  |  |  | 
| 119 | 3 |  |  |  |  | 18 | $resp = $rc->{'mech'}->submit_form( | 
| 120 |  |  |  |  |  |  | form_number => 1, | 
| 121 |  |  |  |  |  |  | fields => \%fields, | 
| 122 |  |  |  |  |  |  | ); | 
| 123 | 3 | 50 |  |  |  | 1486457 | unless($resp->is_success) { | 
| 124 | 0 |  |  |  |  | 0 | die $resp->status_line; | 
| 125 |  |  |  |  |  |  | } | 
| 126 | 3 | 100 |  |  |  | 27 | if($resp->content =~ /Sorry, there are no records in the Find A Grave database matching your query\./) { | 
| 127 | 1 |  |  |  |  | 15 | $rc->{'matches'} = 0; | 
| 128 | 1 |  |  |  |  | 10 | return bless $rc, $class; | 
| 129 |  |  |  |  |  |  | } | 
| 130 | 2 | 50 |  |  |  | 35 | if($resp->content =~ /(\d+)<\/B>\s+total matches/mi) { | 
| 131 | 2 |  |  |  |  | 79 | $rc->{'matches'} = $1; | 
| 132 | 2 | 50 |  |  |  | 8 | return bless $rc, $class if($rc->{'matches'} == 0); | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | # Shows 40 per page | 
| 136 | 2 |  |  |  |  | 7 | $rc->{'base'} = $resp->base(); | 
| 137 | 2 |  |  |  |  | 526 | $rc->{'ua'} = LWP::UserAgent->new( | 
| 138 |  |  |  |  |  |  | keep_alive => 1, | 
| 139 |  |  |  |  |  |  | agent => __PACKAGE__, | 
| 140 |  |  |  |  |  |  | from => 'foo@example.com', | 
| 141 |  |  |  |  |  |  | timeout => 10, | 
| 142 |  |  |  |  |  |  | ); | 
| 143 |  |  |  |  |  |  |  | 
| 144 | 2 |  |  |  |  | 3724 | $rc->{'ua'}->env_proxy(1); | 
| 145 | 2 |  |  |  |  | 146 | $rc->{'index'} = 0; | 
| 146 | 2 |  |  |  |  | 4 | $rc->{'resp'} = $resp; | 
| 147 |  |  |  |  |  |  |  | 
| 148 | 2 |  |  |  |  | 23 | return bless $rc, $class; | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | =head2 get_next_entry | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | Returns the next match as a URL to the Find-A-Grave page. | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | =cut | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | sub get_next_entry | 
| 158 |  |  |  |  |  |  | { | 
| 159 | 4 |  |  | 4 | 1 | 2894 | my $self = shift; | 
| 160 |  |  |  |  |  |  |  | 
| 161 | 4 | 100 |  |  |  | 19 | return if($self->{'matches'} == 0); | 
| 162 |  |  |  |  |  |  |  | 
| 163 | 3 |  |  |  |  | 3 | my $rc = pop @{$self->{'results'}}; | 
|  | 3 |  |  |  |  | 7 |  | 
| 164 | 3 | 50 |  |  |  | 8 | return $rc if $rc; | 
| 165 |  |  |  |  |  |  |  | 
| 166 | 3 | 100 |  |  |  | 13 | return if($self->{'index'} >= $self->{'matches'}); | 
| 167 |  |  |  |  |  |  |  | 
| 168 | 1 |  |  |  |  | 3 | my $firstname = $self->{'firstname'}; | 
| 169 | 1 |  |  |  |  | 2 | my $lastname = $self->{'lastname'}; | 
| 170 | 1 |  |  |  |  | 2 | my $date_of_death = $self->{'date_of_death'}; | 
| 171 | 1 |  |  |  |  | 1 | my $date_of_birth = $self->{'date_of_birth'}; | 
| 172 |  |  |  |  |  |  |  | 
| 173 | 1 |  |  |  |  | 3 | my $base = $self->{'resp'}->base(); | 
| 174 | 1 |  |  |  |  | 282 | my $e = HTML::SimpleLinkExtor->new($base); | 
| 175 | 1 |  |  |  |  | 371 | $e->remove_tags('img', 'script'); | 
| 176 | 1 |  |  |  |  | 10 | $e->parse($self->{'resp'}->content); | 
| 177 |  |  |  |  |  |  |  | 
| 178 | 1 |  |  |  |  | 2124 | foreach my $link ($e->links) { | 
| 179 | 29 |  |  |  |  | 7200 | my $match = 0; | 
| 180 | 29 | 50 |  |  |  | 30 | if($date_of_death) { | 
|  |  | 0 |  |  |  |  |  | 
| 181 | 29 | 100 |  |  |  | 110 | if($link =~ /www.findagrave.com\/cgi-bin\/fg.cgi\?.*&GSln=\Q$lastname\E.*&GSfn=\Q$firstname\E.*&GSdy=\Q$date_of_death\E.*&GRid=\d+/i) { | 
| 182 | 1 |  |  |  |  | 8 | $match = 1; | 
| 183 |  |  |  |  |  |  | } | 
| 184 |  |  |  |  |  |  | } elsif(defined($date_of_birth)) { | 
| 185 | 0 | 0 |  |  |  | 0 | if($link =~ /www.findagrave.com\/cgi-bin\/fg.cgi\?.*&GSln=\Q$lastname\E.*&GSfn=\Q$firstname\E.*&GSby=\Q$date_of_birth\E.*&GRid=\d+/i) { | 
| 186 | 0 |  |  |  |  | 0 | $match = 1; | 
| 187 |  |  |  |  |  |  | } | 
| 188 |  |  |  |  |  |  | } | 
| 189 | 29 | 50 | 66 |  |  | 146 | if($match && $self->{'country'}) { | 
| 190 | 1 |  |  |  |  | 3 | my $country = $self->{'country'}; | 
| 191 | 1 | 50 |  |  |  | 7 | if($self->{'resp'}->content !~ /\Q$country\E/i) { | 
| 192 | 0 |  |  |  |  | 0 | $match = 0; | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  | } | 
| 195 | 29 | 100 |  |  |  | 102 | if($match) { | 
| 196 | 1 |  |  |  |  | 2 | push @{$self->{'results'}}, $link; | 
|  | 1 |  |  |  |  | 3 |  | 
| 197 |  |  |  |  |  |  | } | 
| 198 |  |  |  |  |  |  | } | 
| 199 | 1 |  |  |  |  | 7 | $self->{'index'}++; | 
| 200 | 1 | 50 |  |  |  | 6 | if($self->{'index'} <= $self->{'matches'}) { | 
| 201 | 1 |  |  |  |  | 2 | my $index = $self->{'index'}; | 
| 202 | 1 |  |  |  |  | 4 | $self->{'resp'} = $self->{'ua'}->get("$base&sr=$index"); | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  |  | 
| 205 | 1 |  |  |  |  | 339836 | return pop @{$self->{'results'}}; | 
|  | 1 |  |  |  |  | 11 |  | 
| 206 |  |  |  |  |  |  | } | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | =head1 AUTHOR | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | Nigel Horne, C<<  >> | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | =head1 BUGS | 
| 213 |  |  |  |  |  |  |  | 
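| 214 |  |  |  |  |  |  | Please report any bugs or feature requests to C<bug-www-scrape-findagrave at rt.cpan.org>, | 
| 215 |  |  |  |  |  |  | or through the web interface at | 
| 216 |  |  |  |  |  |  | L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WWW-Scrape-FindaGrave>. | 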
| 217 |  |  |  |  |  |  | I will be notified, and then you'll | 
| 218 |  |  |  |  |  |  | automatically be notified of progress on your bug as I make changes. | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | L | 
| 223 |  |  |  |  |  |  | L | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | =head1 SUPPORT | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | You can find documentation for this module with the perldoc command. | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | perldoc WWW::Scrape::FindaGrave | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | You can also look for information at: | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | =over 4 | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | =item * RT: CPAN's request tracker | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | L | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | =item * AnnoCPAN: Annotated CPAN documentation | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | L | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | =item * CPAN Ratings | 
| 245 |  |  |  |  |  |  |  | 
| 246 |  |  |  |  |  |  | L | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | =item * Search CPAN | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | L | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | =back | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | =head1 LICENSE AND COPYRIGHT | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | Copyright 2016 Nigel Horne. | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  | This program is released under the following licence: GPL | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | =cut | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  | 1; # End of WWW::Scrape::FindaGrave |