blib/lib/AnyEvent/WebArchive.pm

Criterion | Covered | Total | %
---|---|---|---
statement | 24 | 67 | 35.8
branch | 5 | 26 | 19.2
condition | 0 | 11 | 0.0
subroutine | 6 | 11 | 54.5
pod | 1 | 1 | 100.0
total | 36 | 116 | 31.0
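
This is a Devel::Cover per-file report (the format produced by running a distribution's tests under Devel::Cover, e.g. `HARNESS_PERL_SWITCHES=-MDevel::Cover make test` followed by `cover`). The listing below shows where the missing coverage lives: only the compile-time `use` statements and the `_normalize` path carry non-zero counts, while `restore_url` and its helpers were never called by the test suite. The module's one public sub is documented, hence 100% pod coverage.
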
line | stmt | bran | cond | sub | pod | time | code
---|---|---|---|---|---|---|---
1 | | | | | | | package AnyEvent::WebArchive;
2 | | | | | | |
3 | 2, 2, 2 | | | 2 | | 41961, 6, 100 | use strict;
4 | 2, 2, 2 | | | 2 | | 7042, 142919, 233 | use AnyEvent::HTTP;
5 | 2, 2, 2 | | | 2 | | 16017, 21687, 161 | use Data::Dumper;
6 | 2, 2, 2 | | | 2 | | 17, 6, 2673 | use base 'Exporter';
7 | | | | | | | our $VERSION = '0.02';
8 | | | | | | |
9 | | | | | | | our @EXPORT = qw(restore_url);
10 | | | | | | | my $DEBUG = 0;
11 | | | | | | | sub restore_url {
12 | 0 | | | 0 | 1 | 0 | my $url = shift;
13 | 0 | | | | | 0 | my $cb = pop;
14 | | | | | | |
15 | 0 | | | | | 0 | $url =~ s/^www\.//;
16 | 0 | 0 | | | | 0 | my $opt = ref $_[0] ? $_[0] : {@_};
17 | | | | | | |
18 | 0 | | 0 | | | 0 | $AnyEvent::HTTP::USERAGENT = $opt->{'user_agent'} \|\| 'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.5.24 Version/10.52';
19 | 0 | | 0 | | | 0 | $AnyEvent::HTTP::MAX_PER_HOST \|\|= $opt->{'max_per_host'};
20 | 0 | | 0 | | | 0 | $AnyEvent::HTTP::ACTIVE \|\|= $opt->{'active'};
21 | | | | | | |
22 | 0 | | | | | 0 | my $count;
23 | 0 | | | | | 0 | my $worker = {};
24 | 0 | | | | | 0 | bless $worker, __PACKAGE__;
25 | 0 | | | | | 0 | $worker->{'domain'} = $url;
26 | | | | | | | http_get _search($url), sub {
27 | 0 | | | 0 | | 0 | $url = $url;
28 | 0 | | 0 | | | 0 | $DEBUG && warn "GET $url\n";
29 | 0 | | | | | 0 | my ($body, $headers) = @_;
30 | | | | | | |
31 | 0, 0, 0 | | | | | 0, 0, 0 | for my $job (grep { $_->[0] } # XXX
32 | 0 | | | | | 0 | map { [ /href="([^"]+)"/sg, />([^<]+)<\/a>/sg ] } map { split /( ){2}/ }
33 | | | | | | | $body =~ m{(.*?)}si
34 | | | | | | | ) {
35 | 0 | | 0 | | | 0 | $DEBUG && warn "GET $job->[0]\n";
36 | 0 | | | | | 0 | $count++;
37 | | | | | | | http_get $job->[0], sub {
38 | 0 | | | | | 0 | my ($body, $headers) = @_;
39 | 0 | 0 | | | | 0 | if ($headers->{'Status'} == 200) {
40 | 0 | | | | | 0 | $worker->_save_file($job->[1], $body);
41 | | | | | | | } else {
42 | 0 | | | | | 0 | warn "Bad status for url $job->[0]: $_" for Dumper($headers);
43 | | | | | | | }
44 | | | | | | |
45 | 0 | 0 | | | | 0 | $cb->() unless --$count;
46 | | | | | | | }
47 | 0 | | | | | 0 | }
48 | | | | | | | }
49 | 0 | | | | | 0 | }
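
`restore_url` takes the target domain first, its completion callback last (`my $cb = pop`), and options either as a hashref or a flat key/value list in between. Below is a minimal sketch of driving it from an AnyEvent condvar; the domain and option values are illustrative, not taken from the report:

```perl
use AnyEvent;
use AnyEvent::WebArchive;        # exports restore_url

my $cv = AnyEvent->condvar;

# 'example.com' and both option values are placeholders.
restore_url 'example.com',
    { user_agent => 'Mozilla/5.0', max_per_host => 4 },
    sub { $cv->send };           # fires once --$count reaches zero

$cv->recv;                       # run the event loop until then
```

Note that `$count` is only incremented inside the job loop, so if the archive listing yields no downloadable links the callback never fires.
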
line | stmt | bran | cond | sub | pod | time | code
---|---|---|---|---|---|---|---
50 | | | | | | |
51 | | | | | | | sub _filename {
52 | 0 | | | 0 | | 0 | my $str = shift;
53 | | | | | | |
54 | 0 | | | | | 0 | $str =~ s/[^a-z\.\,\s\;-]/_/sig;
55 | | | | | | |
56 | 0 | | | | | 0 | return $str;
57 | | | | | | | }
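
The substitution on line 54 keeps only letters, dots, commas, whitespace, semicolons and hyphens (case-insensitively, via `/i`) and flattens everything else to underscores. A quick illustration with a made-up name:

```perl
use AnyEvent::WebArchive;

# '?', '=' and the digits all fall outside the allowed class.
print AnyEvent::WebArchive::_filename('index.php?id=42');   # index.php_id___
```
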
line | stmt | bran | cond | sub | pod | time | code
---|---|---|---|---|---|---|---
58 | | | | | | |
59 | | | | | | | sub _search {
60 | 0 | | | 0 | | 0 | return "http://web.archive.org/web/*sr_1nr_10000/$_*" for shift;
61 | | | | | | | }
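
The `return ... for shift` idiom on line 60 aliases `$_` to the first argument so it can be interpolated into the string, turning a bare domain into a Wayback Machine listing URL:

```perl
use AnyEvent::WebArchive;

# Prints: http://web.archive.org/web/*sr_1nr_10000/example.com*
print AnyEvent::WebArchive::_search('example.com'), "\n";
```
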
line | stmt | bran | cond | sub | pod | time | code
---|---|---|---|---|---|---|---
62 | | | | | | |
63 | | | | | | | sub _save_file {
64 | 0 | | | 0 | | 0 | my ($worker, $url, $body) = @_;
65 | | | | | | |
66 | 0 | | | | | 0 | $url = $worker->{'domain'} . $worker->_normalize_url($url);
67 | | | | | | |
68 | 0 | | | | | 0 | my $path;
69 | 0 | | | | | 0 | for (split /\//, $url) {
70 | 0 | 0 | 0 | | | 0 | last if /^\?/ \|\| $url =~ /$_$/;
71 | 0 | | | | | 0 | $path .= "$_/";
72 | 0 | | 0 | | | 0 | $DEBUG && warn "mkdir $path\n";
73 | 0 | | | | | 0 | mkdir $path;
74 | | | | | | | }
75 | | | | | | |
76 | 0 | 0 | | | | 0 | return warn "file $url already exists, skipping\n" if -e $url;
77 | 0 | | 0 | | | 0 | $DEBUG && warn "writing $url\n";
78 | | | | | | |
79 | | | | | | |
80 | 0 | | 0 | | | 0 | open my $fh, '>', $url or warn "$!: $url";
81 | 0 | | | | | 0 | print $fh $worker->_normalize($body);
82 | | | | | | | }
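
Lines 69–74 recreate the URL's directory structure one prefix at a time, calling `mkdir` for each and stopping at a query string or at the final component. A standalone sketch of the same walk, with an invented URL:

```perl
# Mirrors the directory walk in _save_file; the URL is made up.
my $url  = 'example.com/images/logo.png';
my $path = '';
for (split /\//, $url) {
    last if /^\?/ || $url =~ /$_$/;   # stop at '?...' or the trailing filename
    $path .= "$_/";                   # example.com/, then example.com/images/
    mkdir $path;
}
```

Because `$_` is interpolated into the pattern unescaped, the stop test can fire early whenever an intermediate directory happens to be a suffix of the whole URL.
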
line | stmt | bran | cond | sub | pod | time | code
---|---|---|---|---|---|---|---
83 | | | | | | |
84 | | | | | | | sub _normalize {
85 | 1 | | | 1 | | 3 | my ($worker, $body) = @_;
86 | | | | | | |
87 | 1, 1 | | | | | 11, 4 | $body =~ s/(?<=href=")([^"]+)(?=")/$worker->_normalize_url($1)/sieg;
88 | 1 | | | | | 8 | $body =~ s{(?<= ).*(?=
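
Line 87 rewrites every `href` attribute through `_normalize_url`, which is not shown in this listing (the report breaks off at line 88). To illustrate just the mechanics of the `/sieg` substitution, the sketch below uses `_strip`, a hypothetical stand-in for `$worker->_normalize_url`:

```perl
# _strip is a hypothetical stand-in for _normalize_url: it removes a
# web.archive.org prefix so the link points at the original site again.
sub _strip {
    my $u = shift;
    $u =~ s{^http://web\.archive\.org/web/[^/]+/}{};
    return $u;
}

my $body = '<a href="http://web.archive.org/web/2010/http://example.com/">home</a>';
$body =~ s/(?<=href=")([^"]+)(?=")/_strip($1)/sieg;
print $body;   # <a href="http://example.com/">home</a>
```
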