package WWW::Newsgrabber; |

use strict;
use warnings;

use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use URI::URL;

require Exporter;

our @ISA = qw(Exporter);

# Items to export into the caller's namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows the declaration: use WWW::Newsgrabber ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
    HtmlLinkExtractor
    getNews
    Get
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw(
    HtmlLinkExtractor
    getNews
    Get
);

our $VERSION = '1.0';

######
my $MaxFileSizeOfWebDocument = (50 * 1024 * 1024);    # 50 MB cap for fetched documents
my $MaxRedirectRequests      = 15;
my $AuthorEmail              = 'yourname@cpan.org';
my $Timeout                  = 25;
my $CrawlDelay               = int(rand(3));          # random 0-2 second pause between requests
my $Referer                  = "http://www.google.com/";
my $DEBUG                    = 1;
######

sub new {
    my $class = shift;
    my %args  = ref($_[0]) ? %{ $_[0] } : @_;    # accept a hashref or a flat key/value list
    my $self  = \%args;
    bless $self, $class;
    $self->_init();
    return $self;
}; # sub new {

sub _init {
    my $self = shift;
    my $Url  = $self->{'url'} || die "WWW::Newsgrabber::_init(): no url to scan given!\n";

    my $HashRef = $self->Get($Url);
    my $ArrRef  = $self->HtmlLinkExtractor($HashRef);

    $self->{'_CONTENT_ARRAY_REF'} = $ArrRef;
    return $self;
}; # sub _init {

sub getNews {
    my $self   = shift;
    my $ArrRef = $self->{'_CONTENT_ARRAY_REF'};
    my %NoDoubleLinks = ();    # URLs already seen
    my %ReturnLinks   = ();    # url => link text

    foreach my $entry ( @{$ArrRef} ) {
        my ($linkname, $url) = split(' ### ', $entry);
        if ( !exists $NoDoubleLinks{$url} ) {
            $ReturnLinks{$url}   = $linkname;
            $NoDoubleLinks{$url} = 0;
        };
    }; # foreach my $entry ( @{$ArrRef} ) {

    return \%ReturnLinks;
}; # sub getNews {

# Preloaded methods go here.

sub HtmlLinkExtractor {
    my $self  = shift;
    my $Regex = $self->{'regex'}
        || warn "WWW::Newsgrabber::HtmlLinkExtractor(): no regex given!\n";
    my $HashRef     = shift;
    my $ResponseObj = $HashRef->{'OBJ'};
    my $PageContent = $HashRef->{'CNT'};

    my @ReturnLinks = ();

    # nothing to extract unless the fetch produced a valid response object
    return \@ReturnLinks if ( ref($ResponseObj) ne "HTTP::Response" );

    my $base        = $ResponseObj->base;
    my $TokenParser = HTML::TokeParser->new( \$PageContent );

    while ( my $token = $TokenParser->get_tag("a") ) {
        my $url       = $token->[1]{href};
        my $linktitle = $token->[1]{title};
        my $rel       = $token->[1]{rel};
        my $text      = $TokenParser->get_trimmed_text("/a");    # $text = link title

        next unless defined $url;    # skip anchors without an href attribute
        $url = url($url, $base)->abs->as_string;    # absolute form of the URL currently being processed

        chomp($url); chomp($text);
        push(@ReturnLinks, "$text ### $url") if ( $url =~ /^(http)/i && $url =~ /$Regex/i );
    }; # while ( my $token = $TokenParser->get_tag("a") ) {

    return \@ReturnLinks;
}; # sub HtmlLinkExtractor {

sub Get {
    my $self    = shift;
    my $url     = shift;
    my $referer = shift || $url;

    my $StatusHashRef = {};

    my $UA = LWP::UserAgent->new( keep_alive => 1 );

    $UA->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    # $UA->agent("wget");
    $UA->timeout( $Timeout );
    $UA->max_size( $MaxFileSizeOfWebDocument );
    $UA->from( $AuthorEmail );
    $UA->max_redirect( $MaxRedirectRequests );
    $UA->parse_head( 1 );
    $UA->protocols_allowed( [ 'http', 'https', 'ftp', 'ed2k' ] );
    $UA->protocols_forbidden( [ 'file', 'mailto' ] );
    $UA->requests_redirectable( [ 'HEAD', 'GET', 'POST' ] );

    # $UA->credentials( $netloc, $realm, $uname, $pass );
    # $UA->proxy( ['http', 'ftp'], 'http://proxy.sn.no:8001/' );    # use this proxy for the http and ftp schemes
    # $UA->env_proxy;    # reads http_proxy, wais_proxy, no_proxy, etc. from the environment

    # sleep $CrawlDelay;

    my $req = HTTP::Request->new( GET => $url );
    $req->referer($referer);

    my $res = $UA->request($req);

    if ( $res->is_success ) {
        $StatusHashRef->{'OBJ'} = $res;
        $StatusHashRef->{'CNT'} = $res->content;
    }; # if ( $res->is_success ) {

    return $StatusHashRef;
}; # sub Get {
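
# A sketch of calling Get() directly, assuming $grabber is an already
# constructed WWW::Newsgrabber object (the URL below is made up). On success
# the returned hash reference holds the HTTP::Response object under 'OBJ'
# and the raw page body under 'CNT'; on failure it stays empty:
#
#   my $page = $grabber->Get('http://example.com/');
#   if ( exists $page->{'OBJ'} ) {
#       printf "fetched %d bytes\n", length $page->{'CNT'};
#   }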

1;

__END__
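
=head1 NAME

WWW::Newsgrabber - extract news links from a web page

=head1 SYNOPSIS

A minimal usage sketch; the C<url> and C<regex> values are hypothetical.
getNews() returns a hash reference mapping each matched URL to its link
text:

    use WWW::Newsgrabber;

    my $grabber = WWW::Newsgrabber->new(
        url   => 'http://example.com/news/',
        regex => 'article',
    );

    my $links = $grabber->getNews();
    while ( my ($url, $title) = each %{$links} ) {
        print "$title => $url\n";
    }

=cut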