line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Golem; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
71630
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
43
|
|
4
|
|
|
|
|
|
|
# use warnings; |
5
|
1
|
|
|
1
|
|
1057
|
use HTML::TokeParser; |
|
1
|
|
|
|
|
13944
|
|
|
1
|
|
|
|
|
32
|
|
6
|
1
|
|
|
1
|
|
1078
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
57417
|
|
|
1
|
|
|
|
|
44
|
|
7
|
1
|
|
|
1
|
|
12
|
use HTTP::Request; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
8
|
1
|
|
|
1
|
|
969
|
use URI::URL; |
|
1
|
|
|
|
|
4643
|
|
|
1
|
|
|
|
|
952
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows declaration use WWW::Golem ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
HtmlLinkExtractor
getNews
Get
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

# NOTE(review): HtmlLinkExtractor/getNews/Get are called as object methods
# below; exporting them by default via @EXPORT is unusual -- confirm that
# callers actually import them before trimming this list.
our @EXPORT = qw(
HtmlLinkExtractor
getNews
Get
);
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
our $VERSION = '1.0';

# Start page of the crawl.
my $Url = "http://www.golem.de/";

# Only links to article pages (".html") are collected.
# BUGFIX: this used to be the double-quoted string "\.html", in which
# the backslash is consumed by string interpolation, leaving the regex
# /.html/ -- where the dot matches ANY character (so "xhtml" matched too).
# A compiled, case-insensitive pattern keeps the dot literal.
my $Regex = qr/\.html/i;

######
my $MaxFileSizeOfWebDocument = (50 * 1024 * 1024); # 50 MB cap passed to $UA->max_size (comment previously said "5mb")
my $MaxRedirectRequests      = 15;
my $AuthorEmail              = 'yourname@cpan.org';
my $Timeout                  = 25;                 # seconds, passed to $UA->timeout
my $CrawlDelay               = int(rand(3));       # 0-2 s politeness delay (currently unused; sleep is commented out in Get)
my $Referer                  = "http://www.google.com/";
my $DEBUG                    = 1;
######
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Construct a WWW::Golem object.
#
# Accepts either a single hash reference or a flat list of key/value
# pairs; the arguments become the object's initial attributes.
#
# NOTE(review): the constructor immediately crawls $Url over the network
# via _init() -- confirm this eager fetch is intended before reuse.
#
# FIX: dropped the empty "()" prototype.  Prototypes are ignored on
# method calls but would reject arguments on a direct
# WWW::Golem::new(...) call, so it was purely misleading.
sub new {
    my $class = shift;

    # Allow both new({ key => val }) and new(key => val).
    my %args = ref($_[0]) ? %{ $_[0] } : @_;

    my $self = bless \%args, $class;
    $self->_init();
    return $self;
} # sub new
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Private initialiser: fetch the start page and cache the extracted
# article links on the object under '_CONTENT_ARRAY_REF'.
# Returns the object to allow chaining.
sub _init(){
    my $self = shift;

    my $page  = $self->Get($Url);
    my $links = $self->HtmlLinkExtractor($page);

    $self->{'_CONTENT_ARRAY_REF'} = $links;
    return $self;
}; # sub _init(){
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Return the crawled article links as a hash reference keyed by URL,
# with the link text as the value.  Duplicate URLs are collapsed to
# their first occurrence.
#
# Entries in '_CONTENT_ARRAY_REF' are strings of the form
# "linktext ### url" (produced by HtmlLinkExtractor).
#
# FIX: dropped the empty "()" prototype (misleading; ignored on method
# calls but rejects args on direct calls).
sub getNews {
    my $self   = shift;
    my $ArrRef = $self->{'_CONTENT_ARRAY_REF'};

    # BUGFIX: these hashes were initialised with "= {}", which assigns a
    # single stringified hash-reference KEY (with an undef value) instead
    # of leaving the hash empty -- the bogus key even leaked into the
    # returned structure.  An empty list is the correct initialiser.
    my %NoDoubleLinks = ();
    my %ReturnLinks   = ();

    foreach my $entry ( @{ $ArrRef || [] } ) {   # tolerate a missing/undef link list
        my ($linkname, $url) = split(' ### ', $entry);
        next unless defined $url;                # skip malformed entries without ' ### '
        next if exists $NoDoubleLinks{$url};     # first occurrence wins
        $ReturnLinks{$url}   = $linkname;
        $NoDoubleLinks{$url} = 0;
    }

    return \%ReturnLinks;
} # sub getNews
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Preloaded methods go here. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# Extract all <a href> links from a page previously fetched by Get().
#
# Takes the hash reference Get() returns ({ OBJ => HTTP::Response,
# CNT => body }) and returns an array reference of strings of the form
# "linktext ### absolute-url", keeping only golem.de links whose URL is
# http(s) and matches $Regex.  Returns -1 when the argument does not
# carry a real HTTP::Response object.
sub HtmlLinkExtractor(){
    my $self     = shift;
    my $page     = shift;
    my $response = $page->{'OBJ'};
    my $html     = $page->{'CNT'};

    my @links = ();

    # Without a response object there is no base URL to resolve against.
    return -1 if ( ref($response) ne "HTTP::Response" );

    my $base   = $response->base;
    my $parser = HTML::TokeParser->new( \$html );

    while ( my $tag = $parser->get_tag("a") ) {

        my $href  = $tag->[1]{href};
        my $title = $tag->[1]{title};
        my $rel   = $tag->[1]{rel};
        my $text  = $parser->get_trimmed_text("/a");   # the visible link text

        $href = url($href, $base)->abs;                # resolve to an absolute URL
        chomp($href);
        chomp($text);

        # Stay on golem.de and keep only http(s) article links.
        next if ( $href !~ /golem\.de/ig );
        push(@links, "$text ### $href") if ( $href =~ /^(http)/i && $href =~ /$Regex/ig );
    }

    return \@links;
}; # sub HtmlLinkExtractor(){
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Fetch $url with an HTTP GET and return a hash reference holding the
# raw HTTP::Response object ('OBJ') and its body ('CNT').  On any
# failure the returned hash reference is simply empty.
# An optional second argument overrides the Referer header (defaults
# to the URL itself).
sub Get() {
    my $self    = shift;
    my $url     = shift;
    my $referer = shift || $url;

    my $result = {};

    my $ua = LWP::UserAgent->new( keep_alive => 1 );

    # Masquerade as a desktop browser; some sites serve bots differently.
    $ua->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    # $ua->agent("wget");

    # Limits and identification, all taken from the file-level config.
    $ua->timeout( $Timeout );
    $ua->max_size( $MaxFileSizeOfWebDocument );
    $ua->from( $AuthorEmail );
    $ua->max_redirect( $MaxRedirectRequests );
    $ua->parse_head( 1 );
    $ua->protocols_allowed( [ 'http', 'https', 'ftp', 'ed2k'] );
    $ua->protocols_forbidden( [ 'file', 'mailto'] );
    $ua->requests_redirectable( [ 'HEAD', 'GET', 'POST'] );

    # $ua->credentials( $netloc, $realm, $uname, $pass )
    # $ua->proxy(['http', 'ftp'], 'http://proxy.sn.no:8001/');  # use this proxy for the http and ftp schemes
    # $ua->env_proxy;  # honour gopher_proxy / wais_proxy / no_proxy environment variables

    # sleep $CrawlDelay;

    my $request = HTTP::Request->new( GET => $url );
    $request->referer($referer);

    my $response = $ua->request($request);

    if ( $response->is_success ) {
        $result->{ 'OBJ' } = $response;
        $result->{ 'CNT' } = $response->content;
    }

    return $result;
}; # sub Get() {
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
1; |
176
|
|
|
|
|
|
|
__END__ |