| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Heise; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 24465 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 53 |  | 
| 4 |  |  |  |  |  |  | # use warnings; | 
| 5 | 1 |  |  | 1 |  | 935 | use HTML::TokeParser; | 
|  | 1 |  |  |  |  | 14868 |  | 
|  | 1 |  |  |  |  | 40 |  | 
| 6 | 1 |  |  | 1 |  | 1342 | use LWP::UserAgent; | 
|  | 1 |  |  |  |  | 79536 |  | 
|  | 1 |  |  |  |  | 46 |  | 
| 7 | 1 |  |  | 1 |  | 12 | use HTTP::Request; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 28 |  | 
| 8 | 1 |  |  | 1 |  | 1257 | use URI::URL; | 
|  | 1 |  |  |  |  | 4537 |  | 
|  | 1 |  |  |  |  | 977 |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | require Exporter; | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | # Items to export into the caller's namespace by default. Note: do not export | 
| 16 |  |  |  |  |  |  | # names by default without a very good reason. Use EXPORT_OK instead. | 
| 17 |  |  |  |  |  |  | # Do not simply export all your public functions/methods/constants. | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | # This allows declaration	use WWW::Heise ':all'; | 
| 20 |  |  |  |  |  |  | # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK | 
| 21 |  |  |  |  |  |  | # will save memory. | 
| 22 |  |  |  |  |  |  | our %EXPORT_TAGS = ( 'all' => [ qw( | 
| 23 |  |  |  |  |  |  | HtmlLinkExtractor | 
| 24 |  |  |  |  |  |  | getNews | 
| 25 |  |  |  |  |  |  | Get | 
| 26 |  |  |  |  |  |  | ) ] ); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | our @EXPORT = qw( | 
| 31 |  |  |  |  |  |  | HtmlLinkExtractor | 
| 32 |  |  |  |  |  |  | getNews | 
| 33 |  |  |  |  |  |  | Get | 
| 34 |  |  |  |  |  |  | ); | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | our $VERSION 	= '1.0'; | 
| 37 |  |  |  |  |  |  | my $Url		= "http://www.heise.de/"; | 
| 38 |  |  |  |  |  |  | my $Regex	= "/newsticker/"; | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | ###### | 
| 41 |  |  |  |  |  |  | my $MaxFileSizeOfWebDocument	= (50 * 1024 * 1024);	# 50 MB | 
| 42 |  |  |  |  |  |  | my $MaxRedirectRequests		= 15; | 
| 43 |  |  |  |  |  |  | my $AuthorEmail			= 'yourname@cpan.org'; | 
| 44 |  |  |  |  |  |  | my $Timeout			= 25; | 
| 45 |  |  |  |  |  |  | my $CrawlDelay			= int(rand(3)); | 
| 46 |  |  |  |  |  |  | my $Referer			= "http://www.google.com/"; | 
| 47 |  |  |  |  |  |  | my $DEBUG			= 1; | 
| 48 |  |  |  |  |  |  | ###### | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | sub new(){ | 
| 52 |  |  |  |  |  |  |  | 
| 53 | 0 |  |  | 0 | 0 |  | my $class   	= shift; | 
| 54 | 0 | 0 |  |  |  |  | my %args 	= ref($_[0])?%{$_[0]}:@_; | 
|  | 0 |  |  |  |  |  |  | 
| 55 | 0 |  |  |  |  |  | my $self 	= \%args; | 
| 56 | 0 |  |  |  |  |  | bless $self, $class; | 
| 57 | 0 |  |  |  |  |  | $self->_init(); | 
| 58 | 0 |  |  |  |  |  | return $self; | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | }; # sub new(){ | 
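|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Illustrative sketch (not part of the measured module): new() takes its | 
|  |  |  |  |  |  |  | # arguments either as a flat key/value list or as a single hash reference, | 
|  |  |  |  |  |  |  | # stores them in $self, and immediately fetches http://www.heise.de/ via _init(). | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | #   use WWW::Heise; | 
|  |  |  |  |  |  |  | #   my $heise = WWW::Heise->new();   # no options are defined by the module itself | 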
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | sub _init(){ | 
| 64 |  |  |  |  |  |  |  | 
| 65 | 0 |  |  | 0 |  |  | my $self 	= shift; | 
| 66 | 0 |  |  |  |  |  | my $HashRef 	= $self->Get($Url); | 
| 67 | 0 |  |  |  |  |  | my $ArrRef	= $self->HtmlLinkExtractor($HashRef); | 
| 68 |  |  |  |  |  |  |  | 
| 69 | 0 |  |  |  |  |  | $self->{'_CONTENT_ARRAY_REF'} = $ArrRef; | 
| 70 | 0 |  |  |  |  |  | return $self; | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | }; # sub _init(){ | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | sub getNews(){ | 
| 76 |  |  |  |  |  |  |  | 
| 77 | 0 |  |  | 0 | 0 |  | my $self 		= shift; | 
| 78 | 0 |  |  |  |  |  | my $ArrRef 		= $self->{'_CONTENT_ARRAY_REF'}; | 
| 79 | 0 |  |  |  |  |  | my %NoDoubleLinks	= ();	# () not {}: a hash, not a hash reference | 
| 80 | 0 |  |  |  |  |  | my %ReturnLinks		= ();	# () not {}: a hash, not a hash reference | 
| 81 |  |  |  |  |  |  |  | 
| 82 | 0 |  |  |  |  |  | foreach my $entry ( @{$ArrRef} ){ | 
|  | 0 |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  |  | 
| 84 | 0 |  |  |  |  |  | my ($linkname, $url) = split(' ### ', $entry ); | 
| 85 | 0 | 0 |  |  |  |  | if ( !exists $NoDoubleLinks{$url} ) { | 
| 86 | 0 |  |  |  |  |  | $ReturnLinks{$url} = $linkname; | 
| 87 | 0 |  |  |  |  |  | $NoDoubleLinks{$url} = 0; | 
| 88 |  |  |  |  |  |  | }; | 
| 89 |  |  |  |  |  |  | }; # foreach my $entry ( @{$ArrRef} ){ | 
| 90 |  |  |  |  |  |  |  | 
| 91 | 0 |  |  |  |  |  | return \%ReturnLinks; | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | }; # sub getNews(){ | 
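|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Illustrative sketch (not part of the measured module), continuing the one | 
|  |  |  |  |  |  |  | # above: getNews() returns a hash reference that maps each newsticker URL to | 
|  |  |  |  |  |  |  | # its link text, with duplicate URLs removed. | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | #   my $news = $heise->getNews(); | 
|  |  |  |  |  |  |  | #   while ( my ($url, $title) = each %{$news} ) { | 
|  |  |  |  |  |  |  | #       print "$title\n\t$url\n"; | 
|  |  |  |  |  |  |  | #   } | 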
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | # Preloaded methods go here. | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | sub HtmlLinkExtractor(){ | 
| 99 |  |  |  |  |  |  |  | 
| 100 | 0 |  |  | 0 | 0 |  | my $self			= shift; | 
| 101 | 0 |  |  |  |  |  | my $HashRef			= shift; | 
| 102 | 0 |  |  |  |  |  | my $ResponseObj			= $HashRef->{'OBJ'}; | 
| 103 | 0 |  |  |  |  |  | my $PageContent			= $HashRef->{'CNT'}; | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 0 |  |  |  |  |  | my @ReturnLinks			= (); | 
| 106 |  |  |  |  |  |  |  | 
| 107 | 0 | 0 |  |  |  |  | return -1 if ( ref($ResponseObj) ne "HTTP::Response" ); | 
| 108 |  |  |  |  |  |  |  | 
| 109 | 0 |  |  |  |  |  | my $base			= $ResponseObj->base; | 
| 110 | 0 |  |  |  |  |  | my $TokenParser		= HTML::TokeParser->new( \$PageContent ); | 
| 111 |  |  |  |  |  |  |  | 
| 112 | 0 |  |  |  |  |  | while ( my $token	= $TokenParser->get_tag("a")) { | 
| 113 |  |  |  |  |  |  |  | 
| 114 | 0 |  |  |  |  |  | my $url		= $token->[1]{href}; | 
| 115 | 0 |  |  |  |  |  | my $linktitle	= $token->[1]{title}; | 
| 116 | 0 |  |  |  |  |  | my $rel		= $token->[1]{rel}; | 
| 117 | 0 |  |  |  |  |  | my $text	= $TokenParser->get_trimmed_text("/a");	# $text holds the link text | 
| 118 | 0 |  |  |  |  |  | $url		= url($url, $base)->abs;	# holds the absolute form of the URL currently being processed | 
| 119 |  |  |  |  |  |  |  | 
| 120 | 0 |  |  |  |  |  | chomp($url); chomp($text); | 
|  | 0 |  |  |  |  |  |  | 
| 121 | 0 | 0 | 0 |  |  |  | push(@ReturnLinks, "$text ### $url") if ( $url =~ /^http/i && $url =~ /$Regex/i ); | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | }; # while ( my $token = $TokenParser->get_tag("a")) { | 
| 124 |  |  |  |  |  |  |  | 
| 125 | 0 |  |  |  |  |  | return \@ReturnLinks; | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | }; # sub HtmlLinkExtractor(){ | 
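|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Illustrative sketch (not part of the measured module): HtmlLinkExtractor() | 
|  |  |  |  |  |  |  | # expects the hash reference produced by Get() and returns a reference to a | 
|  |  |  |  |  |  |  | # list of "link text ### absolute URL" strings for links matching $Regex. | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | #   my $page  = $heise->Get("http://www.heise.de/"); | 
|  |  |  |  |  |  |  | #   my $links = $heise->HtmlLinkExtractor($page); | 
|  |  |  |  |  |  |  | #   die "fetch failed\n" if !ref $links;	# -1 when Get() did not succeed | 
|  |  |  |  |  |  |  | #   foreach my $entry ( @{$links} ) { | 
|  |  |  |  |  |  |  | #       my ($text, $url) = split(' ### ', $entry); | 
|  |  |  |  |  |  |  | #       print "$text => $url\n"; | 
|  |  |  |  |  |  |  | #   } | 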
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | sub Get() { | 
| 131 |  |  |  |  |  |  |  | 
| 132 | 0 |  |  | 0 | 0 |  | my $self	= shift; | 
| 133 | 0 |  |  |  |  |  | my $url		= shift; | 
| 134 | 0 |  | 0 |  |  |  | my $referer	= shift || $url; | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 0 |  |  |  |  |  | my $StatusHashRef = {}; | 
| 137 |  |  |  |  |  |  |  | 
| 138 | 0 |  |  |  |  |  | my $UA		= LWP::UserAgent->new( keep_alive => 1 ); | 
| 139 |  |  |  |  |  |  |  | 
| 140 | 0 |  |  |  |  |  | $UA->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"); | 
| 141 |  |  |  |  |  |  | #	$UA->agent("wget"); | 
| 142 | 0 |  |  |  |  |  | $UA->timeout( $Timeout ); | 
| 143 | 0 |  |  |  |  |  | $UA->max_size( $MaxFileSizeOfWebDocument ); | 
| 144 | 0 |  |  |  |  |  | $UA->from( $AuthorEmail ); | 
| 145 | 0 |  |  |  |  |  | $UA->max_redirect( $MaxRedirectRequests ); | 
| 146 | 0 |  |  |  |  |  | $UA->parse_head( 1 ); | 
| 147 | 0 |  |  |  |  |  | $UA->protocols_allowed( [ 'http', 'https', 'ftp', 'ed2k'] ); | 
| 148 | 0 |  |  |  |  |  | $UA->protocols_forbidden( [ 'file', 'mailto'] ); | 
| 149 | 0 |  |  |  |  |  | $UA->requests_redirectable( [ 'HEAD', 'GET', 'POST'] ); | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | #	$ua->credentials( $netloc, $realm, $uname, $pass ) | 
| 152 |  |  |  |  |  |  | #	$ua->proxy(['http', 'ftp'], 'http://proxy.sn.no:8001/');	# use this proxy for the http and ftp protocol schemes | 
| 153 |  |  |  |  |  |  | #	$ua->env_proxy;	# read proxy settings from environment variables such as http_proxy, wais_proxy, gopher_proxy and no_proxy | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | # sleep $CrawlDelay; | 
| 156 |  |  |  |  |  |  |  | 
| 157 | 0 |  |  |  |  |  | my $req = HTTP::Request->new( GET => $url ); | 
| 158 | 0 |  |  |  |  |  | $req->referer($referer); | 
| 159 |  |  |  |  |  |  |  | 
| 160 | 0 |  |  |  |  |  | my $res = $UA->request($req); | 
| 161 |  |  |  |  |  |  |  | 
| 162 | 0 | 0 |  |  |  |  | if ( $res->is_success ) { | 
| 163 |  |  |  |  |  |  |  | 
| 164 | 0 |  |  |  |  |  | $StatusHashRef->{ 'OBJ' } = $res; | 
| 165 | 0 |  |  |  |  |  | $StatusHashRef->{ 'CNT' } = $res->content; | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | }; # if ($res->is_success) { | 
| 168 |  |  |  |  |  |  |  | 
| 169 | 0 |  |  |  |  |  | return $StatusHashRef; | 
| 170 |  |  |  |  |  |  |  | 
| 171 |  |  |  |  |  |  | }; # sub Get() { | 
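|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Illustrative sketch (not part of the measured module): Get() returns an | 
|  |  |  |  |  |  |  | # empty hash reference on failure; on success it fills in the keys 'OBJ' | 
|  |  |  |  |  |  |  | # (the HTTP::Response object) and 'CNT' (the page body). | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | #   my $result = $heise->Get("http://www.heise.de/newsticker/"); | 
|  |  |  |  |  |  |  | #   if ( exists $result->{'OBJ'} && $result->{'OBJ'}->is_success ) { | 
|  |  |  |  |  |  |  | #       print length($result->{'CNT'}), " bytes fetched\n"; | 
|  |  |  |  |  |  |  | #   } | 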
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | 1; | 
| 175 |  |  |  |  |  |  | __END__ |