File Coverage

blib/lib/WWW/Google/News.pm

Criterion	Covered	Total	%
statement	37	93	39.7
branch	2	12	16.6
condition	1	2	50.0
subroutine	11	16	68.7
pod			n/a
total	51	123	41.4

line	stmt	bran	cond	sub	time	code
1
2						package WWW::Google::News;
3
4	1			1	26244	use strict;
	1				2
	1				44
5	1			1	8	use warnings;
	1				3
	1				79
6
7						require Exporter;
8
9						our @ISA = qw(Exporter);
10						our @EXPORT_OK = qw(get_news get_news_greg_style get_news_for_topic);
11						our $VERSION = '0.12';
12
13	1			1	5	use Carp;
	1				6
	1				111
14	1			1	165727	use LWP;
	1				2774326
	1				95
15	1			1	10	use URI::Escape;
	1				1
	1				1044
16
17						sub new {
18	1			1	12	my $pkg = shift;
19	1				2	my $self = {};
20	1				3	bless $self, $pkg;
21	1	50			5	if (! $self->init(@_)) {
22	0				0	return undef;
23						}
24	1				3	return $self;
25						}
26
27						sub init {
28	1			1	2	my $self = shift;
29	1	50			4	my $args = (ref($_[0]) eq "HASH") ? shift : {@_};
30	1				7	$self->{'_topic'} = $args->{'topic'};
31	1				2	$self->{'_start_date'} = $args->{'start_date'};
32	1				3	$self->{'_end_date'} = $args->{'end_date'};
33	1				2	$self->{'_sort'} = $args->{'sort'};
34	1		50		10	$self->{'_max'} = $args->{'max'} \|\| 20;
35
36	1				4	return 1;
37						}
38
39						sub topic {
40	1			1	5	my $self = shift;
41	1				2	$self->{'_topic'} = shift;
42	1				2	return $self->{'_topic'};
43						}
44
45						sub start_date {
46	0			0	0	my $self = shift;
47	0				0	$self->{'_start_date'} = shift;
48	0				0	return $self->{'_start_date'};
49						}
50
51						sub end_date {
52	0			0	0	my $self = shift;
53	0				0	$self->{'_end_date'} = shift;
54	0				0	return $self->{'_end_date'};
55						}
56
57						sub sort {
58	0			0	0	my $self = shift;
59	0				0	$self->{'_sort'} = shift;
60	0				0	return $self->{'_sort'};
61						}
62
63						sub max {
64	1			1	5	my $self = shift;
65	1				2	$self->{'_max'} = shift;
66	1				3	return $self->{'_max'};
67						}
68
69						sub search {
70	1			1	4	my $self = shift;
71	1				184	return get_news_for_topic($self->{'_topic'},$self->{'_start_date'},$self->{'_end_date'},$self->{'_sort'},$self->{'_max'});
72						}
73
74						sub get_news {
75	0			0		my $url = 'http://news.google.com/news/gnmainlite.html';
76	0					my $ua = LWP::UserAgent->new;
77	0					$ua->agent('Mozilla/5.0');
78	0					my $response = $ua->get($url);
79	0	0				return unless $response->is_success;
80	0					my $content = $response->content;
81	0					my $results = {};
82
83	0					my $re1 = '	]> (.?)
84	0					my $re2 = '

]+)"?[^>]*>(.+?)
]+>]+>(.*?)(.*?)
]+>(.+?)\s*...\s*'; 85 86 0 my @sections = split /($re1)/im,$content; 87 0 my $current_section = ''; 88 0 foreach my $section (@sections) { 89 0 0 if ($section =~ m/$re1/im) { 90 0 $current_section = $1; 91 #print STDERR $1,"\n"; 92 } else { 93 0 my @stories = split /($re2)/mi,$section; 94 0 foreach my $story (@stories) { 95 0 0 if ($story =~ m/$re2/mi) { 96 0 0 if (!(exists($results->{$current_section}))) { 97 0 $results->{$current_section} = []; 98 } 99 0 my $story_h = {}; 100 0 my( $url, $headline, $source, $date, $summary ) = ( $1, $2, $3, $4, $5 ); 101 102 0 _clean_string($source); 103 0 _clean_string($headline); 104 0 _clean_string($date); 105 0 _clean_string($summary); 106 107 0 $story_h->{url} = $url; 108 0 $story_h->{headline} = $headline; 109 0 $story_h->{source} = $source; 110 0 $story_h->{date} = $date; 111 0 $story_h->{description} = "$source: $summary"; 112 0 $story_h->{summary} = $summary; 113 114 0 push(@{$results->{$current_section}},$story_h); 0 115 } 116 } 117 } 118 } 119 #print STDERR Dumper($results); 120 0 return $results; 121 } 122 123 124 sub get_news_greg_style { 125 0 0 my $results = get_news(); 126 0 my $greg_results = {}; 127 0 foreach my $section (keys(%$results)) { 128 0 $greg_results->{$section} = {}; 129 0 my $cnt = 0; 130 0 foreach my $story_h (@{$results->{$section}}) { 0 131 0 $cnt++; 132 0 $greg_results->{$section}->{$cnt} = $story_h; 133 } 134 } 135 0 return $greg_results; 136 } 137 138 sub get_news_for_topic { 139 140 my $topic = uri_escape( $_[0] ); 141 my $start_date = $_[1] || ""; 142 my $end_date = $_[2] || ""; 143 my $sort= $_[3] || ""; 144 my $max = $_[4] || 20; 145 146 my $url = "http://news.google.com/news?hl=en&edition=us&q=$topic"; 147 my $url_start; 148 my $url_end; 149 if ($start_date =~ /(^|-)(\d{1,2})-(\d{1,2})$/) { 150 $url_start = "&as_mind=$3&as_minm=$2"; 151 } 152 if ($end_date =~ /(^|-)(\d{1,2})-(\d{1,2})$/) { 153 $url_end = "&as_maxd=$3&as_maxm=$2"; 154 } 155 if ($url_start && $url_end) { 156 $url .= "&as_drrb=b" . $url_start . $url_end; 157 } 158 159 if (lc($sort) eq "date" || ($sort eq "" && $url_start && $url_end)) { 160 $url .= "&scoring=d"; 161 } 162 163 my @results = (); 164 165 my %URL; 166 $URL{"0"} = 1; 167 my $flag = 1; 168 169 my $page_size = 100; 170 171 if ($max <= 0) { 172 $page_size = 100; 173 } elsif ($max <= 50) { 174 $page_size = 50; 175 } elsif ($max <= 20) { 176 $page_size = 20; 177 } 178 179 1 1 410 use XML::Atom::Client; 0 0 0 0 180 my $api = XML::Atom::Client->new; 181 my $feed = $api->getFeed($url."&output=atom"); 182 my @entries = $feed->entries; 183 foreach my $e (@entries) { 184 my $headline = $e->title(); 185 my $source = ""; 186 if ($headline =~ s/ - (.+)$//) { 187 $source = $1; 188 } 189 my $date = $e->issued(); 190 my $summary = $e->content()->body(); 191 _clean_string($summary); 192 $summary =~ s/^.+? \.\.\. //; 193 $summary =~ s/ \.\.\..*?$//; 194 my $story_h; 195 $story_h->{url} = $e->link()->href(); 196 $story_h->{headline} = $headline; 197 $story_h->{source} = $source; 198 $story_h->{date} = $date; 199 $story_h->{description} = "$source: $summary"; 200 $story_h->{summary} = $summary; 201 push(@results,$story_h); 202 } 203 204 MAIN: while(0) { 205 $flag = 0; 206 foreach my $u (sort {$a<=>$b} keys %URL) { 207 next unless $URL{$u}; 208 209 $flag = 1; 210 211 my $ua = LWP::UserAgent->new(); 212 my $request = HTTP::Request->new(GET => $url."&start=$u"); 213 $request->header("Cookie","PREF=ID=3559860f31fac0d3:LD=en:NR=$page_size:TM=1109341032:LM=1113500592:GM=1:S=gRr8PQTzjZ9uhb-z; domain=.google.com; expires=Sun, Jan 17, 2038 1:14:20 PM; path=/"); 214 $ua->timeout(30); 215 $ua->agent('Mozilla/5.0'); 216 my $response = $ua->request($request); 217 return unless $response->is_success; 218 my $content = $response->content; 219 220 if (!$content) { 221 sleep 5; next; 222 } 223 224 $URL{$u} = 0; 225 226 my $re1 = '
]*>]+>(.+)

]+)"?[^>]*>(.+?)
]+>]+>([^<]*?)(.*?)
]+>(.+?)\s*...\s*'; 228 229 my @page_links = split /(\&start=\d+>)/mi,$content; 230 foreach my $pl (@page_links) { 231 if ($pl =~ /\&start=(\d+)>/) { 232 if (!exists($URL{$1})) { 233 $URL{$1} = 1; 234 } 235 } 236 } 237 my( $section ) = ( $content =~ m/$re1/s ) or next; 238 $section =~ s/\n//g; 239 my @stories = split /($re2)/mi,$section; 240 foreach my $story (@stories) { 241 if ($story =~ m/$re2/i) { 242 my $story_h = {}; 243 my( $url, $headline, $source, $date, $summary ) = ( $1, $2, $3, $4, $5 ); 244 245 _clean_string($source); 246 _clean_string($headline); 247 _clean_string($date); 248 _clean_string($summary); 249 250 $story_h->{url} = $url; 251 $story_h->{headline} = $headline; 252 $story_h->{source} = $source; 253 $story_h->{date} = $date; 254 $story_h->{description} = "$source: $summary"; 255 $story_h->{summary} = $summary; 256 257 push(@results,$story_h); 258 last MAIN if $max>0 && scalar(@results)>=$max; 259 } 260 } 261 } 262 263 last MAIN unless $flag; 264 } 265 return \@results; 266 267 } 268 269 sub _clean_string { 270 $_[0] =~ s/ / /ig; 271 $_[0] =~ s/"/"/ig; 272 $_[0] =~ s/&/&/ig; 273 $_[0] =~ s/'/'/g; 274 $_[0] =~ s/
/ /ig; 275 $_[0] =~ s/<[^>]+>//g; 276 $_[0] =~ s/\s*-?\s*$//; 277 $_[0] =~ s/^\s+//; 278 } 279 280 1; 281 282 __END__