| blib/lib/WWW/Scraper/Yahoo360.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 156 | 239 | 65.2 |
| branch | 34 | 78 | 43.5 |
| condition | 9 | 14 | 64.2 |
| subroutine | 17 | 23 | 73.9 |
| pod | 11 | 12 | 91.6 |
| total | 227 | 366 | 62.0 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | # | ||||||
| 2 | # Ignorant Yahoo 360 blog scraper (blog.360.yahoo.com) | ||||||
| 3 | # | ||||||
| 4 | # $Id: Yahoo360.pm 168 2009-05-31 11:51:37Z cosimo $ | ||||||
| 5 | |||||||
| 6 | package WWW::Scraper::Yahoo360; | ||||||
| 7 | |||||||
| 8 | 1 | 1 | 27952 | use strict; | |||
| 1 | 2 | ||||||
| 1 | 41 | ||||||
| 9 | 1 | 1 | 6 | use warnings; | |||
| 1 | 2 | ||||||
| 1 | 32 | ||||||
| 10 | |||||||
| 11 | 1 | 1 | 7 | use Carp (); | |||
| 1 | 6 | ||||||
| 1 | 17 | ||||||
| 12 | 1 | 1 | 5947 | use Date::Parse (); | |||
| 1 | 10460 | ||||||
| 1 | 28 | ||||||
| 13 | 1 | 1 | 1049 | use File::Slurp (); | |||
| 1 | 27033 | ||||||
| 1 | 28 | ||||||
| 14 | 1 | 1 | 1293 | use HTTP::Date (); | |||
| 1 | 1843 | ||||||
| 1 | 24 | ||||||
| 15 | 1 | 1 | 1054 | use JSON::XS (); | |||
| 1 | 10854 | ||||||
| 1 | 28 | ||||||
| 16 | 1 | 1 | 1466 | use WWW::Mechanize (); | |||
| 1 | 197789 | ||||||
| 1 | 35 | ||||||
| 17 | |||||||
| 18 | 1 | 1 | 11 | use constant BLOG_URL => q{http://blog.360.yahoo.com/blog/}; | |||
| 1 | 3 | ||||||
| 1 | 76 | ||||||
| 19 | 1 | 1 | 5 | use constant LOGIN_FORM => q{login_form}; | |||
| 1 | 3 | ||||||
| 1 | 43 | ||||||
| 20 | 1 | 1 | 4 | use constant LOGIN_URL => q{https://login.yahoo.com/config/login_verify2?.intl=us&.done=http%3A%2F%2Fblog.360.yahoo.com%2Fblog%2F%3F.login%3D1&.src=360}; | |||
| 1 | 3 | ||||||
| 1 | 2956 | ||||||
| 21 | |||||||
| 22 | our $DEBUG = 0; | ||||||
| 23 | our $VERSION = '0.09'; | ||||||
| 24 | |||||||
| 25 | sub new { | ||||||
| 26 | 1 | 1 | 1 | 17 | my ($class, $args) = @_; | ||
| 27 | 1 | 50 | 9 | $class = ref $class || $class || __PACKAGE__; | |||
| 28 | 1 | 2 | my $self = $args; | ||||
| 29 | 1 | 4 | bless $self, $class; | ||||
| 30 | } | ||||||
| 31 | |||||||
| 32 | # Fetches high-level blog information | ||||||
| 33 | sub blog_info { | ||||||
| 34 | 8 | 8 | 1 | 6724 | my ($self, $blog_page) = @_; | ||
| 35 | |||||||
| 36 | 8 | 50 | 25 | if (! $blog_page) { | |||
| 37 | 0 | 0 | $self->debug('Fetching blog main page'); | ||||
| 38 | 0 | 0 | $blog_page = $self->blog_main_page(); | ||||
| 39 | 0 | 0 | 0 | if (! $blog_page) { | |||
| 40 | 0 | 0 | $self->debug('Failed to fetch blog main page'); | ||||
| 41 | 0 | 0 | return; | ||||
| 42 | } | ||||||
| 43 | } | ||||||
| 44 | |||||||
| 45 | # Get sharing level | ||||||
| 46 | # Your blog can be seen by Public |
||||||
| 47 | # | ||||||
| 48 | # or: | ||||||
| 49 | # Your blog can be seen by Just me (private) |
||||||
| 50 | # Your blog can be seen by Friends |
||||||
| 51 | # | ||||||
| 52 | 8 | 12 | my $sharing = q{}; | ||||
| 53 | 8 | 50 | 91 | if ($blog_page =~ m{Your blog can be seen by ([\w\(\)\s]+)}m) { | |||
| 54 | |||||||
| 55 | 8 | 24 | $sharing = lc $1; | ||||
| 56 | 8 | 100 | 35 | if ($sharing =~ m{just me}) { | |||
| 50 | |||||||
| 57 | 2 | 5 | $sharing = 'private'; | ||||
| 58 | } | ||||||
| 59 | elsif ($sharing =~ m{friend}) { | ||||||
| 60 | 0 | 0 | $sharing = 'friends'; | ||||
| 61 | } | ||||||
| 62 | |||||||
| 63 | 8 | 21 | $self->debug('Blog sharing found to be "', $sharing, '"'); | ||||
| 64 | } | ||||||
| 65 | else { | ||||||
| 66 | 0 | 0 | $self->debug('Blog sharing string not found'); | ||||
| 67 | } | ||||||
| 68 | |||||||
| 69 | # Get title | ||||||
| 70 | 8 | 15 | my $title = q{}; | ||||
| 71 | 8 | 50 | 281 | if ($blog_page =~ m{([^<]+)Full Post View}m) { |
|||
| 72 | 8 | 16 | $title = $1; | ||||
| 73 | 8 | 17 | $self->debug('Blog title found to be "', $title, '"'); | ||||
| 74 | } | ||||||
| 75 | |||||||
| 76 | # Get number of posts | ||||||
| 77 | # | ||||||
| 78 | # 1 - 5 of 13 ... | ||||||
| 79 | 8 | 12 | my $start = | ||||
| 80 | my $end = | ||||||
| 81 | my $count = 0; | ||||||
| 82 | |||||||
| 83 | 8 | 50 | 172 | if ($blog_page =~ m{(\d+) \- (\d+) of (\d+)}m) { | |||
| 84 | 8 | 11 | $start = $1; | ||||
| 85 | 8 | 19 | $end = $2; | ||||
| 86 | 8 | 16 | $count = $3; | ||||
| 87 | 8 | 16 | $self->debug('Blog post counts found. Start:', $start, ' End:', $end, ' Count:', $count); | ||||
| 88 | } | ||||||
| 89 | else { | ||||||
| 90 | 0 | 0 | $self->debug('Blog post counts not found'); | ||||
| 91 | } | ||||||
| 92 | |||||||
| 93 | 8 | 12 | my $link = q{}; | ||||
| 94 | 8 | 50 | 197 | if ($blog_page =~ m{My Blog}) { | |||
| 95 | 8 | 16 | $link = $1; | ||||
| 96 | 8 | 18 | $self->debug('Blog URL found: ', $link); | ||||
| 97 | } | ||||||
| 98 | else { | ||||||
| 99 | 0 | 0 | $self->debug('Blog URL not found'); | ||||
| 100 | } | ||||||
| 101 | |||||||
| 102 | 8 | 20 | $title =~ s{^\s+}{}; | ||||
| 103 | 8 | 31 | $title =~ s{\s+$}{}; | ||||
| 104 | |||||||
| 105 | return { | ||||||
| 106 | 8 | 34 | sharing => $sharing, | ||||
| 107 | title => $title, | ||||||
| 108 | start => $start, | ||||||
| 109 | end => $end, | ||||||
| 110 | count => $count, | ||||||
| 111 | link => $link, | ||||||
| 112 | lastBuildDate => HTTP::Date::time2str(), | ||||||
| 113 | language => 'en-us', | ||||||
| 114 | }; | ||||||
| 115 | |||||||
| 116 | } | ||||||
| 117 | |||||||
| 118 | # Fetches the user's main blog page | ||||||
| 119 | sub blog_main_page { | ||||||
| 120 | 0 | 0 | 1 | 0 | my ($self) = @_; | ||
| 121 | |||||||
| 122 | 0 | 0 | my $mech = $self->mech(); | ||||
| 123 | 0 | 0 | $mech->get(BLOG_URL); | ||||
| 124 | |||||||
| 125 | 0 | 0 | 0 | if ($mech->success()) { | |||
| 126 | 0 | 0 | $self->debug('Blog main page downloaded successfully'); | ||||
| 127 | 0 | 0 | return $mech->content(); | ||||
| 128 | } | ||||||
| 129 | |||||||
| 130 | 0 | 0 | $self->debug('Blog main page download failed'); | ||||
| 131 | 0 | 0 | Carp::croak("Failed to retrieve blog main page"); | ||||
| 132 | } | ||||||
| 133 | |||||||
| 134 | # Builds the url to fetch a specific blog page | ||||||
| 135 | sub blog_page_url { | ||||||
| 136 | 0 | 0 | 1 | 0 | my ($self, $link, $start, $per_page, $count) = @_; | ||
| 137 | 0 | 0 | my $url = $link; | ||||
| 138 | 0 | 0 | my $last = $start + $per_page - 1; | ||||
| 139 | 0 | 0 | 0 | if ($last > $count) { $last = $count } | |||
| 0 | 0 | ||||||
| 140 | 0 | 0 | $url .= '&l=' . $start; | ||||
| 141 | 0 | 0 | $url .= '&u=' . $last; | ||||
| 142 | 0 | 0 | $url .= '&mx=' . $count; | ||||
| 143 | 0 | 0 | $url .= '&lmt=' . $per_page; | ||||
| 144 | 0 | 0 | return $url; | ||||
| 145 | } | ||||||
| 146 | |||||||
| 147 | sub debug { | ||||||
| 148 | 120 | 50 | 120 | 0 | 272 | return unless $DEBUG; | |
| 149 | |||||||
| 150 | 0 | 0 | my ($self, @msg) = @_; | ||||
| 151 | 0 | 0 | print STDERR @msg, "\n"; | ||||
| 152 | |||||||
| 153 | 0 | 0 | return; | ||||
| 154 | } | ||||||
| 155 | |||||||
| 156 | # Logs in to Yahoo | ||||||
| 157 | sub login { | ||||||
| 158 | 0 | 0 | 1 | 0 | my ($self) = @_; | ||
| 159 | |||||||
| 160 | 0 | 0 | my $user = $self->{username}; | ||||
| 161 | 0 | 0 | my $pass = $self->{password}; | ||||
| 162 | |||||||
| 163 | 0 | 0 | my $mech = $self->mech(); | ||||
| 164 | |||||||
| 165 | 0 | 0 | $mech->get(LOGIN_URL); | ||||
| 166 | |||||||
| 167 | 0 | 0 | $mech->submit_form( | ||||
| 168 | form_name => LOGIN_FORM, | ||||||
| 169 | fields => { | ||||||
| 170 | login => $user, | ||||||
| 171 | passwd => $pass, | ||||||
| 172 | '.persistent' => 'y', | ||||||
| 173 | }, | ||||||
| 174 | button => '.save', | ||||||
| 175 | ); | ||||||
| 176 | |||||||
| 177 | # Not sure how to make this more robust | ||||||
| 178 | 0 | 0 | my $next_page = $mech->content(); | ||||
| 179 | 0 | 0 | 0 | if ($next_page =~ m{Invalid ID or password}) { | |||
| 180 | 0 | 0 | $self->debug('Login to Yahoo service failed for user "', $user, '"'); | ||||
| 181 | 0 | 0 | return; | ||||
| 182 | } | ||||||
| 183 | |||||||
| 184 | 0 | 0 | my $ok = $mech->success(); | ||||
| 185 | |||||||
| 186 | 0 | 0 | 0 | if ($ok) { | |||
| 187 | 0 | 0 | $self->debug('Login to Yahoo service succeeded'); | ||||
| 188 | } | ||||||
| 189 | else { | ||||||
| 190 | 0 | 0 | $self->debug('Login to Yahoo service failed. Unknown reason?'); | ||||
| 191 | } | ||||||
| 192 | |||||||
| 193 | 0 | 0 | return $ok; | ||||
| 194 | } | ||||||
| 195 | |||||||
| 196 | # Dumps last accessed page content to STDOUT | ||||||
| 197 | sub dump { | ||||||
| 198 | 0 | 0 | 1 | 0 | my ($self) = @_; | ||
| 199 | 0 | 0 | print $self->mech->content(); | ||||
| 200 | } | ||||||
| 201 | |||||||
| 202 | # Retrieves all comments in the user's blog | ||||||
| 203 | sub get_blog_comments { | ||||||
| 204 | 0 | 0 | 1 | 0 | my ($self, $posts) = @_; | ||
| 205 | |||||||
| 206 | 0 | 0 | 0 | if (! $posts) { | |||
| 207 | 0 | 0 | return; | ||||
| 208 | } | ||||||
| 209 | |||||||
| 210 | 0 | 0 | my @comments; | ||||
| 211 | |||||||
| 212 | 0 | 0 | for my $post (@{$posts}) { | ||||
| 0 | 0 | ||||||
| 213 | |||||||
| 214 | # No comments, don't fetch them | ||||||
| 215 | 0 | 0 | 0 | if ($post->{comments} == 0) { | |||
| 216 | 0 | 0 | $self->debug('No comments for post ', $post->{title}); | ||||
| 217 | 0 | 0 | next; | ||||
| 218 | } | ||||||
| 219 | |||||||
| 220 | #print qq{Found $post->{comments} comments for blog post "$post->{title}"\n}; | ||||||
| 221 | |||||||
| 222 | 0 | 0 | 0 | if (my $post_comm = $self->get_blogpost_comments($post)) { | |||
| 223 | 0 | 0 | $self->debug('Got ', scalar(@{ $post_comm }), ' comments for post ', $post->{title}); | ||||
| 0 | 0 | ||||||
| 224 | 0 | 0 | push @comments, @{ $post_comm }; | ||||
| 0 | 0 | ||||||
| 225 | } | ||||||
| 226 | |||||||
| 227 | } | ||||||
| 228 | |||||||
| 229 | 0 | 0 | return \@comments; | ||||
| 230 | } | ||||||
| 231 | |||||||
| 232 | # Retrieves all comments to a single blog post | ||||||
| 233 | sub get_blogpost_comments { | ||||||
| 234 | 2 | 2 | 1 | 7480 | my ($self, $post, $page) = @_; | ||
| 235 | |||||||
| 236 | # If we didn't get a pre-saved html page, get it now | ||||||
| 237 | 2 | 50 | 9 | if (! $page) { | |||
| 238 | 0 | 0 | $self->mech->get($post->{link}); | ||||
| 239 | 0 | 0 | 0 | $page = $self->mech->success | |||
| 240 | ? $self->mech->content() | ||||||
| 241 | : q{}; | ||||||
| 242 | } | ||||||
| 243 | |||||||
| 244 | 2 | 50 | 7 | if (! $page) { | |||
| 245 | 0 | 0 | warn "ERROR fetching blogpost comments for $post->{title}\n"; | ||||
| 246 | 0 | 0 | return; | ||||
| 247 | } | ||||||
| 248 | |||||||
| 249 | 2 | 3 | my @comments; | ||||
| 250 | |||||||
| 251 | 2 | 37 | while ($page =~ m{ |
||||
| 252 | |||||||
| 253 | 6 | 30 | my $comment = { | ||||
| 254 | 'user-profile' => $1, | ||||||
| 255 | username => $2, | ||||||
| 256 | link => $post->{link}, | ||||||
| 257 | }; | ||||||
| 258 | |||||||
| 259 | # Comments can span multiple lines | ||||||
| 260 | # but are always enclosed between and |
||||||
| 261 | 6 | 50 | 32 | if ($page =~ m{ (.*?) }sg) { |
|||
| 262 | 6 | 14 | $comment->{comment} = $1; | ||||
| 263 | 6 | 12 | $comment->{comment} =~ s{^\s+}{}; | ||||
| 264 | 6 | 38 | $comment->{comment} =~ s{\s+$}{}; | ||||
| 265 | } | ||||||
| 266 | |||||||
| 267 | 6 | 50 | 27 | if ($page =~ m{ ([^<]+)\s*<}mg) { |
|||
| 268 | 6 | 15 | $comment->{date} = $1; | ||||
| 269 | 6 | 11 | $comment->{date} =~ s{^\s+}{}; | ||||
| 270 | 6 | 28 | $comment->{date} =~ s{\s+$}{}; | ||||
| 271 | 6 | 15 | $comment->{date} = $self->parse_date($comment->{date}); | ||||
| 272 | } | ||||||
| 273 | |||||||
| 274 | $self->debug( | ||||||
| 275 | 6 | 21 | 'Found comment "', $comment->{comment}, | ||||
| 276 | '" by "', $comment->{username}, '"' | ||||||
| 277 | ); | ||||||
| 278 | |||||||
| 279 | 6 | 42 | push @comments, $comment; | ||||
| 280 | } | ||||||
| 281 | |||||||
| 282 | 2 | 7 | $self->debug('Found ', scalar(@comments), ' comments to blog post ', $post->{link}); | ||||
| 283 | |||||||
| 284 | 2 | 8 | return \@comments; | ||||
| 285 | } | ||||||
| 286 | |||||||
| 287 | # Gets all blog posts by a user | ||||||
| 288 | sub get_blog_posts { | ||||||
| 289 | 4 | 4 | 1 | 6128 | my ($self, $blog_page, %overrides) = @_; | ||
| 290 | |||||||
| 291 | 4 | 13 | $self->debug("Start parsing of blog posts"); | ||||
| 292 | |||||||
| 293 | 4 | 50 | 12 | if (! $blog_page) { | |||
| 294 | 0 | 0 | $self->debug("Downloading of main blog page"); | ||||
| 295 | 0 | 0 | 0 | $blog_page ||= $self->blog_main_page(); | |||
| 296 | 0 | 0 | $self->debug("Download complete"); | ||||
| 297 | } | ||||||
| 298 | else { | ||||||
| 299 | 4 | 12 | $self->debug("Blog main page was already supplied. No need to download."); | ||||
| 300 | } | ||||||
| 301 | |||||||
| 302 | 4 | 11 | my $blog_info = $self->blog_info($blog_page); | ||||
| 303 | |||||||
| 304 | 4 | 103 | for (keys %overrides) { | ||||
| 305 | 12 | 28 | $blog_info->{$_} = $overrides{$_}; | ||||
| 306 | } | ||||||
| 307 | |||||||
| 308 | 4 | 9 | my $link = $blog_info->{link}; | ||||
| 309 | 4 | 8 | my $start = $blog_info->{start}; | ||||
| 310 | 4 | 8 | my $count = $blog_info->{count}; | ||||
| 311 | 4 | 6 | my $end_page = $blog_info->{end}; | ||||
| 312 | 4 | 9 | my $end_blog = $start + $count - 1; | ||||
| 313 | 4 | 7 | my $per_page = $end_page - $start + 1; | ||||
| 314 | |||||||
| 315 | 4 | 7 | my @posts = (); | ||||
| 316 | |||||||
| 317 | 4 | 16 | $self->debug("Parsing posts ($start .. $end_blog)"); | ||||
| 318 | |||||||
| 319 | # Prevent endless loops | ||||||
| 320 | 4 | 50 | 11 | if ($start > $end_page) { | |||
| 321 | 0 | 0 | $start = $end_page; | ||||
| 322 | } | ||||||
| 323 | |||||||
| 324 | 4 | 17 | for (my $n = $start; $n <= $end_blog; ) { | ||||
| 325 | |||||||
| 326 | 4 | 17 | $self->debug( | ||||
| 327 | 'Reading post n. ', $n, | ||||||
| 328 | ' end_of_page:', $end_page, | ||||||
| 329 | ' end_of_blog:', $end_blog, | ||||||
| 330 | ); | ||||||
| 331 | |||||||
| 332 | # Fetch next page and continue | ||||||
| 333 | 4 | 50 | 66 | 20 | if ($n >= $end_page && $end_page < $end_blog) { | ||
| 334 | |||||||
| 335 | 0 | 0 | my $next_page_url = $self->blog_page_url( | ||||
| 336 | $link, $end_page + 1, $per_page, $count | ||||||
| 337 | ); | ||||||
| 338 | |||||||
| 339 | 0 | 0 | $end_page += $per_page; | ||||
| 340 | |||||||
| 341 | 0 | 0 | $self->mech->get($next_page_url); | ||||
| 342 | 0 | 0 | $self->debug('Next url is:', $next_page_url); | ||||
| 343 | |||||||
| 344 | 0 | 0 | $blog_page = $self->mech->content(); | ||||
| 345 | 0 | 0 | 0 | if (! $blog_page) { | |||
| 346 | 0 | 0 | $self->debug('Failed to read url: ', $next_page_url); | ||||
| 347 | 0 | 0 | last; | ||||
| 348 | } | ||||||
| 349 | |||||||
| 350 | } | ||||||
| 351 | |||||||
| 352 | 4 | 7 | my $found_posts = 0; | ||||
| 353 | |||||||
| 354 | 4 | 57 | while ($blog_page =~ m{ |
||||
| 355 | |||||||
| 356 | # Blog post title | ||||||
| 357 | 10 | 18 | my $title = $1; | ||||
| 358 | 10 | 40 | my $post = { | ||||
| 359 | title => $1, | ||||||
| 360 | description => '' | ||||||
| 361 | }; | ||||||
| 362 | |||||||
| 363 | 10 | 22 | $self->debug('Found new blog post "', $title, '" (', $n, ')'); | ||||
| 364 | |||||||
| 365 | 10 | 12 | $found_posts = 1; | ||||
| 366 | |||||||
| 367 | # Main picture of the blog post | ||||||
| 368 | 10 | 50 | 56 | if ($blog_page =~ m{ (.*?) }gsmc) { |
|||
| 369 | 10 | 24 | my $pic = $1; | ||||
| 370 | 10 | 33 | $pic =~ s{^\s*}{}mx; | ||||
| 371 | 10 | 357 | $pic =~ s{\s*$}{}mx; | ||||
| 372 | 10 | 100 | 24 | if ($pic) { | |||
| 373 | 4 | 14 | $post->{description} = ' ' . $pic . ' '; |
||||
| 374 | 4 | 14 | $self->debug(' Image: ', substr($pic, 0, 30), '...'); | ||||
| 375 | } | ||||||
| 376 | } | ||||||
| 377 | |||||||
| 378 | # Blog post content | ||||||
| 379 | # Read until the end of line (there might be multiple s) |
||||||
| 380 | 10 | 50 | 61 | if ($blog_page =~ m{ (.*) }gmc) { |
|||
| 381 | 10 | 34 | $post->{description} .= $1; | ||||
| 382 | 10 | 29 | $self->debug(' Content: ', substr($1, 0, 30), '...'); | ||||
| 383 | } | ||||||
| 384 | |||||||
| 385 | # Tags | ||||||
| 386 | 10 | 50 | 50 | if ($blog_page =~ m{ | |||
| 387 | 10 | 24 | $post->{tags} = $1; | ||||
| 388 | 10 | 18 | $self->debug(' Tags: ', $1); | ||||
| 389 | } | ||||||
| 390 | |||||||
| 391 | # Date of post | ||||||
| 392 | 10 | 50 | 170 | if ($blog_page =~ m{([^<]+)Edit}gm) { | |||
| 393 | 10 | 24 | $post->{pubDate} = HTTP::Date::time2str($self->parse_date($1)); | ||||
| 394 | 10 | 135 | $self->debug(' Date: ', $1); | ||||
| 395 | } | ||||||
| 396 | |||||||
| 397 | # Permanent link | ||||||
| 398 | 10 | 50 | 90 | if ($blog_page =~ m{Permanent Link}gm) { | |||
| 399 | 10 | 27 | $post->{link} = $1; | ||||
| 400 | 10 | 21 | $self->debug(' Permalink: ', $1); | ||||
| 401 | } | ||||||
| 402 | |||||||
| 403 | # No. of comments | ||||||
| 404 | 10 | 50 | 86 | if ($blog_page =~ m{(\d+) Comments?}gm) { | |||
| 405 | 10 | 55 | $post->{comments} = $1; | ||||
| 406 | 10 | 22 | $self->debug(' Comments: ', $1); | ||||
| 407 | } | ||||||
| 408 | |||||||
| 409 | 10 | 18 | push @posts, $post; | ||||
| 410 | |||||||
| 411 | 10 | 59 | $n++; | ||||
| 412 | |||||||
| 413 | } | ||||||
| 414 | |||||||
| 415 | 4 | 100 | 17 | if (not $found_posts) { | |||
| 416 | 1 | 3 | last; | ||||
| 417 | } | ||||||
| 418 | |||||||
| 419 | } | ||||||
| 420 | |||||||
| 421 | 4 | 29 | return \@posts; | ||||
| 422 | |||||||
| 423 | } | ||||||
| 424 | |||||||
| 425 | # Mechanize object accessor | ||||||
| 426 | sub mech { | ||||||
| 427 | 0 | 0 | 1 | 0 | my ($self) = @_; | ||
| 428 | 0 | 0 | 0 | if (! exists $self->{_mech}) { | |||
| 429 | 0 | 0 | $self->{_mech} = WWW::Mechanize->new(); | ||||
| 430 | } | ||||||
| 431 | 0 | 0 | return $self->{_mech}; | ||||
| 432 | } | ||||||
| 433 | |||||||
| 434 | # Tries to parse a date in the Yahoo 360 format | ||||||
| 435 | sub parse_date { | ||||||
| 436 | 20 | 20 | 1 | 3108 | my ($self, $date) = @_; | ||
| 437 | |||||||
| 438 | 20 | 36 | $date =~ s{^\s+}{}; | ||||
| 439 | 20 | 75 | $date =~ s{\s+$}{}; | ||||
| 440 | |||||||
| 441 | 20 | 50 | 112 | if ($date =~ m{^ (\w{3})\w+ \s (\w{3})\w* \s (\d+), \s (\d+) \s - \s (\d+):(\d+)([ap]m) \s \((.*)\) \s* $}x) { | |||
| 442 | 20 | 30 | my $dow = $1; | ||||
| 443 | 20 | 35 | my $month = $2; | ||||
| 444 | 20 | 25 | my $day = $3; | ||||
| 445 | 20 | 36 | my $year = $4; | ||||
| 446 | 20 | 31 | my $hours = $5; | ||||
| 447 | 20 | 24 | my $mins = $6; | ||||
| 448 | 20 | 38 | my $ampm = uc $7; | ||||
| 449 | 20 | 29 | my $tz = uc $8; | ||||
| 450 | |||||||
| 451 | # Indochina time zone is not recognized by Date::Parse | ||||||
| 452 | 20 | 100 | 48 | if ($tz eq 'ICT') { | |||
| 453 | 12 | 18 | $tz = 'UTC+07'; | ||||
| 454 | } | ||||||
| 455 | |||||||
| 456 | 20 | 100 | 100 | 135 | if ($ampm eq 'AM' && $hours == 12) { | ||
| 100 | 100 | ||||||
| 457 | 1 | 6 | $hours = 0; | ||||
| 458 | } | ||||||
| 459 | elsif ($ampm eq 'PM' && $hours != 12) { | ||||||
| 460 | 8 | 11 | $hours += 12; | ||||
| 461 | 8 | 50 | 14 | if ($hours > 23) { | |||
| 462 | 0 | 0 | $hours -= 24; | ||||
| 463 | } | ||||||
| 464 | } | ||||||
| 465 | |||||||
| 466 | 20 | 39 | my $time = "$hours:$mins:00"; | ||||
| 467 | |||||||
| 468 | # Wed, 16 Jun 94 07:29:35 CST | ||||||
| 469 | 20 | 57 | $date = "$day $month $year $time $tz"; | ||||
| 470 | |||||||
| 471 | #arn "# Converted to [$date]\n"; | ||||||
| 472 | |||||||
| 473 | } | ||||||
| 474 | |||||||
| 475 | 20 | 59 | my $epoch = Date::Parse::str2time($date); | ||||
| 476 | #arn "# str2time($date) returns ($epoch)\n"; | ||||||
| 477 | |||||||
| 478 | 20 | 5214 | return $epoch; | ||||
| 479 | } | ||||||
| 480 | |||||||
| 481 | 1; | ||||||
| 482 | |||||||
| 483 | __END__ |