File Coverage

blib/lib/WWW/FetchStory/Fetcher/LiveJournal.pm

Criterion	Covered	Total	%
statement	12	183	6.5
branch	0	130	0.0
condition	0	9	0.0
subroutine	4	13	30.7
pod	9	9	100.0
total	25	344	7.2

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::FetchStory::Fetcher::LiveJournal;
2							$WWW::FetchStory::Fetcher::LiveJournal::VERSION = '0.2602';
3	4			4		164548	use strict;
	4					10
	4					149
4	4			4		21	use warnings;
	4					9
	4					240
5	4			4		2427	use HTML::Entities;
	4					29241
	4					578
6							=head1 NAME
7
8							WWW::FetchStory::Fetcher::LiveJournal - fetching module for WWW::FetchStory
9
10							=head1 VERSION
11
12							version 0.2602
13
14							=head1 DESCRIPTION
15
16							This is the LiveJournal story-fetching plugin for WWW::FetchStory.
17
18							=cut
19
20	4			4		37	use parent qw(WWW::FetchStory::Fetcher);
	4					7
	4					38
21
22							=head1 METHODS
23
24							=head2 info
25
26							Information about the fetcher.
27
28							$info = $self->info();
29
30							=cut
31
32							sub info {
33	0			0	1		my $self = shift;
34
35	0						my $info = "(http://www.livejournal.com/) Journalling site where some people post their fiction.";
36
37	0						return $info;
38							} # info
39
40							=head2 priority
41
42							The priority of this fetcher. Fetchers with higher priority
43							get tried first. This is useful where there may be a generic
44							fetcher for a particular site, and then a more specialized fetcher
45							for particular sections of a site. For example, there may be a
46							generic LiveJournal fetcher, and then refinements for particular
47							LiveJournal community, such as the sshg_exchange community.
48							This works as either a class function or a method.
49
50							This must be overridden by the specific fetcher class.
51
52							$priority = $self->priority();
53
54							$priority = WWW::FetchStory::Fetcher::priority($class);
55
56							=cut
57
58							sub priority {
59	0			0	1		my $class = shift;
60
61	0						return 1;
62							} # priority
63
64							=head2 allow
65
66							If this fetcher can be used for the given URL, then this returns
67							true.
68							This must be overridden by the specific fetcher class.
69
70							if ($obj->allow($url))
71							{
72							....
73							}
74
75							=cut
76
77							sub allow {
78	0			0	1		my $self = shift;
79	0						my $url = shift;
80
81	0						return ($url =~ /\.livejournal\.com/);
82							} # allow
83
84							=head1 Private Methods
85
86							=head2 extract_story
87
88							Extract the story-content from the fetched content.
89
90							my ($story, $title) = $self->extract_story(content=>$content,
91							title=>$title);
92
93							=cut
94
95							sub extract_story {
96	0			0	1		my $self = shift;
97	0						my %args = (
98							content=>'',
99							title=>'',
100							@_
101							);
102	0						my $content = $args{content};
103
104	0						my $ljuser= '';
105	0						my $title = $args{title};
106	0						my $url = '';
107	0	0					if ($content =~ m#<title>([\w]+):\s*([^<]+)</title>#s)
		0
		0
		0
		0
108							{
109	0						$ljuser= $1;
110	0						$title = $2;
111							}
112							elsif ($content =~ m#<h1 class="b-singlepost-title">\s([^<]+)\s</h1>#s)
113							{
114	0						$title = $1;
115							}
116							elsif ($content =~ m#<title>([^<]+)</title>#s)
117							{
118	0						$title = $1;
119							}
120							elsif ($content =~ m#<h2 class="asset-name page-header2"><a href="([^"]+)">([^>]+)</a></h2>#)
121							{
122	0						$url = $1;
123	0						$title = $2;
124							}
125							elsif ($content =~ m#<div class="subject">([^<]+)</div>#)
126							{
127	0						$title = $1;
128							}
129	0	0					if ($content =~ m#([-\w]+)</b></a></span>\) wrote in <span class='ljuser'#s)
130							{
131	0						$ljuser = $1;
132							}
133
134	0	0					if (!$url)
135							{
136	0	0					if ($content =~ m#(http://[-\w]+.livejournal.com/\d+\.html)\?format=light\&mode=reply#s)
		0
		0
		0
137							{
138	0						$url = $1;
139							}
140							elsif ($content =~ m#<a[^>]href=["']([^?\s]+)\?mode=reply["']\s>Post a new comment#s)
141							{
142	0						$url = $1;
143							}
144							elsif ($content =~ m#<a[^>]*href=["']([^?\s]+)\?mode=reply#s)
145							{
146	0						$url = $1;
147							}
148							elsif ($content =~ m#<a[^>]href="([^?\s]+)"\s>Link</a>#s)
149							{
150	0						$url = $1;
151							}
152							}
153	0	0	0				if (!$ljuser && $url && ($url =~ m#http://([-\w]+)\.livejournal\.com#s))
			0
154							{
155	0						$ljuser = $1;
156							}
157
158	0						my $story = '';
159	0	0					if ($content =~ m#<article class="\sb-singlepost-body[^>]>(.*?)</article>#s)
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
160							{
161	0						$story = $1;
162							}
163							elsif ($content =~ m#<article class="\sb-singlepost-body[^>]>(.?)<div class="\sb-singlepost-tags ljtags\s*"#s)
164							{
165	0						$story = $1;
166							}
167							elsif ($content =~ m#<div class="b-singlepost-body">(.*?)<div id="comments"#s)
168							{
169	0						$story = $1;
170							}
171							elsif ($content =~ m#</table><p>(.)<br[^>]/><hr[^>]*/><div id='Comments'>#s)
172							{
173	0						$story = $1;
174							}
175							elsif ($content =~ m#<div class="b-singlepost-body">(.*?)<div class="b-singlepost-tags ljtags">#s)
176							{
177	0						$story = $1;
178							}
179							elsif ($content =~ m#<div class="b-singlepost-body">(.*?)</div>#s)
180							{
181	0						$story = $1;
182							}
183							elsif ($content =~ m#<div class="entrytext">(.*?)<div class="meta">#s)
184							{
185	0						$story = $1;
186							}
187							elsif ($content =~ m#<div style='[^']+'>(.)</div>\s<br[^>]/><hr[^>]/><div id='Comments'>#s)
188							{
189	0						$story = $1;
190							}
191							elsif ($content =~ m#</table></div>(.)<br[^>]/><hr[^>]*/><div id='Comments'>#s)
192							{
193	0						$story = $1;
194							}
195							elsif ($content =~ m#<div class="asset-body">\s<div class="user-icon"[^>]+>\s<img[^>]+>\s<br\s/>\s</div>(.)</div>\s*<div class="lj-currents">#s)
196							{
197	0						$story = $1;
198							}
199							elsif ($content =~ m#<td valign='top'>(<strong>.*)<strong>Current Mood:</strong>#s)
200							{
201	0						$story = $1;
202							}
203							elsif ($content =~ m#<div\sclass="entryHeader">([^<])</div>.?<td\sclass="entry">\s(<div>.?</div>)#s)
204							{
205	0						$title = $1;
206	0						$story = $2;
207							}
208							elsif ($content =~ m#<div\sclass="asset-content">\s(.?)\s<div class="lj-currents">#s)
209							{
210	0						$story = $1;
211							}
212							elsif ($content =~ m#<div\sclass="asset-content">\s(.?)\s<div class="quickreply" id="ljqrtentrycomment"#s)
213							{
214	0						$story = $1;
215							}
216							elsif ($content =~ m#<div\sclass="entryText">\s(.?)\s</div>\s*<div class="entryMetadata">#s)
217							{
218	0						$story = $1;
219							}
220							elsif ($content =~ m#<div\sclass="entryText">\s(.?)\s</div>\s*<div class="entryFooter">#s)
221							{
222	0						$story = $1;
223							}
224							elsif ($content =~ m#<b>Entry tags:</b>.?</table>(.?)<div class="ljad ljadleaderboard"#s)
225							{
226	0						$story = $1;
227							}
228							elsif ($content =~ m#<b>Entry tags:</b>.?</table>(.?)<iframe src='http://ads.sixapart.com/#s)
229							{
230	0						$story = $1;
231							}
232							elsif ($content =~ m#alt="Next Entry".?</table>\s</div>(.*?)<iframe src='http://ads.sixapart.com/#s)
233							{
234	0						$story = $1;
235							}
236							elsif ($content =~ m#alt="Next Entry".?</table>\s</div>(.*?)<div id='Comments'>#s)
237							{
238	0						$story = $1;
239							}
240							elsif ($content =~ m#<div class="entry_text">(.*?)<div class="clear">#s)
241							{
242	0						$story = $1;
243							}
244							elsif ($content =~ m#(<b>Tags</b>.?)\s<div class="quickreply" id="ljqrttopcomment"#s)
245							{
246	0						$story = $1;
247							}
248	0	0					warn "ljuser=$ljuser, title=$title\n" if ($self->{verbose} > 1);
249	0	0					warn "url=$url\n" if ($self->{verbose} > 1);
250	0	0					if ($story)
251							{
252							# remove LJ-specific cruft
253	0						$story =~ s#<a name="cutid."></a>##sg;
254	0						$story =~ s#<a name='cutid.-end'></a>##sg;
255	0						$story =~ s#<center><div class="lj-like">.*</center>##sg;
256	0						$story =~ s#<a href="http://[A-Za-z_-]+\.livejournal\.com/profile"[^>]>\s<img\sclass="i-ljuser-userhead"[^>]/>\s*</a>##sg;
257
258	0						$story = $self->tidy_chars($story);
259							}
260							else
261							{
262	0	0					if ($self->{debug})
263							{
264	0						open(my $fh, ">", "FS$$.html");
265	0						print $fh $content;
266	0						close $fh;
267	0						die "STORY not found. Content is in FS$$.html"
268							}
269							else
270							{
271	0						print STDERR "story not found\n";
272	0						return $self->tidy_chars($content);
273							}
274							}
275
276	0						my $out = <<EOT;
277							<h1>$title</h1>
278							EOT
279	0	0					if ($url)
280							{
281	0						$out .= <<EOT;
282							<p>(from <a href='$url'>$url</a>)</p>
283							EOT
284							}
285	0						$out .= <<EOT;
286							<p>$story
287							EOT
288	0						return ($out, $title);
289							} # extract_story
290
291							=head2 get_toc
292
293							Get a table-of-contents page.
294
295							=cut
296							sub get_toc {
297	0			0	1		my $self = shift;
298	0						my %args = @_;
299	0						my $url = $args{first_url};
300
301	0						return $self->get_page("${url}?format=light");
302							} # get_toc
303
304							=head2 parse_author
305
306							Get the author from the content
307
308							=cut
309							sub parse_author {
310	0			0	1		my $self = shift;
311	0						my %args = (
312							url=>'',
313							content=>'',
314							@_
315							);
316
317	0						my $content = $args{content};
318	0						my $author = '';
319	0	0					if ($content =~ m#<b>Author: </b> <span\s+class="ljuser\s+i-ljuser\s+"\s+lj:user="([-_\w]+)"#)
		0
		0
		0
320							{
321	0						$author = $1;
322							}
323							elsif ($content =~ m#<b>Creator:\s</b>\s<span\s+class="ljuser\s+i-ljuser\s+"\s+lj:user="([-_\w]+)"#)
324							{
325	0						$author = $1;
326							}
327							elsif ($content =~ m#<span class='ljuser ljuser-name_\w+' lj:user='\w+' style='white-space: nowrap;'><a href='http://\w+\.livejournal\.com/profile'><img src='http://l-stat\.livejournal\.com/img/userinfo\.gif' alt='\[info\]' width='17' height='17' style='vertical-align: bottom; border: 0; padding-right: 1px;' /></a><a href='http://\w+\.livejournal\.com/'><b>(.*?)</b></a></span>#)
328							{
329	0						$author = $1;
330							}
331							elsif ($content =~ m#<a href='http://[-\w]+\.livejournal\.com/'><b>(.*?)</b></a>#)
332							{
333	0						$author = $1;
334							}
335							else
336							{
337	0						$author = $self->SUPER::parse_author(%args);
338							}
339	0						return $author;
340							} # parse_author
341
342							=head2 parse_title
343
344							Get the title from the content
345
346							=cut
347							sub parse_title {
348	0			0	1		my $self = shift;
349	0						my %args = (
350							content=>'',
351							@_
352							);
353
354	0						my $content = $args{content};
355	0						my $title = '';
356	0	0					if ($content =~ /<(?:b\|strong)>Title:?\s<\/(?:b\|strong)>:?\s"?(.?)"?\s<(?:br\|p\|\/p\|div\|\/div)/si)
		0
		0
		0
		0
		0
		0
		0
		0
357							{
358	0						$title = $1;
359							}
360							elsif ($content =~ /\bTitle:\s"?(.?)"?\s*<br/s)
361							{
362	0						$title = $1;
363							}
364							elsif ($content =~ m#<title>[\w]+:\s*([^<]+)</title>#s)
365							{
366	0						$title = $1;
367							}
368							elsif ($content =~ m#<h1 class="b-singlepost-title">\s([^<]+)\s</h1>#s)
369							{
370	0						$title = $1;
371							}
372							elsif ($content =~ m#<title>([^<]+)</title>#s)
373							{
374	0						$title = $1;
375							}
376							elsif ($content =~ m#<h2 class="asset-name page-header2"><a href="([^"]+)">([^>]+)</a></h2>#)
377							{
378	0						$title = $2;
379							}
380							elsif ($content =~ m#<div class="subject">([^<]+)</div>#)
381							{
382	0						$title = $1;
383							}
384							elsif ($content =~ m#<h1>([^<]+)</h1>#is)
385							{
386	0						$title = $1;
387							}
388							elsif ($content =~ m#<h2>([^<]+)</h2>#is)
389							{
390	0						$title = $1;
391							}
392	0						$title = decode_entities($title); # get rid of HTML entities in the title
393	0						$title =~ s/<u>//ig;
394	0						$title =~ s/<\/u>//ig;
395	0						return $title;
396							} # parse_title
397
398							=head2 parse_toc
399
400							Parse the table-of-contents file.
401
402							%info = $self->parse_toc(content=>$content,
403							url=>$url,
404							urls=>\@urls);
405
406							This should return a hash containing:
407
408							=over
409
410							=item chapters
411
412							An array of URLs for the chapters of the story. In the case where the
413							story only takes one page, that will be the chapter.
414							In the case where multiple URLs have been passed in, it will be those URLs.
415
416							=item title
417
418							The title of the story.
419
420							=back
421
422							It may also return additional information, such as Summary.
423
424							=cut
425
426							sub parse_toc {
427	0			0	1		my $self = shift;
428	0						my %args = (
429							url=>'',
430							content=>'',
431							@_
432							);
433
434	0						my $content = $args{content};
435	0						my $user = '';
436	0						my $is_community = 0;
437	0	0					if ($args{url} =~ m{http://([-\w]+)\.livejournal\.com})
438							{
439	0						$user = $1;
440							}
441	0	0					if ($user eq 'community')
442							{
443	0						$is_community = 1;
444	0						$user = '';
445	0	0					if ($args{url} =~ m{http://community\.livejournal\.com/([-\w]+)})
446							{
447	0						$user = $1;
448							}
449							}
450
451	0						my %info = ();
452	0						$info{url} = $args{url};
453	0						$info{toc_first} = 1;
454
455	0						my $title = $self->parse_title(%args);
456	0						$title =~ s/${user}:\s*//;
457	0						$title =~ s/Fic:\s*//;
458	0						$info{title} = $title;
459
460	0						my $summary = $self->parse_summary(%args);
461	0						$summary =~ s/"/'/g;
462	0						$info{summary} = $summary;
463
464	0						my $author = $self->parse_author(%args);
465	0	0					if (!$author)
466							{
467	0						$author = $user;
468							}
469	0						$info{author} = $author;
470
471	0						my $characters = $self->parse_characters(%args);
472	0						$info{characters} = $characters;
473
474	0						$info{universe} = $self->parse_universe(%args);
475	0						$info{chapters} = $self->parse_chapter_urls(%args, user=>$user,
476							is_community=>$is_community);
477
478	0						return %info;
479							} # parse_toc
480
481							=head2 parse_chapter_urls
482
483							Figure out the URLs for the chapters of this story.
484
485							=cut
486							sub parse_chapter_urls {
487	0			0	1		my $self = shift;
488	0						my %args = (
489							url=>'',
490							content=>'',
491							@_
492							);
493	0						my $content = $args{content};
494	0						my $user = $args{user};
495	0						my @chapters = ();
496
497							# avoid adding duplicate URLs by remembering what we've parsed
498	0						my %remember_ch_urls = ();
499	0						$remember_ch_urls{$args{url}} = 1;
500
501	0	0					if (defined $args{urls})
502							{
503	0						@chapters = @{$args{urls}};
	0
504	0						for (my $i = 0; $i < @chapters; $i++)
505							{
506	0						$chapters[$i] = sprintf('%s?format=light', $chapters[$i]);
507							}
508							}
509	0	0	0				if (@chapters <= 1 and $user)
510							{
511	0	0					warn "parse_chapter_urls: user=$user\n" if ($self->{verbose} > 1);
512	0	0					if ($args{is_community})
513							{
514	0						while ($content =~ m/href="(http:\/\/community\.livejournal\.com\/${user}\/\d+.html)/sg)
515							{
516	0						my $ch_url = $1;
517	0	0					if (!$remember_ch_urls{$ch_url})
518							{
519	0	0					warn "chapter=$ch_url\n" if ($self->{verbose} > 1);
520	0						push @chapters, "${ch_url}?format=light";
521	0						$remember_ch_urls{$ch_url} = 1;
522							}
523							}
524							}
525							else
526							{
527	0						while ($content =~ m/<a\s+href="(http:\/\/${user}\.livejournal\.com\/\d+.html)"\s([^>])>/sg)
528							{
529	0						my $ch_url = $1;
530	0						my $rest = $2;
531	0	0					if ($rest =~ /prevnext-link/)
532							{
533							# it's a prev-next link, ignore it
534							}
535							else
536							{
537	0	0					warn "chapter='$ch_url'\n" if ($self->{verbose} > 1);
538	0	0					if (!$remember_ch_urls{$ch_url})
539							{
540	0						push @chapters, "${ch_url}?format=light";
541	0						$remember_ch_urls{$ch_url} = 1;
542							}
543							}
544							}
545							}
546							}
547
548	0						return \@chapters;
549							} # parse_chapter_urls
550
551							1; # End of WWW::FetchStory::Fetcher::LiveJournal
552							__END__