File Coverage

blib/lib/WWW/FetchStory/Fetcher/Dreamwidth.pm

Criterion	Covered	Total	%
statement	9	124	7.2
branch	0	60	0.0
condition	0	9	0.0
subroutine	3	11	27.2
pod	8	8	100.0
total	20	212	9.4

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::FetchStory::Fetcher::Dreamwidth;
2							$WWW::FetchStory::Fetcher::Dreamwidth::VERSION = '0.2602';
3	1			1		164164	use strict;
	1					2
	1					32
4	1			1		4	use warnings;
	1					1
	1					63
5							=head1 NAME
6
7							WWW::FetchStory::Fetcher::Dreamwidth - fetching module for WWW::FetchStory
8
9							=head1 VERSION
10
11							version 0.2602
12
13							=head1 DESCRIPTION
14
15							This is the Dreamwidth story-fetching plugin for WWW::FetchStory.
16
17							=cut
18
19	1			1		4	use parent qw(WWW::FetchStory::Fetcher);
	1					1
	1					7
20
21							=head1 METHODS
22
23							=head2 info
24
25							Information about the fetcher.
26
27							$info = $self->info();
28
29							=cut
30
31							sub info {
32	0			0	1		my $self = shift;
33
34	0						my $info = "(http://www.dreamwidth.org) Journalling site where some post their fiction.";
35
36	0						return $info;
37							} # info
38
39							=head2 priority
40
41							The priority of this fetcher. Fetchers with higher priority
42							get tried first. This is useful where there may be a generic
43							fetcher for a particular site, and then a more specialized fetcher
44							for particular sections of a site. For example, there may be a
45							generic Dreamwidth fetcher, and then refinements for particular
46							Dreamwidth community, such as the sshg_exchange community.
47							This works as either a class function or a method.
48
49							This must be overridden by the specific fetcher class.
50
51							$priority = $self->priority();
52
53							$priority = WWW::FetchStory::Fetcher::priority($class);
54
55							=cut
56
57							sub priority {
58	0			0	1		my $class = shift;
59
60	0						return 1;
61							} # priority
62
63							=head2 allow
64
65							If this fetcher can be used for the given URL, then this returns
66							true.
67							This must be overridden by the specific fetcher class.
68
69							if ($obj->allow($url))
70							{
71							....
72							}
73
74							=cut
75
76							sub allow {
77	0			0	1		my $self = shift;
78	0						my $url = shift;
79
80	0						return ($url =~ /\.dreamwidth\.org/);
81							} # allow
82
83							=head1 Private Methods
84
85							=head2 extract_story
86
87							Extract the story-content from the fetched content.
88
89							my ($story, $title) = $self->extract_story(content=>$content,
90							title=>$title);
91
92							=cut
93
94							sub extract_story {
95	0			0	1		my $self = shift;
96	0						my %args = (
97							content=>'',
98							title=>'',
99							@_
100							);
101	0						my $content = $args{content};
102
103	0						my $user= '';
104	0						my $title = '';
105	0						my $url = '';
106	0	0					if ($content =~ m#<title>([\w]+):\s*([^<]+)</title>#s)
		0
107							{
108	0						$user= $1;
109	0						$title = $2;
110							}
111							elsif ($content =~ m#<title>([^<]+)</title>#s)
112							{
113	0						$title = $1;
114							}
115	0	0					if ($content =~ m#([-\w]+)</b></a></span>\) wrote in <span class='ljuser'#s)
116							{
117	0						$user = $1;
118							}
119
120	0						my $year = '';
121	0						my $month = '';
122	0						my $day = '';
123	0	0					if ($content =~ m#wrote,<br /><font[^>]+>\@\s*<a href="[^"]+">(\d+)</a>-<a href="[^"]+">(\d+)</a>-<a href="[^"]+">(\d+)</a>#s)
124							{
125	0						$year = $1;
126	0						$month = $2;
127	0						$day = $3;
128	0	0					warn "year=$year,month=$month,day=$day\n" if ($self->{verbose} > 1);
129							}
130
131	0	0					if (!$url)
132							{
133	0	0					if ($content =~ m#<a[^>]href=["']([^?\s]+)\?mode=reply["']\s>Post a new comment#s)
		0
		0
134							{
135	0						$url = $1;
136							}
137							elsif ($content =~ m#<a[^>]*href=["']([^?\s]+)\?mode=reply#s)
138							{
139	0						$url = $1;
140							}
141							elsif ($content =~ m#<a[^>]href="([^?\s]+)"\s>Link</a>#s)
142							{
143	0						$url = $1;
144							}
145							}
146	0	0	0				if (!$user && $url && ($url =~ m#http://([-\w]+)\.dreamwidth\.org#s))
			0
147							{
148	0						$user = $1;
149							}
150
151	0						my $story = '';
152	0	0					if ($content =~ m#<div id='entrysubj'>(.*?)<div id='Comments'>#s)
		0
		0
153							{
154	0						$story = $1;
155							}
156							elsif ($content =~ m#<div id='entrysubj'>(.*?)<div role="navigation">#s)
157							{
158	0						$story = $1;
159							}
160							elsif ($content =~ m#<div class="entry"[^>]>(.?)<div class="tag">#s)
161							{
162	0						$story = $1;
163							}
164	0	0					warn "user=$user, title=$title\n" if ($self->{verbose} > 1);
165	0	0					warn "url=$url\n" if ($self->{verbose} > 1);
166	0	0					if ($story)
167							{
168	0						$story = $self->tidy_chars($story);
169							# remove cutid1
170	0						$story =~ s#<a name="cutid."></a>##sg;
171							}
172							else
173							{
174	0						print STDERR "story not found\n";
175	0						return $self->tidy_chars($content);
176							}
177
178	0						my $out = <<EOT;
179							<h1>$title</h1>
180							<p>by $user</p>
181							<p>$year-$month-$day (from <a href='$url'>here</a>)</p>
182							<p>$story
183							EOT
184	0						return ($out, $title);
185							} # extract_story
186
187							=head2 get_toc
188
189							Get a table-of-contents page.
190
191							=cut
192							sub get_toc {
193	0			0	1		my $self = shift;
194	0						my %args = @_;
195	0						my $url = $args{first_url};
196
197	0						return $self->get_page("${url}?format=light");
198							} # get_toc
199
200							=head2 parse_toc
201
202							Parse the table-of-contents file.
203
204							%info = $self->parse_toc(content=>$content,
205							url=>$url,
206							urls=>\@urls);
207
208							This should return a hash containing:
209
210							=over
211
212							=item chapters
213
214							An array of URLs for the chapters of the story. In the case where the
215							story only takes one page, that will be the chapter.
216							In the case where multiple URLs have been passed in, it will be those URLs.
217
218							=item title
219
220							The title of the story.
221
222							=back
223
224							It may also return additional information, such as Summary.
225
226							=cut
227
228							sub parse_toc {
229	0			0	1		my $self = shift;
230	0						my %args = (
231							url=>'',
232							content=>'',
233							@_
234							);
235
236	0						my $content = $args{content};
237	0						my $user = '';
238	0						my $is_community = 0;
239	0	0					if ($args{url} =~ m{http://([-\w]+)\.dreamwidth\.org})
240							{
241	0						$user = $1;
242							}
243	0	0					if ($user eq 'community')
244							{
245	0						$is_community = 1;
246	0						$user = '';
247	0	0					if ($args{url} =~ m{http://community\.dreamwidth\.org/([-\w]+)})
248							{
249	0						$user = $1;
250							}
251							}
252
253	0						my %info = ();
254	0						$info{url} = $args{url};
255
256	0						$info{toc_first} = 1;
257
258	0						my $title = $self->parse_title(%args);
259	0						$title =~ s/${user}:\s*//;
260	0						$info{title} = $title;
261
262	0						my $summary = $self->parse_summary(%args);
263	0						$summary =~ s/"/'/g;
264	0						$info{summary} = $summary;
265
266	0						$info{universe} = $self->parse_universe(%args);
267
268	0						my $author = $self->parse_author(%args);
269	0	0					if (!$author)
270							{
271	0						$author = $user;
272							}
273	0						$info{author} = $author;
274
275	0						my $characters = $self->parse_characters(%args);
276	0						$info{characters} = $characters;
277
278	0						$info{chapters} = $self->parse_chapter_urls(%args,
279							user=>$user,
280							is_community=>$is_community);
281
282	0						return %info;
283							} # parse_toc
284
285							=head2 parse_chapter_urls
286
287							Figure out the URLs for the chapters of this story.
288
289							=cut
290							sub parse_chapter_urls {
291	0			0	1		my $self = shift;
292	0						my %args = (
293							url=>'',
294							content=>'',
295							@_
296							);
297	0						my $content = $args{content};
298	0						my $user = $args{user};
299	0						my @chapters = ();
300	0	0					if (defined $args{urls})
301							{
302	0						@chapters = @{$args{urls}};
	0
303	0						for (my $i = 0; $i < @chapters; $i++)
304							{
305	0						$chapters[$i] = sprintf('%s?format=light', $chapters[$i]);
306							}
307							}
308	0	0	0				if (@chapters == 1 and $user)
309							{
310	0	0					warn "user=$user\n" if ($self->{verbose} > 1);
311	0	0					if ($args{is_community})
312							{
313	0						while ($content =~
314							m/href="(http:\/\/community\.dreamwidth\.org\/${user}\/\d+.html)(#cutid\d)?">/sg)
315							{
316	0						my $ch_url = $1;
317	0	0					if ($ch_url ne $args{url})
318							{
319	0	0					warn "chapter=$ch_url\n" if ($self->{verbose} > 1);
320	0						push @chapters, "${ch_url}?format=light";
321							}
322							}
323							}
324							else
325							{
326	0						while ($content =~
327							m/href="(http:\/\/${user}\.dreamwidth\.org\/\d+.html)(#cutid\d)?">/sg)
328							{
329	0						my $ch_url = $1;
330	0	0					if ($ch_url ne $args{url})
331							{
332	0	0					warn "chapter=$ch_url\n" if ($self->{verbose} > 1);
333	0						push @chapters, "${ch_url}?format=light";
334							}
335							}
336							}
337							}
338
339	0						return \@chapters;
340							} # parse_chapter_urls
341
342							=head2 parse_author
343
344							Get the author from the content
345
346							=cut
347							sub parse_author {
348	0			0	1		my $self = shift;
349	0						my %args = (
350							url=>'',
351							content=>'',
352							@_
353							);
354
355	0						my $content = $args{content};
356	0						my $author = $self->SUPER::parse_author(%args);
357
358	0	0					if ($author =~ m#<span lj:user='\w+' style='white-space: nowrap;' class='ljuser'><a href='http://www\.dreamwidth\.org/profile\?user=\w+'><img src='http://www\.dreamwidth\.org/img/silk/identity/user\.png' alt='\[profile\] ' width='17' height='17' style='vertical-align: text-bottom; border: 0; padding-right: 1px;' /></a><a href='http://www\.dreamwidth\.org/profile\?user=\w+'><b>(.*?)</b></a></span>#)
		0
359							{
360	0						$author = $1;
361							}
362							elsif ($author =~ m#<a href='http://[-\w]+\.dreamwidth\.org/'><b>(.*?)</b></a>#)
363							{
364	0						$author = $1;
365							}
366	0						return $author;
367							} # parse_author
368
369							1; # End of WWW::FetchStory::Fetcher::Dreamwidth
370							__END__