blib/lib/App/RSS2Leafnode.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 75 | 1604 | 4.6 |
branch | 0 | 728 | 0.0 |
condition | 0 | 544 | 0.0 |
subroutine | 26 | 163 | 15.9 |
pod | 4 | 124 | 3.2 |
total | 105 | 3163 | 3.3 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Kevin Ryde | ||||||
2 | # | ||||||
3 | # This file is part of RSS2Leafnode. | ||||||
4 | # | ||||||
5 | # RSS2Leafnode is free software; you can redistribute it and/or modify it | ||||||
6 | # under the terms of the GNU General Public License as published by the Free | ||||||
7 | # Software Foundation; either version 3, or (at your option) any later | ||||||
8 | # version. | ||||||
9 | # | ||||||
10 | # RSS2Leafnode is distributed in the hope that it will be useful, but | ||||||
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
12 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||||||
13 | # for more details. | ||||||
14 | # | ||||||
15 | # You should have received a copy of the GNU General Public License along | ||||||
16 | # with RSS2Leafnode. If not, see |
||||||
17 | |||||||
18 | |||||||
19 | # maybe: | ||||||
20 | # location links | ||||||
21 | # http://maps.google.com/maps?ll=-35.066667,148.1 | ||||||
22 | # http://maps.google.com/maps?ll=-35.066667,148.1&spn=0.01,0.01&t=m | ||||||
23 | # | ||||||
24 | # | ||||||
25 | # when site has mutliple names for a page, relative or absolute | ||||||
26 | |||||||
27 | |||||||
28 | |||||||
29 | package App::RSS2Leafnode; | ||||||
30 | 1 | 1 | 1399 | use 5.010; | |||
1 | 4 | ||||||
1 | 52 | ||||||
31 | 1 | 1 | 7 | use strict; | |||
1 | 1 | ||||||
1 | 37 | ||||||
32 | 1 | 1 | 6 | use warnings; | |||
1 | 2 | ||||||
1 | 30 | ||||||
33 | 1 | 1 | 6 | use Carp; | |||
1 | 1 | ||||||
1 | 101 | ||||||
34 | 1 | 1 | 770 | use Encode; | |||
1 | 13638 | ||||||
1 | 114 | ||||||
35 | 1 | 1 | 3643 | use Hash::Util::FieldHash; | |||
1 | 1270 | ||||||
1 | 86 | ||||||
36 | 1 | 1 | 10 | use List::Util 'min', 'max'; | |||
1 | 1 | ||||||
1 | 123 | ||||||
37 | 1 | 1 | 828 | use List::MoreUtils; | |||
1 | 13194 | ||||||
1 | 13 | ||||||
38 | 1 | 1 | 1276 | use POSIX (); # ENOENT, etc | |||
1 | 8814 | ||||||
1 | 45 | ||||||
39 | 1 | 1 | 20 | use Scalar::Util; | |||
1 | 2 | ||||||
1 | 54 | ||||||
40 | 1 | 1 | 637 | use Text::Trim 1.02; # version 1.02 for undef support | |||
1 | 708 | ||||||
1 | 181 | ||||||
41 | 1 | 1 | 2343 | use URI; | |||
1 | 8155 | ||||||
1 | 43 | ||||||
42 | 1 | 1 | 683 | use HTML::Entities::Interpolate; | |||
1 | 9374 | ||||||
1 | 8 | ||||||
43 | |||||||
44 | 1 | 1 | 568 | use App::RSS2Leafnode::XML::Twig::Other; | |||
1 | 2 | ||||||
1 | 61 | ||||||
45 | |||||||
46 | # version 1.17 for __p(), and version 1.16 for turn_utf_8_on() | ||||||
47 | 1 | 1 | 820 | use Locale::TextDomain 1.17; | |||
1 | 9806 | ||||||
1 | 7 | ||||||
48 | 1 | 1 | 11010 | use Locale::TextDomain ('App-RSS2Leafnode'); | |||
1 | 3 | ||||||
1 | 5 | ||||||
49 | BEGIN { | ||||||
50 | 1 | 1 | 42 | use Locale::Messages; | |||
1 | 3 | ||||||
1 | 84 | ||||||
51 | 1 | 1 | 6 | Locale::Messages::bind_textdomain_codeset ('App-RSS2Leafnode','UTF-8'); | |||
52 | 1 | 31 | Locale::Messages::bind_textdomain_filter ('App-RSS2Leafnode', | ||||
53 | \&Locale::Messages::turn_utf_8_on); | ||||||
54 | } | ||||||
55 | |||||||
56 | # uncomment this to run the ### lines | ||||||
57 | # use Smart::Comments; | ||||||
58 | |||||||
59 | our $VERSION; | ||||||
60 | BEGIN { | ||||||
61 | 1 | 1 | 1093 | $VERSION = 78; | |||
62 | } | ||||||
63 | |||||||
64 | ## no critic (ProhibitFixedStringMatches) | ||||||
65 | |||||||
66 | |||||||
67 | # Cribs: | ||||||
68 | # | ||||||
69 | # RSS | ||||||
70 | # http://my.netscape.com/publish/help/ | ||||||
71 | # RSS 0.9 spec. | ||||||
72 | # http://my.netscape.com/publish/help/mnn20/quickstart.html | ||||||
73 | # RSS 0.91 spec. | ||||||
74 | # http://purl.org/rss/1.0/ | ||||||
75 | # RSS 1.0 spec. | ||||||
76 | # http://www.rssboard.org/rss-specification | ||||||
77 | # http://www.rssboard.org/files/rss-2.0-sample.xml | ||||||
78 | # RSS 2.0 spec and sample. | ||||||
79 | # | ||||||
80 | # http://www.rssboard.org/rss-profile | ||||||
81 | # "Best practices." | ||||||
82 | # | ||||||
83 | # Dublin Core | ||||||
84 | # RFC 5013 -- summary | ||||||
85 | # http://dublincore.org/documents/dcmi-terms/ -- dc/terms | ||||||
86 | # | ||||||
87 | # Atom | ||||||
88 | # RFC 4287 -- Atom spec | ||||||
89 | # RFC 3339 -- ISO timestamps as used in Atom | ||||||
90 | # RFC 4685 -- "thr" threading extensions | ||||||
91 | # RFC 4946 -- | ||||||
92 | # RFC 5005 -- etc paging and archiving | ||||||
93 | # http://diveintomark.org/archives/2004/05/28/howto-atom-id | ||||||
94 | # Making an |
||||||
95 | # http://www.iana.org/assignments/link-relations/link-relations.xhtml | ||||||
96 | # assigned values | ||||||
97 | # | ||||||
98 | # RSS Modules: | ||||||
99 | # http://www.meatballwiki.org/wiki/ModWiki -- wiki | ||||||
100 | # http://web.resource.org/rss/1.0/modules/slash/ | ||||||
101 | # http://code.google.com/apis/feedburner/feedburner_namespace_reference.html | ||||||
102 | # http://backend.userland.com/creativeCommonsRSSModule | ||||||
103 | # | ||||||
104 | # http://web.resource.org/rss/1.0/modules/content/ | ||||||
105 | # http://www.rssboard.org/rss-profile#namespace-elements-content | ||||||
106 | # http://validator.w3.org/feed/docs/warning/NeedDescriptionBeforeContent.html | ||||||
107 | # |
||||||
108 | # | ||||||
109 | # http://www.apple.com/itunes/podcasts/specs.html | ||||||
110 | # http://www.feedforall.com/itunes.htm | ||||||
111 | # http://www.w3.org/2003/01/geo/wgs84_pos -- |
||||||
112 | # http://www.georss.org/ | ||||||
113 | # http://www.georss.org/Encodings | ||||||
114 | # http://www.georss.org/atom | ||||||
115 | # http://www.georss.org/rdf_rss1 | ||||||
116 | # | ||||||
117 | # http://activitystrea.ms/specs/atom/1.0/ | ||||||
118 | # activity: | ||||||
119 | # http://prismstandard.org/namespaces/basic/2.0/ | ||||||
120 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
121 | # Prism | ||||||
122 | # | ||||||
123 | # URIs | ||||||
124 | # RFC 1738, RFC 2396, RFC 3986 -- URI formats (news/nntp in 1738) | ||||||
125 | # draft-ellermann-news-nntp-uri-11.txt -- news/nntp update | ||||||
126 | # RFC 2732 -- ipv6 "[]" hostnames | ||||||
127 | # RFC 2141 -- urn: | ||||||
128 | # RFC 4122 -- uuid format (as under urn:uuid:) | ||||||
129 | # RFC 4151 -- tag: | ||||||
130 | # RFC 1034, RFC 1123 -- domain names | ||||||
131 | # RFC 2606 -- reserved domain names ".invalid" | ||||||
132 | # | ||||||
133 | # XML | ||||||
134 | # http://www.w3.org/TR/xmlbase/ -- xml:base | ||||||
135 | # RFC 3023 text/xml etc media types | ||||||
136 | # | ||||||
137 | # Mail Messages | ||||||
138 | # RFC 850, RFC 1036 | ||||||
139 | # -- News message format, inc headers and rnews format | ||||||
140 | # RFC 2822, RFC 5322, RFC 5536 | ||||||
141 | # -- Email message format. | ||||||
142 | # RFC 2076, RFC 4021 -- headers summary. | ||||||
143 | # RFC 2557 -- MHTML Content-Location | ||||||
144 | # RFC 1864 -- Content-MD5 header | ||||||
145 | # RFC 2369 -- List-Post header and friends | ||||||
146 | # http://www.ietf.org/proceedings/98dec/I-D/draft-ietf-drums-mail-followup-to-00.txt | ||||||
147 | # Draft "Mail-Followup-To" header. | ||||||
148 | # | ||||||
149 | # RFC 1327 -- X.400 to RFC822 introducing Language header | ||||||
150 | # RFC 3282 -- Content-Language header | ||||||
151 | # RFC 1766, RFC 3066, RFC 4646 -- language tag form | ||||||
152 | # | ||||||
153 | # | ||||||
154 | # NNTP | ||||||
155 | # RFC 977 -- NNTP | ||||||
156 | # RFC 2616 -- HTTP/1.1 Accept-Encoding header | ||||||
157 | # RFC 2980 -- NNTP extensions | ||||||
158 | # | ||||||
159 | # RFC 4642 -- NNTP with SSL | ||||||
160 | # | ||||||
161 | # For XML in Perl there's several ways to do it! | ||||||
162 | # - XML::Parser looks likely for stream/event processing, but its builtin | ||||||
163 | # tree mode is very basic. | ||||||
164 | # - XML::Twig extends XML::Parser to a good tree, though the docs are | ||||||
165 | # slightly light on. It only does a subset of "XPath" but the | ||||||
166 | # functions/regexps are more perl-like for matching and there's various | ||||||
167 | # handy shortcuts for common operations. | ||||||
168 | # - XML::LibXML is the full blown libxml and is rather a lot to learn. | ||||||
169 | # Because it's mainly C it's not easy to find where or how you're going | ||||||
170 | # wrong when your code doesn't work. libxml also seems stricter about | ||||||
171 | # namespace matters than XML::Parser/XML::Twig. | ||||||
172 | # - XML::RSS uses XML::Parser to build its own style tree of RSS, | ||||||
173 | # including unifying differences among RSS/RDF 0.91, 1.0 and 2.0. | ||||||
174 | # Nested elements seem to need specific handling in its code, which can | ||||||
175 | # make it tricky for sub-element oddities. A fair amount of it is about | ||||||
176 | # writing RSS too. | ||||||
177 | # - XML::RSS::LibXML uses libxml for XML::RSS compatible reading and | ||||||
178 | # writing. It seems to do better on unrecognised sub-elements. | ||||||
179 | # - XML::Atom offers the basic Atom elements but doesn't seem to give | ||||||
180 | # access to extra stuff that might be in a feed. | ||||||
181 | # - XML::Feed tries to unify XML::RSS and XML::Atom but again doesn't seem | ||||||
182 | # to go much beyond the basics. It too is geared towards writing as | ||||||
183 | # well as reading. | ||||||
184 | # - XML::TreePP pure perl parser to a hash tree. | ||||||
185 | # | ||||||
186 | # The choice of XML::Twig is based on wanting both RSS and Atom, but | ||||||
187 | # XML::Feed not going far enough. Tree processing is easier than stream, | ||||||
188 | # and an RSS isn't meant to be huge. A tree may help if channel fields | ||||||
189 | # follow items or something equally unnatural, but will probably assume that | ||||||
190 | # doesn't happen and look at the twig partial-tree mode. Between the tree | ||||||
191 | # styles XML::LibXML is harder to get into than Twig. | ||||||
192 | # | ||||||
193 | |||||||
194 | #------------------------------------------------------------------------------ | ||||||
195 | # mostly generic | ||||||
196 | |||||||
197 | # return $str with a newline at the end, if it doesn't already have one | ||||||
198 | sub str_ensure_newline { | ||||||
199 | 0 | 0 | 0 | my ($str) = @_; | |||
200 | 0 | 0 | if ($str !~ /\n$/) { $str .= "\n" } | ||||
0 | |||||||
201 | 0 | return $str; | |||||
202 | } | ||||||
203 | |||||||
204 | sub md5_of_utf8 { | ||||||
205 | 0 | 0 | 0 | my ($str) = @_; | |||
206 | 0 | require Digest::MD5; | |||||
207 | 0 | return Digest::MD5::md5_base64 (Encode::encode_utf8 ($str)); | |||||
208 | } | ||||||
209 | |||||||
210 | sub is_empty { | ||||||
211 | 0 | 0 | 0 | my ($str) = @_; | |||
212 | 0 | 0 | return (! defined $str || $str =~ /^\s*$/); | ||||
213 | } | ||||||
214 | sub is_non_empty { | ||||||
215 | 0 | 0 | 0 | my ($str) = @_; | |||
216 | 0 | return ! is_empty($str); | |||||
217 | } | ||||||
218 | sub non_empty { | ||||||
219 | 0 | 0 | 0 | my ($str) = @_; | |||
220 | 0 | 0 | return (is_non_empty($str) ? $str : ()); | ||||
221 | } | ||||||
222 | |||||||
223 | sub join_non_empty { | ||||||
224 | 0 | 0 | 0 | my $sep = shift; | |||
225 | 0 | return non_empty (join($sep, map {non_empty($_)} @_)); | |||||
0 | |||||||
226 | } | ||||||
227 | |||||||
228 | sub collapse_whitespace { | ||||||
229 | 0 | 0 | 0 | my ($str) = @_; | |||
230 | 0 | 0 | defined $str or return undef; | ||||
231 | 0 | 0 | $str =~ s/(\s+)/($1 eq ' ' ? $1 : ' ')/ge; | ||||
0 | |||||||
232 | 0 | return Text::Trim::trim($str); | |||||
233 | } | ||||||
234 | |||||||
235 | # return true if $str is entirely ascii chars 0 to 127 | ||||||
236 | sub is_ascii { | ||||||
237 | 0 | 0 | 0 | my ($str) = @_; | |||
238 | 0 | return ($str !~ /[^[:ascii:]]/); | |||||
239 | } | ||||||
240 | |||||||
241 | # Return the number of lines in $str. | ||||||
242 | # If $str ends with a newline then that counts as the last line, so "xyz\n" | ||||||
243 | # is one line. If $str doesn't end with a newline then the final chars are | ||||||
244 | # a line, so "abc\ndef" is two lines. | ||||||
245 | sub str_count_lines { | ||||||
246 | 0 | 0 | 0 | my ($str) = @_; | |||
247 | 0 | 0 | return scalar($str =~ tr/\n//) + (length($str) && substr($str,-1) ne "\n"); | ||||
248 | } | ||||||
249 | |||||||
250 | sub File_Temp_DEBUG_saver { | ||||||
251 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
252 | 0 | require Scope::Guard; | |||||
253 | 0 | require File::Temp; | |||||
254 | 0 | my $oldval = $File::Temp::DEBUG; | |||||
255 | 0 | 0 | my $ret = Scope::Guard->new (sub { $File::Temp::DEBUG = $oldval }); | ||||
0 | |||||||
256 | 0 | $File::Temp::DEBUG = $newval; | |||||
257 | 0 | return $ret; | |||||
258 | } | ||||||
259 | sub MIME_Tools_debugging { | ||||||
260 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
261 | 0 | require Scope::Guard; | |||||
262 | 0 | require MIME::Tools; | |||||
263 | 0 | my $oldval = MIME::Tools->debugging; | |||||
264 | 0 | 0 | my $ret = Scope::Guard->new (sub { MIME::Tools->debugging($oldval) }); | ||||
0 | |||||||
265 | 0 | MIME::Tools->debugging ($newval); | |||||
266 | 0 | return $ret; | |||||
267 | } | ||||||
268 | |||||||
269 | sub homedir { | ||||||
270 | # my ($self) = @_; | ||||||
271 | 0 | 0 | 0 | require File::HomeDir; | |||
272 | # call each time just in case playing tricks with $ENV{HOME} in conf file | ||||||
273 | 0 | 0 | return File::HomeDir->my_home | ||||
274 | // croak 'File::HomeDir says you have no home directory'; | ||||||
275 | } | ||||||
276 | |||||||
277 | #------------------------------------------------------------------------------ | ||||||
278 | # Number::Format for sizes in bytes | ||||||
279 | |||||||
280 | use constant::defer NUMBER_FORMAT => sub { | ||||||
281 | 0 | 0 | require Number::Format; | ||||
282 | 0 | 0 | Number::Format->VERSION(1.5); # for format_bytes() options params | ||||
283 | 0 | 0 | return Number::Format->new | ||||
284 | (-kilo_suffix => __p('number-format-kilobytes','K'), | ||||||
285 | -mega_suffix => __p('number-format-megabytes','M'), | ||||||
286 | -giga_suffix => __p('number-format-gigabytes','G')); | ||||||
287 | 1 | 1 | 784 | }; | |||
1 | 973 | ||||||
1 | 11 | ||||||
288 | |||||||
289 | sub format_size_in_bytes { | ||||||
290 | 0 | 0 | 0 | my ($self, $length) = @_; | |||
291 | 0 | 0 | if ($length >= 2000) { | ||||
292 | 0 | return $self->NUMBER_FORMAT()->format_bytes ($length, precision => 1); | |||||
293 | } else { | ||||||
294 | 0 | return __x('{size} bytes', size => $length); | |||||
295 | } | ||||||
296 | } | ||||||
297 | |||||||
298 | #------------------------------------------------------------------------------ | ||||||
299 | |||||||
300 | sub new { | ||||||
301 | 0 | 0 | 1 | my $class = shift; | |||
302 | 0 | return bless { | |||||
303 | # config variables | ||||||
304 | verbose => 0, | ||||||
305 | render => 0, | ||||||
306 | render_width => 60, | ||||||
307 | rss_get_links => 0, | ||||||
308 | rss_get_comments => 0, | ||||||
309 | rss_newest_only => 0, | ||||||
310 | get_icon => 0, | ||||||
311 | html_charset_from_content => 0, | ||||||
312 | |||||||
313 | # secret extra | ||||||
314 | msgidextra => '', | ||||||
315 | |||||||
316 | @_, | ||||||
317 | }, $class; | ||||||
318 | } | ||||||
319 | |||||||
320 | sub command_line { | ||||||
321 | 0 | 0 | 1 | my ($self) = @_; | |||
322 | |||||||
323 | 0 | my $done_version; | |||||
324 | 0 | require Getopt::Long; | |||||
325 | 0 | Getopt::Long::Configure ('no_ignore_case'); | |||||
326 | Getopt::Long::GetOptions | ||||||
327 | ('config=s' => \$self->{'config_filename'}, | ||||||
328 | 'verbose:1' => \$self->{'verbose'}, | ||||||
329 | 'version' => sub { | ||||||
330 | 0 | 0 | say __x("RSS2Leafnode version {version}", version => $VERSION); | ||||
331 | 0 | $done_version = 1; | |||||
332 | }, | ||||||
333 | 'bareversion' => sub { | ||||||
334 | 0 | 0 | say $VERSION; | ||||
335 | 0 | $done_version = 1; | |||||
336 | }, | ||||||
337 | 'msgid=s' => \$self->{'msgidextra'}, | ||||||
338 | 'help|?' => sub { | ||||||
339 | 0 | 0 | say __x("rss2leafnode [--options]"); | ||||
340 | 0 | say __x(" --config=filename configuration file (default ~/.rss2leafnode.conf)"); | |||||
341 | 0 | say __x(" --help print this help"); | |||||
342 | 0 | say __x(" --verbose describe what's done"); | |||||
343 | 0 | say __x(" --verbose=2 show technical details of what's done"); | |||||
344 | 0 | say __x(" --version print program version number"); | |||||
345 | 0 | exit 0; | |||||
346 | 0 | 0 | }) or return 1; | ||||
347 | 0 | 0 | if (! $done_version) { | ||||
348 | 0 | $self->do_config_file; | |||||
349 | 0 | $self->nntp_close; | |||||
350 | } | ||||||
351 | 0 | return 0; | |||||
352 | } | ||||||
353 | |||||||
354 | sub verbose { | ||||||
355 | 0 | 0 | 0 | my $self = shift; | |||
356 | 0 | my $count = shift; | |||||
357 | 0 | 0 | if ($self->{'verbose'} >= $count) { | ||||
358 | 0 | say @_; | |||||
359 | } | ||||||
360 | } | ||||||
361 | |||||||
362 | sub config_filename { | ||||||
363 | 0 | 0 | 0 | my ($self) = @_; | |||
364 | 0 | 0 | return $self->{'config_filename'} // do { | ||||
365 | 0 | require File::Spec; | |||||
366 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.conf'); | |||||
367 | }; | ||||||
368 | } | ||||||
369 | sub status_filename { | ||||||
370 | 0 | 0 | 0 | my ($self) = @_; | |||
371 | 0 | 0 | return $self->{'status_filename'} // do { | ||||
372 | 0 | require File::Spec; | |||||
373 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.status'); | |||||
374 | }; | ||||||
375 | } | ||||||
376 | |||||||
377 | sub do_config_file { | ||||||
378 | 0 | 0 | 0 | my ($self) = @_; | |||
379 | 0 | my @guards; | |||||
380 | |||||||
381 | 0 | 0 | open STDERR, '>&STDOUT' or die "Oops, can't join STDERR to STDOUT"; | ||||
382 | |||||||
383 | # File::Temp::DEBUG for possible temp files used by HTML::FormatExternal | ||||||
384 | # these debugs turned on only for the duration of running the config file | ||||||
385 | # and the downloading etc in it | ||||||
386 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
387 | 0 | push @guards, $self->File_Temp_DEBUG_saver(1); | |||||
388 | 0 | push @guards, $self->MIME_Tools_debugging(1); | |||||
389 | } | ||||||
390 | |||||||
391 | 0 | my $config_filename = $self->config_filename; | |||||
392 | 0 | $self->verbose (1, "config: ", $config_filename); | |||||
393 | |||||||
394 | 0 | require App::RSS2Leafnode::Conf; | |||||
395 | 0 | local $App::RSS2Leafnode::Conf::r2l = $self; | |||||
396 | 0 | 0 | if (! defined (do { package App::RSS2Leafnode::Conf; | ||||
397 | 0 | do $config_filename; | |||||
398 | })) { | ||||||
399 | 0 | 0 | if (! -e $config_filename) { | ||||
400 | 0 | croak "rss2leafnode: config file $config_filename doesn't exist\n"; | |||||
401 | } else { | ||||||
402 | 0 | croak $@; | |||||
403 | } | ||||||
404 | } | ||||||
405 | } | ||||||
406 | |||||||
407 | #------------------------------------------------------------------------------ | ||||||
408 | # LWP stuff | ||||||
409 | |||||||
410 | sub user_agent { | ||||||
411 | 0 | 0 | 0 | my ($self) = @_; | |||
412 | 0 | 0 | if (defined $self->{'user_agent'}) { | ||||
413 | 0 | return $self->{'user_agent'}; | |||||
414 | } else { | ||||||
415 | 0 | return 'RSS2leafnode/' . $self->VERSION . ' '; | |||||
416 | } | ||||||
417 | } | ||||||
418 | |||||||
419 | sub ua { | ||||||
420 | 0 | 0 | 0 | my ($self) = @_; | |||
421 | 0 | 0 | return ($self->{'ua'} ||= do { | ||||
422 | 0 | require LWP::UserAgent; | |||||
423 | 0 | LWP::UserAgent->VERSION(5.832); # 5.832 for content_charset() | |||||
424 | |||||||
425 | # one connection kept alive | ||||||
426 | 0 | my $ua = LWP::UserAgent->new (keep_alive => 1); | |||||
427 | 0 | Scalar::Util::weaken ($ua->{(__PACKAGE__)} = $self); | |||||
428 | 0 | $ua->agent ($self->user_agent); | |||||
429 | |||||||
430 | 0 | Scalar::Util::weaken (my $weak_self = $self); | |||||
431 | 0 | $ua->add_handler (request_send => \&lwp_request_send__verbose); | |||||
432 | $ua->add_handler (response_done => sub { | ||||||
433 | 0 | 0 | lwp_response_done__check_md5 ($weak_self, @_); | ||||
434 | 0 | }); | |||||
435 | |||||||
436 | # ask for everything $resp->decode() / $resp->decoded_content() can cope | ||||||
437 | # with, in particular "gzip" and "deflate" compression if Compress::Zlib | ||||||
438 | # etc is available | ||||||
439 | # | ||||||
440 | 0 | require HTTP::Message; | |||||
441 | 0 | my $decodable = HTTP::Message::decodable(); | |||||
442 | 0 | $self->verbose (2, "HTTP decodable: ", $decodable); | |||||
443 | 0 | $ua->default_header ('Accept-Encoding' => $decodable); | |||||
444 | |||||||
445 | 0 | $ua | |||||
446 | }); | ||||||
447 | } | ||||||
448 | |||||||
449 | sub lwp_request_send__verbose { | ||||||
450 | 0 | 0 | 0 | my ($req, $ua, $h) = @_; | |||
451 | 0 | my $self = $ua->{(__PACKAGE__)}; | |||||
452 | 0 | $self->verbose (2, "request_send:", $req->dump, "\n"); # extra newline | |||||
453 | 0 | return; # continue processing | |||||
454 | } | ||||||
455 | |||||||
456 | sub lwp_response_done__check_md5 { | ||||||
457 | 0 | 0 | 0 | my ($self, $resp, $ua, $h) = @_; | |||
458 | 0 | 0 | $self || return; | ||||
459 | ### lwp_response_done__check_md5() ... | ||||||
460 | 0 | 0 | my $want = $resp->header('Content-MD5') // do { | ||||
461 | 0 | $self->verbose (2, 'no Content-MD5 header'); | |||||
462 | 0 | return; | |||||
463 | }; | ||||||
464 | 0 | $resp->decode; | |||||
465 | 0 | my $cref = $resp->content_ref; | |||||
466 | 0 | require Digest::MD5; | |||||
467 | 0 | my $got = Digest::MD5::md5_hex($$cref); | |||||
468 | 0 | 0 | if ($got ne $want) { | ||||
469 | 0 | print __x("Warning, MD5 checksum mismatch on download {url}\n", | |||||
470 | url => $resp->request->uri); | ||||||
471 | } else { | ||||||
472 | 0 | $self->verbose(2, 'Content-MD5 ok'); | |||||
473 | } | ||||||
474 | } | ||||||
475 | |||||||
476 | # $resp is a HTTP::Response object. Modify its headers to apply our | ||||||
477 | # $html_charset_from_content option, which means if it's set then prefer the | ||||||
478 | # document's Content-Type over what the server says. | ||||||
479 | # | ||||||
480 | # The LWP::UserAgent parse_head option appends the document bits to | ||||||
481 | # the message headers. If the server and the document both offer a | ||||||
482 | # Content-Type then there's two, with the document one last, so all we have | ||||||
483 | # to do is change to make the last one the only one. | ||||||
484 | # | ||||||
485 | sub enforce_html_charset_from_content { | ||||||
486 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
487 | 0 | 0 | 0 | if ($self->{'html_charset_from_content'} | |||
488 | && $resp->headers->content_is_html) { | ||||||
489 | 0 | my $old = $resp->header('Content-Type'); | |||||
490 | 0 | $resp->header('Content-Type' => $resp->headers->content_type); | |||||
491 | |||||||
492 | 0 | $self->verbose (2, 'html_charset_from_content mangled Content-Type from'); | |||||
493 | 0 | $self->verbose (2, " from ", $old); | |||||
494 | 0 | $self->verbose (2, " to ", $resp->header('Content-Type')); | |||||
495 | 0 | $self->verbose (2, " giving charset ", $resp->content_charset); | |||||
496 | } | ||||||
497 | } | ||||||
498 | |||||||
499 | |||||||
500 | #------------------------------------------------------------------------------ | ||||||
501 | my %known; | ||||||
502 | |||||||
503 | # |
||||||
504 | # ENHANCE-ME: is this something to work into the skipdays? or a message expiry? | ||||||
505 | # | ||||||
506 | $known{'/channel/item/dcterms:valid'} = undef; | ||||||
507 | |||||||
508 | # |
||||||
509 | $known{'/channel/item/dcterms:audience'} = undef; | ||||||
510 | |||||||
511 | # |
||||||
512 | # |
||||||
513 | @known{qw(/channel/item/eq:depth | ||||||
514 | /channel/item/eq:seconds)} = (); | ||||||
515 | |||||||
516 | # rdf structure stuff | ||||||
517 | @known{qw(/channel/items | ||||||
518 | /channel/items/rdf:Seq | ||||||
519 | /channel/items/rdf:Seq/rdf:li)} = (); | ||||||
520 | |||||||
521 | @known{('/channel/cloud', | ||||||
522 | '/channel/link', | ||||||
523 | '/channel/docs', | ||||||
524 | '/channel/generator', | ||||||
525 | '/channel/rating', | ||||||
526 | '/channel/id', | ||||||
527 | '/channel/description', | ||||||
528 | '/channel/tagline', | ||||||
529 | '/channel/info', # atom something freeform | ||||||
530 | '/channel/itunes:summary', | ||||||
531 | '/channel/feedburner:info', | ||||||
532 | |||||||
533 | # nothing much in these as yet eg. rssboard | ||||||
534 | '/channel/item/sitemap:priority', | ||||||
535 | '/channel/item/sitemap:changefreq', | ||||||
536 | |||||||
537 | # feedburner junk | ||||||
538 | '/channel/feedburner:feedFlare', | ||||||
539 | |||||||
540 | # images | ||||||
541 | '/channel/itunes:owner', | ||||||
542 | '/channel/itunes:owner/itunes:name', | ||||||
543 | '/channel/itunes:owner/itunes:email', | ||||||
544 | |||||||
545 | '/channel/textInput', | ||||||
546 | '/channel/textInput/description', | ||||||
547 | '/channel/textInput/link', | ||||||
548 | '/channel/textInput/name', | ||||||
549 | '/channel/textInput/title', | ||||||
550 | '/channel/textinput', | ||||||
551 | '/channel/textinput/title', | ||||||
552 | '/channel/textinput/description', | ||||||
553 | '/channel/textinput/name', | ||||||
554 | '/channel/textinput/link', | ||||||
555 | |||||||
556 | '/channel/openSearch:totalResults', | ||||||
557 | '/channel/openSearch:startIndex', | ||||||
558 | '/channel/openSearch:itemsPerPage', | ||||||
559 | |||||||
560 | '/channel/item', | ||||||
561 | '/channel/item/source', | ||||||
562 | |||||||
563 | '/channel/item/twitter:source', | ||||||
564 | |||||||
565 | # something from radio free france | ||||||
566 | # eg. http://radiofrance-podcast.net/podcast09/rss_10193.xml | ||||||
567 | '/channel/item/podcastRF:businessReference', | ||||||
568 | |||||||
569 | # google documents stuff | ||||||
570 | '/channel/item/gd:extendedProperty', | ||||||
571 | |||||||
572 | # |
||||||
573 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
574 | 'channel/item/cb:statistics', | ||||||
575 | |||||||
576 | # |
||||||
577 | # eg. Fed Reserve http://www.federalreserve.gov/feeds/press_taf.xml | ||||||
578 | '/channel/item/cb:news', | ||||||
579 | |||||||
580 | # FIXME: |
||||||
581 | # Fed eg. http://www.federalreserve.gov/feeds/speeches.xml | ||||||
582 | '/channel/item/cb:speech', | ||||||
583 | |||||||
584 | # |
||||||
585 | # |
||||||
586 | # Fed eg. http://www.federalreserve.gov/feeds/ifdp.xml | ||||||
587 | # FIXME: except |
||||||
588 | '/channel/item/cb:paper', | ||||||
589 | '/channel/item/cb:event', | ||||||
590 | |||||||
591 | # |
||||||
592 | '/channel/item/media:hash', | ||||||
593 | |||||||
594 | # not sure what these are, but don't seem very interesting | ||||||
595 | '/channel/item/slate:slate_plus', # |
||||||
596 | '/channel/item/slate:paywall', # |
||||||
597 | '/channel/item/slate:sponsored', # |
||||||
598 | )} = (); | ||||||
599 | |||||||
600 | # weather | ||||||
601 | # '/channel/item/w:current', | ||||||
602 | # '/channel/item/w:forecast', | ||||||
603 | # '/channel/yweather:location', | ||||||
604 | # '/channel/yweather:units', | ||||||
605 | # '/channel/yweather:wind', | ||||||
606 | # '/channel/yweather:atmosphere', | ||||||
607 | # '/channel/yweather:astronomy', | ||||||
608 | # '/channel/item/yweather:condition', | ||||||
609 | # '/channel/item/yweather:forecast', | ||||||
610 | |||||||
611 | # --central-bank | ||||||
612 | # /channel/item/cb:statistics | ||||||
613 | # /channel/item/cb:statistics/cb:country | ||||||
614 | # /channel/item/cb:statistics/cb:institutionAbbrev | ||||||
615 | # /channel/item/cb:statistics/cb:exchangeRate | ||||||
616 | # /channel/item/cb:statistics/cb:exchangeRate/cb:value | ||||||
617 | # /channel/item/cb:statistics/cb:exchangeRate/cb:baseCurrency | ||||||
618 | # /channel/item/cb:statistics/cb:exchangeRate/cb:targetCurrency | ||||||
619 | # /channel/item/cb:statistics/cb:exchangeRate/cb:rateType | ||||||
620 | # /channel/item/cb:statistics/cb:exchangeRate/cb:observationPeriod | ||||||
621 | # /channel/item/cb:speech | ||||||
622 | # /channel/item/cb:speech/cb:simpleTitle | ||||||
623 | # /channel/item/cb:speech/cb:occurrenceDate | ||||||
624 | # /channel/item/cb:speech/cb:person | ||||||
625 | # /channel/item/cb:speech/cb:person/cb:givenName | ||||||
626 | # /channel/item/cb:speech/cb:person/cb:surname | ||||||
627 | # /channel/item/cb:speech/cb:person/cb:personalTitle | ||||||
628 | # /channel/item/cb:speech/cb:person/cb:nameAsWritten | ||||||
629 | # /channel/item/cb:speech/cb:person/cb:role | ||||||
630 | # /channel/item/cb:speech/cb:person/cb:role/cb:jobTitle | ||||||
631 | # /channel/item/cb:speech/cb:person/cb:role/cb:affiliation | ||||||
632 | # /channel/item/cb:speech/cb:venue | ||||||
633 | |||||||
634 | |||||||
635 | #------------------------------------------------------------------------------ | ||||||
636 | # dates | ||||||
637 | |||||||
638 | 1 | 1 | 1812 | use constant RFC822_STRFTIME_FORMAT => '%a, %d %b %Y %H:%M:%S %z'; | |||
1 | 2 | ||||||
1 | 2781 | ||||||
639 | |||||||
640 | # return a string which is current time in RFC 822 format | ||||||
641 | sub rfc822_time_now { | ||||||
642 | 0 | 0 | 0 | return POSIX::strftime (RFC822_STRFTIME_FORMAT, localtime(time())); | |||
643 | } | ||||||
644 | |||||||
645 | sub isodate_to_rfc822 { | ||||||
646 | 0 | 0 | 0 | my ($isodate) = @_; | |||
647 | 0 | 0 | if (! defined $isodate) { return undef; } | ||||
0 | |||||||
648 | 0 | my $date = $isodate; # the original goes through if unrecognised | |||||
649 | |||||||
650 | 0 | 0 | 0 | if ($isodate =~ /\dT\d/ || $isodate =~ /^\d{4}-\d{2}-\d{2}$/) { | |||
651 | # eg. "2000-01-01T12:00+00:00" | ||||||
652 | # "2000-01-01T12:00:00Z" | ||||||
653 | # "2000-01-01" | ||||||
654 | 0 | 0 | my $zonestr = ($isodate =~ s/([+-][0-9][0-9]):([0-9][0-9])$// ? " $1$2" | ||||
0 | |||||||
655 | : $isodate =~ s/Z$// ? ' +0000' | ||||||
656 | : ''); | ||||||
657 | 0 | require Date::Parse; | |||||
658 | 0 | my $time_t = Date::Parse::str2time($isodate); | |||||
659 | 0 | 0 | if (defined $time_t) { | ||||
660 | 0 | $date = POSIX::strftime ("%a, %d %b %Y %H:%M:%S$zonestr", | |||||
661 | localtime ($time_t)); | ||||||
662 | } | ||||||
663 | } | ||||||
664 | 0 | return $date; | |||||
665 | } | ||||||
666 | |||||||
667 | # Return an RFC822 date string, or undef if nothing known. | ||||||
668 | # This gets a sensible sort-by-date in the newsreader. | ||||||
669 | # |
||||||
670 | # redundant. | ||||||
671 | # | ||||||
672 | sub item_to_date { | ||||||
673 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
674 | 0 | my $date; | |||||
675 | 0 | foreach my $elt ($item, item_to_channel($item)) { | |||||
676 | 0 | 0 | $date = (non_empty ($elt->first_child_trimmed_text('pubDate')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
677 | // non_empty ($elt->first_child_trimmed_text('dc:date')) | ||||||
678 | // non_empty ($elt->first_child_trimmed_text('jf:creationDate')) | ||||||
679 | # Atom | ||||||
680 | // non_empty ($elt->first_child_trimmed_text('modified')) | ||||||
681 | // non_empty ($elt->first_child_trimmed_text('updated')) | ||||||
682 | // non_empty ($elt->first_child_trimmed_text('issued')) | ||||||
683 | // non_empty ($elt->first_child_trimmed_text('dcterms:issued')) | ||||||
684 | // non_empty ($elt->first_child_trimmed_text('created')) | ||||||
685 | # channel | ||||||
686 | // non_empty ($elt->first_child_trimmed_text('lastBuildDate')) | ||||||
687 | # Atom | ||||||
688 | // non_empty ($elt->first_child_trimmed_text('published')) | ||||||
689 | # from Nature have dc:date anyway | ||||||
690 | // non_empty ($elt->first_child_trimmed_text('prism:publicationDate')) | ||||||
691 | ); | ||||||
692 | 0 | 0 | last if defined $date; | ||||
693 | } | ||||||
694 | 0 | return isodate_to_rfc822($date); | |||||
695 | } | ||||||
696 | @known{qw(/channel/dc:date | ||||||
697 | /channel/lastBuildDate | ||||||
698 | /channel/pubDate | ||||||
699 | /channel/updated | ||||||
700 | /channel/modified | ||||||
701 | |||||||
702 | /channel/item/dc:date | ||||||
703 | /channel/item/pubDate | ||||||
704 | /channel/item/updated | ||||||
705 | /channel/item/published | ||||||
706 | /channel/item/modified | ||||||
707 | /channel/item/created | ||||||
708 | /channel/item/issued | ||||||
709 | /channel/item/dcterms:issued | ||||||
710 | |||||||
711 | /channel/item/jf:creationDate --java-locale-human-readable | ||||||
712 | /channel/item/jf:modificationDate | ||||||
713 | /channel/item/jf:date --free-form | ||||||
714 | )} = (); | ||||||
715 | |||||||
716 | |||||||
717 | sub item_to_timet { | ||||||
718 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
719 | ### item_to_timet() ... | ||||||
720 | 0 | 0 | my $str = $self->item_to_date($item) | ||||
721 | // return - POSIX::DBL_MAX(); # no date fields | ||||||
722 | |||||||
723 | 0 | require Date::Parse; | |||||
724 | ### $str | ||||||
725 | # print Date::Parse::str2time($str)," $str\n"; | ||||||
726 | return (Date::Parse::str2time($str) | ||||||
727 | 0 | 0 | // do { | ||||
728 | 0 | say __x('Unrecognised date "{date}" from {url}', | |||||
729 | date => $str, | ||||||
730 | url => $self->{'uri'}); | ||||||
731 | 0 | - POSIX::DBL_MAX(); | |||||
732 | }); | ||||||
733 | } | ||||||
734 | |||||||
735 | #----------------------------------------------------------------------------- | ||||||
736 | # Message-ID | ||||||
737 | |||||||
738 | # Return a message ID for something at $uri, optionally uniquified by $str. | ||||||
739 | # $uri is either a URI object or a url string. | ||||||
740 | # Weird chars in $uri or $str are escaped as necessary. | ||||||
741 | # Secret $self->{'msgidextra'} can make different message ids for the same | ||||||
742 | # content when testing. | ||||||
743 | # | ||||||
744 | # The path from $uri is incorporated in the result. fetch_html() needs this | ||||||
745 | # since the ETag identifier is only per-url, not globally unique. Suspect | ||||||
746 | # fetch_rss() needs it for a guid too (a non-permaLink one), as think the | ||||||
747 | # guid is only unique within the particular $uri feed, not globally and not | ||||||
748 | # even across multiple feeds on the same server. | ||||||
749 | # | ||||||
750 | sub url_to_msgid { | ||||||
751 | 0 | 0 | 0 | my ($self, $url, $str) = @_; | |||
752 | |||||||
753 | 0 | my $host; | |||||
754 | 0 | my $pathbit = $url; | |||||
755 | |||||||
756 | 0 | 0 | if (my $uri = eval { URI->new($url) }) { | ||||
0 | |||||||
757 | 0 | $uri = $uri->canonical; | |||||
758 | 0 | 0 | if ($uri->can('host')) { | ||||
0 | |||||||
759 | 0 | $host = $uri->host; | |||||
760 | 0 | $uri->host(''); | |||||
761 | 0 | $pathbit = $uri->as_string; | |||||
762 | |||||||
763 | # If the $uri schema has a host part but it's empty or "localhost" | ||||||
764 | # then try expanding that to hostname(). | ||||||
765 | # | ||||||
766 | # $uri schemas without a host part, like "urn:" in an Atom |
||||||
767 | # get hostname(), since want the generated msgid to come out the same | ||||||
768 | # if such a urn: appears from different downloaded locations. | ||||||
769 | # | ||||||
770 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
771 | 0 | require Sys::Hostname; | |||||
772 | 0 | eval { $host = Sys::Hostname::hostname() }; | |||||
0 | |||||||
773 | } | ||||||
774 | |||||||
775 | } elsif ($uri->can('authority')) { | ||||||
776 | # the "authority" part of a "tag:" schema | ||||||
777 | 0 | $host = $uri->authority; | |||||
778 | 0 | $uri->authority(''); | |||||
779 | 0 | $pathbit = $uri->as_string; | |||||
780 | } | ||||||
781 | } | ||||||
782 | |||||||
783 | # $host can be empty if running from a file:/// | ||||||
784 | # "localhost" is a bit bogus and in particular leafnode won't accept it. | ||||||
785 | # ".invalid" as per RFC 2606 | ||||||
786 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
787 | 0 | $host = 'rss2leafnode.invalid'; | |||||
788 | } | ||||||
789 | |||||||
790 | # ipv6 dotted hostname "[1234:5678::0000]" -> "1234.5678.0000..ipv6", | ||||||
791 | # because [ and : are not allowed (RFC 2822 "Atom" atext) | ||||||
792 | # $uri->canonical above lower cases any hex, for consistency | ||||||
793 | 0 | 0 | if (($host =~ s/^\[|\]$//g) | ($host =~ tr/:/./)) { | ||||
794 | 0 | $host .= '.ipv6'; | |||||
795 | } | ||||||
796 | |||||||
797 | # leafnode 2.0.0.alpha20070602a seems to insist on a "." in the host name | ||||||
798 | 0 | 0 | unless ($host =~ /\./) { | ||||
799 | 0 | $host .= '.withadot'; | |||||
800 | } | ||||||
801 | |||||||
802 | 0 | return ('<' | |||||
803 | . msgid_chars(join_non_empty('.', | ||||||
804 | "rss2leafnode" . $self->{'msgidextra'}, | ||||||
805 | $pathbit, | ||||||
806 | $str)) | ||||||
807 | . '@' | ||||||
808 | . msgid_chars($host) | ||||||
809 | . '>'); | ||||||
810 | } | ||||||
811 | # msgid_chars($str) returns $str with invalid Message-ID characters munged. | ||||||
812 | # Per RFC850 must be printing ascii and not < > or whitespace, but for | ||||||
813 | # safety reduce that a bit, in particular excluding ' and ". | ||||||
814 | sub msgid_chars { | ||||||
815 | 0 | 0 | 0 | my ($str) = @_; | |||
816 | 0 | require URI::Escape; | |||||
817 | 0 | return URI::Escape::uri_escape_utf8 ($str, "^A-Za-z0-9\\-_.!~*/:"); | |||||
818 | } | ||||||
819 | |||||||
820 | #------------------------------------------------------------------------------ | ||||||
821 | # news posting | ||||||
822 | # | ||||||
823 | # This used to run the "rnews" program, which in leafnode 2 does some direct | ||||||
824 | # writing to the spool. But that requires user "news" perms, and as of the | ||||||
825 | # June 2007 leafnode beta it tends to be a good deal slower because it reads | ||||||
826 | # the whole groupinfo file. It has the advantage of not being picky about | ||||||
827 | # message ID hostnames, and allowing read-only groups to be filled. But | ||||||
828 | # apart from that plain POST seems much easier for being "server neutral". | ||||||
829 | # | ||||||
830 | # IHAVE instead of POST would be a possibility, when available, though POST | ||||||
831 | # is probably more accurate in the sense it's a new article coming into the | ||||||
832 | # news system. | ||||||
833 | # | ||||||
834 | # Net::NNTP looks at $ENV{NNTPSERVER}, $ENV{NEWSHOST} and Net::Config | ||||||
835 | # nntp_hosts list for the news server. Maybe could have that here too, | ||||||
836 | # instead of always defaulting to localhost (in $self->{'nntp_host'}). | ||||||
837 | # Would want to find out the name chosen to show in diagnostics though. | ||||||
838 | |||||||
839 | # return a string "host:port", suitable for the Host arg to Net::NNTP->new | ||||||
840 | sub uri_to_nntp_host { | ||||||
841 | 0 | 0 | 0 | my ($uri) = @_; | |||
842 | 0 | 0 | return (non_empty($uri->host) // 'localhost') . ':' . $uri->port; | ||||
843 | } | ||||||
844 | |||||||
845 | sub nntp { | ||||||
846 | 0 | 0 | 0 | my ($self) = @_; | |||
847 | # reopen if different 'nntp_host' | ||||||
848 | 0 | 0 | 0 | if (! $self->{'nntp'} | |||
849 | || $self->{'nntp'}->host ne $self->{'nntp_host'}) { | ||||||
850 | 0 | my $host = $self->{'nntp_host'}; | |||||
851 | 0 | $self->verbose (1, __x("nntp: {host}", host => $host)); | |||||
852 | 0 | require Net::NNTP; | |||||
853 | 0 | 0 | my $nntp = $self->{'nntp'} | ||||
854 | = Net::NNTP->new ($host, ($self->{'verbose'} >= 2 | ||||||
855 | ? (Debug => 1) | ||||||
856 | : ())); | ||||||
857 | 0 | 0 | if (! $nntp) { | ||||
858 | 0 | croak __x("Cannot connect to NNTP on \"{host}\"\n", host => $host); | |||||
859 | } | ||||||
860 | 0 | 0 | if (! $nntp->postok) { | ||||
861 | 0 | $self->verbose (1, "Hmm, ", $nntp->host, " doesn't say \"posting ok\" ..."); | |||||
862 | } | ||||||
863 | } | ||||||
864 | 0 | return $self->{'nntp'}; | |||||
865 | } | ||||||
866 | |||||||
867 | sub nntp_close { | ||||||
868 | 0 | 0 | 0 | my ($self) = @_; | |||
869 | 0 | 0 | if (my $nntp = delete $self->{'nntp'}) { | ||||
870 | 0 | 0 | if (! $nntp->quit) { | ||||
871 | 0 | say "Error closing nntp: ",$nntp->message; | |||||
872 | } | ||||||
873 | } | ||||||
874 | } | ||||||
875 | |||||||
876 | # check that $group exists in the NNTP, return 1 if so, or 0 if not | ||||||
877 | sub nntp_group_check { | ||||||
878 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
879 | 0 | my $nntp = $self->nntp; | |||||
880 | 0 | 0 | if (! $nntp->group($group)) { | ||||
881 | 0 | print __x("rss2leafnode: no group \"{group}\" on host \"{host}\" | |||||
882 | (See the rss2leafnode man page for notes on creating groups.) | ||||||
883 | ", | ||||||
884 | host => $nntp->host, | ||||||
885 | group => $group); | ||||||
886 | 0 | return 0; | |||||
887 | } | ||||||
888 | |||||||
889 | 0 | return 1; | |||||
890 | } | ||||||
891 | |||||||
892 | sub nntp_message_id_exists { | ||||||
893 | 0 | 0 | 0 | my ($self, $msgid) = @_; | |||
894 | 0 | my $ret = $self->nntp->nntpstat($msgid); | |||||
895 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
0 | |||||||
896 | 0 | 0 | $self->verbose (2, "'$msgid' ", ($ret ? 'exists already' : 'new')); | ||||
897 | } elsif ($self->{'verbose'} >= 1) { | ||||||
898 | 0 | 0 | if ($ret) { | ||||
899 | 0 | $self->verbose (1, ' ', __('exists already')); | |||||
900 | } | ||||||
901 | } | ||||||
902 | 0 | return $ret; | |||||
903 | } | ||||||
904 | |||||||
905 | # post $msg to NNTP, return true if successful | ||||||
906 | sub nntp_post { | ||||||
907 | 0 | 0 | 0 | my ($self, $msg) = @_; | |||
908 | 0 | my $nntp = $self->nntp; | |||||
909 | 0 | 0 | if (! $nntp->post ($msg->as_string)) { | ||||
910 | 0 | say __x('Cannot post: {message}', | |||||
911 | message => scalar($nntp->message)); | ||||||
912 | 0 | return 0; | |||||
913 | } | ||||||
914 | 0 | return 1; | |||||
915 | } | ||||||
916 | |||||||
917 | |||||||
918 | #------------------------------------------------------------------------------ | ||||||
919 | # HTML title | ||||||
920 | |||||||
921 | # extra data associated against a HTTP::Response object | ||||||
922 | Hash::Util::FieldHash::fieldhash (my %resp_exiftool_info); | ||||||
923 | |||||||
924 | # return hashref { Title => $str, ... }, or empty {} if no exiftool etc | ||||||
925 | sub resp_exiftool_info { | ||||||
926 | 0 | 0 | 0 | my ($resp) = @_; | |||
927 | 0 | 0 | defined $resp or return {}; | ||||
928 | 0 | 0 | if (! exists $resp_exiftool_info{$resp}) { | ||||
929 | 0 | $resp_exiftool_info{$resp} = _resp_exiftool_info($resp); | |||||
930 | ### exiftool info: $resp_exiftool_info{$resp} | ||||||
931 | } | ||||||
932 | 0 | return $resp_exiftool_info{$resp}; | |||||
933 | } | ||||||
934 | sub _resp_exiftool_info { | ||||||
935 | 0 | 0 | my ($resp) = @_; | ||||
936 | |||||||
937 | # Want ExifTool 8.22 to have PNG tEXt returned as utf8, but don't bother | ||||||
938 | # to enforce that. | ||||||
939 | # | ||||||
940 | # The returned fields from image formats with a defined charset are | ||||||
941 | # converted to the exiftool default "Charset" of utf8, and from other | ||||||
942 | # image formats the fields are bytes of something unknown. Might slightly | ||||||
943 | # like to know which is the case, and show raw bytes different from "bytes | ||||||
944 | # which ought to be utf8", but for now just Encode::decode_utf8() and let | ||||||
945 | # its Encode::FB_DEFAULT() put substitution chars for non-ascii non-utf8. | ||||||
946 | # | ||||||
947 | 0 | 0 | eval { require Image::ExifTool; 1 } || return {}; | ||||
0 | |||||||
0 | |||||||
948 | 0 | $resp->decode; | |||||
949 | 0 | my $cref = $resp->content_ref; | |||||
950 | 0 | return Image::ExifTool::ImageInfo | |||||
951 | ($cref, | ||||||
952 | ['Title','Author','Copyright','ImageSize'], # just these tags | ||||||
953 | {List => 0}); # get list values as comma separated | ||||||
954 | } | ||||||
955 | |||||||
956 | # $resp is a HTTP::Response, return title | ||||||
957 | sub html_title { | ||||||
958 | 0 | 0 | 0 | my ($resp) = @_; | |||
959 | |||||||
960 | return (# for images prefer filename+size over URI::Title just filename | ||||||
961 | 0 | 0 | non_empty (html_title_exiftool_image($resp)) | ||||
0 | |||||||
0 | |||||||
962 | |||||||
963 | // non_empty (html_title_urititle($resp)) | ||||||
964 | // non_empty (html_title_exiftool($resp)) | ||||||
965 | // $resp->title); | ||||||
966 | } | ||||||
967 | sub html_title_urititle { | ||||||
968 | 0 | 0 | 0 | my ($resp) = @_; | |||
969 | 0 | 0 | eval { require URI::Title } or return undef; | ||||
0 | |||||||
970 | |||||||
971 | # suppress some dodginess in URI::Title 1.82 | ||||||
972 | local $SIG{'__WARN__'} = sub { | ||||||
973 | 0 | 0 | my ($msg) = @_; | ||||
974 | 0 | 0 | $msg =~ /Use of uninitialized value/ or warn @_; | ||||
975 | 0 | }; | |||||
976 | 0 | $resp->decode; | |||||
977 | 0 | 0 | return URI::Title::title | ||||
978 | ({ url => ($resp->request->uri // ''), | ||||||
979 | data => $resp->content}); | ||||||
980 | } | ||||||
981 | sub html_title_exiftool_image { | ||||||
982 | 0 | 0 | 0 | my ($resp) = @_; | |||
983 | 0 | 0 | $resp->content_type =~ m{^image/} or return; | ||||
984 | 0 | 0 | if (defined (my $title = html_title_exiftool($resp))) { | ||||
985 | 0 | return $title; | |||||
986 | } | ||||||
987 | 0 | 0 | my $info = resp_exiftool_info($resp) // return; | ||||
988 | ### html_title_exiftool_image() on: $info | ||||||
989 | 0 | 0 | defined $info->{'ImageSize'} or return; | ||||
990 | 0 | return $resp->filename.' '.$info->{'ImageSize'}; | |||||
991 | } | ||||||
992 | sub html_title_exiftool { | ||||||
993 | 0 | 0 | 0 | my ($resp) = @_; | |||
994 | 0 | 0 | my $title = resp_exiftool_info($resp)->{'Title'} // return; | ||||
995 | 0 | return Encode::decode_utf8 ($title); | |||||
996 | } | ||||||
997 | |||||||
998 | |||||||
999 | #------------------------------------------------------------------------------ | ||||||
1000 | # mime | ||||||
1001 | |||||||
1002 | # prepended to "X-Mailer" header | ||||||
1003 | 1 | 1 | 11 | use constant mime_mailer_extra => "RSS2Leafnode $VERSION"; | |||
1 | 8 | ||||||
1 | 2434 | ||||||
1004 | |||||||
1005 | # $body is a MIME::Body object, append $str to it | ||||||
1006 | sub mime_body_append { | ||||||
1007 | 0 | 0 | 0 | my ($body, $str) = @_; | |||
1008 | 0 | $str = $body->as_string . "\n" . str_ensure_newline ($str); | |||||
1009 | 0 | 0 | my $IO = $body->open('w') | ||||
1010 | or die "rss2leafnode: body I/O open: $!"; | ||||||
1011 | 0 | $IO->print ($str); | |||||
1012 | 0 | 0 | $IO->close | ||||
1013 | or die "rss2leafnode: body I/O close: $!"; | ||||||
1014 | } | ||||||
1015 | |||||||
1016 | # if $str is not ascii then apply encode_mimewords() | ||||||
1017 | sub mimewords_non_ascii { | ||||||
1018 | 0 | 0 | 0 | my ($str) = @_; | |||
1019 | 0 | 0 | 0 | if (defined $str && ! is_ascii($str)) { | |||
1020 | 0 | require MIME::Words; | |||||
1021 | 0 | $str = MIME::Words::encode_mimewords (Encode::encode_utf8($str), | |||||
1022 | Charset => 'UTF-8'); | ||||||
1023 | } | ||||||
1024 | 0 | return $str; | |||||
1025 | } | ||||||
1026 | |||||||
1027 | sub mime_build { | ||||||
1028 | 0 | 0 | 0 | my ($self, $headers, @args) = @_; | |||
1029 | |||||||
1030 | # Headers in utf-8, the same as other text. The docs of | ||||||
1031 | # encode_mimewords() isn't clear, but seems to expect bytes of the | ||||||
1032 | # specified charset. | ||||||
1033 | 0 | foreach my $key (sort keys %$headers) { | |||||
1034 | 0 | $headers->{$key} | |||||
1035 | = mimewords_non_ascii(Text::Trim::trim($headers->{$key})); | ||||||
1036 | } | ||||||
1037 | |||||||
1038 | 0 | %$headers = (%$headers, @args); | |||||
1039 | 0 | 0 | $headers->{'Top'} //= 0; # default to a part not a toplevel | ||||
1040 | 0 | 0 | $headers->{'Encoding'} //= '-SUGGEST'; | ||||
1041 | |||||||
1042 | 0 | 0 | if ($headers->{'Top'}) { | ||||
1043 | 0 | my $now822 = rfc822_time_now(); | |||||
1044 | 0 | 0 | $headers->{'Date'} //= $now822; | ||||
1045 | 0 | $headers->{'Date-Received:'} = $now822; | |||||
1046 | } | ||||||
1047 | |||||||
1048 | 0 | 0 | if (utf8::is_utf8($headers->{'Data'})) { | ||||
1049 | 0 | warn 'Oops, mime_build() data should be bytes'; | |||||
1050 | } | ||||||
1051 | |||||||
1052 | # downgrade utf-8 to us-ascii if possible | ||||||
1053 | 0 | 0 | 0 | if ($headers->{'Type'} eq 'text/plain' | |||
0 | |||||||
0 | |||||||
1054 | && lc($headers->{'Charset'}||0) eq 'utf-8' | ||||||
1055 | && is_ascii ($headers->{'Data'})) { | ||||||
1056 | 0 | $headers->{'Charset'} = 'us-ascii'; | |||||
1057 | |||||||
1058 | # not sure mangling text/html body content is a good idea -- would only | ||||||
1059 | # want it on generated html, not downloaded | ||||||
1060 | # | ||||||
1061 | # if ($headers->{'Type'} eq 'text/html') { | ||||||
1062 | # $headers->{'Data'} =~ s{( | ||||||
1063 | # } | ||||||
1064 | } | ||||||
1065 | |||||||
1066 | 0 | @args = map {$_,$headers->{$_}} sort keys %$headers; | |||||
0 | |||||||
1067 | 0 | 0 | if ($self->{'verbose'} >= 4) { | ||||
1068 | 0 | require Data::Dumper; | |||||
1069 | 0 | $self->verbose (4, Data::Dumper->new([\@args],['mime headers'])->Dump); | |||||
1070 | } | ||||||
1071 | |||||||
1072 | 0 | require MIME::Entity; | |||||
1073 | 0 | my $top = MIME::Entity->build (Disposition => 'inline', @args); | |||||
1074 | |||||||
1075 | 0 | 0 | 0 | if ($headers->{'Top'} && ! defined $headers->{'X-Mailer:'}) { | |||
1076 | 0 | my $head = $top->head; | |||||
1077 | 0 | $head->set('X-Mailer', join_non_empty (', ', | |||||
1078 | $self->mime_mailer_extra, | ||||||
1079 | $head->get('X-Mailer'))); | ||||||
1080 | } | ||||||
1081 | |||||||
1082 | 0 | return $top; | |||||
1083 | } | ||||||
1084 | |||||||
1085 | # $resp is a HTTP::Response | ||||||
1086 | # Return a MIME::Entity which contains the response, and any further @headers. | ||||||
1087 | # If $self->{'render'} is true then render HTML to plain text. | ||||||
1088 | # | ||||||
1089 | sub mime_part_from_response { | ||||||
1090 | 0 | 0 | 0 | my ($self, $resp, @headers) = @_; | |||
1091 | |||||||
1092 | 0 | my $content_type = $resp->content_type; | |||||
1093 | 0 | $self->verbose (2, ' content-type: ',$content_type); | |||||
1094 | 0 | $resp->decode; | |||||
1095 | 0 | my $content = $resp->content; # the bytes | |||||
1096 | 0 | my $charset = $resp->content_charset; # and their charset | |||||
1097 | 0 | my $url = $resp->request->uri->as_string; | |||||
1098 | 0 | my $content_md5 = $resp->header('Content-MD5'); | |||||
1099 | |||||||
1100 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
1101 | = $self->render_maybe ($content, $content_type, $charset, $url); | ||||||
1102 | 0 | 0 | if ($rendered) { | ||||
1103 | 0 | undef $content_md5; | |||||
1104 | } | ||||||
1105 | |||||||
1106 | 0 | return $self->mime_build | |||||
1107 | ({ 'Content-Language:' => scalar($resp->header('Content-Language')), | ||||||
1108 | 'Content-Location:' => $url, | ||||||
1109 | 'Content-MD5:' => $content_md5, | ||||||
1110 | @headers, | ||||||
1111 | }, | ||||||
1112 | Type => $content_type, | ||||||
1113 | Charset => $charset, | ||||||
1114 | Data => $content, | ||||||
1115 | Filename => $resp->filename); | ||||||
1116 | } | ||||||
1117 | |||||||
1118 | |||||||
1119 | # set "Lines:" header per RFC 1036 | ||||||
1120 | # MIME::Entity 5.428 doesn't seem to have anything for this itself | ||||||
1121 | # this is after qp or base64, is that right? the actual message lines | ||||||
1122 | sub mime_entity_lines { | ||||||
1123 | 0 | 0 | 0 | my ($top) = @_; | |||
1124 | 0 | $top->head->set('Lines', str_count_lines ($top->stringify_body)); | |||||
1125 | } | ||||||
1126 | |||||||
1127 | #------------------------------------------------------------------------------ | ||||||
1128 | # XML::Twig stuff | ||||||
1129 | |||||||
1130 | # Return the text of $elt and treat child elements as improperly escaped | ||||||
1131 | # parts of the text too. | ||||||
1132 | # | ||||||
1133 | # This is good for elements which are supposed to be HTML with etc |
||||||
1134 | # escaped as <p>, but copes with feeds that don't have the necessary | ||||||
1135 | # escapes and thus come out with xml child elements under $elt. | ||||||
1136 | # | ||||||
1137 | # For elements which are supposed to be plain text with no markup and no | ||||||
1138 | # sub-elements this will at least make improper child text visible, though | ||||||
1139 | # it might not look very good. | ||||||
1140 | # | ||||||
1141 | # As of June 2010 http://www.drweil.com/drw/ecs/rss.xml is an example of | ||||||
1142 | # improperly escaped html. | ||||||
1143 | # | ||||||
1144 | # FIXME: Any need to watch out for |
||||||
1145 | # | ||||||
1146 | sub elt_subtext { | ||||||
1147 | 0 | 0 | 0 | my ($elt) = @_; | |||
1148 | 0 | 0 | defined $elt or return undef; | ||||
1149 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
0 | |||||||
1150 | 0 | return join ('', map {_elt_subtext_with_tags($_)} $elt->children); | |||||
0 | |||||||
1151 | } | ||||||
1152 | sub _elt_subtext_with_tags { | ||||||
1153 | 0 | 0 | my ($elt) = @_; | ||||
1154 | 0 | 0 | defined $elt or return undef; | ||||
1155 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
0 | |||||||
1156 | 0 | return ($elt->start_tag | |||||
1157 | 0 | . join ('', map {_elt_subtext_with_tags($_)} $elt->children) | |||||
1158 | . $elt->end_tag); | ||||||
1159 | } | ||||||
1160 | |||||||
1161 | # $elt contains xhtml etc sub-elements. Return a plain html string. |
||||||
1162 | # Prefixes like |
||||||
1163 | # This relies on the map_xmlns mapping to give prefix "xhtml:" | ||||||
1164 | # | ||||||
1165 | sub elt_xhtml_to_html { | ||||||
1166 | 0 | 0 | 0 | my ($elt) = @_; | |||
1167 | |||||||
1168 | # could probably do it destructively, but just in case | ||||||
1169 | 0 | $elt = $elt->copy; | |||||
1170 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($elt, 'xhtml'); | |||||
1171 | |||||||
1172 | # lose xmlns:xhtml="http://www.w3.org/1999/xhtml" | ||||||
1173 | 0 | $elt->strip_att('xmlns:xhtml'); | |||||
1174 | |||||||
1175 | # something fishy turns "href" to "xhtml:href", drop any "xhtml:" | ||||||
1176 | # bare "href" also gets turned into atom:href as the default namespace, | ||||||
1177 | # drop any "atom:" | ||||||
1178 | 0 | foreach my $child ($elt->descendants) { | |||||
1179 | 0 | foreach my $attname ($child->att_names) { | |||||
1180 | 0 | 0 | if ($attname =~ /^(xhtml|atom):(.*)/) { | ||||
1181 | 0 | $child->change_att_name($attname, $2); | |||||
1182 | } | ||||||
1183 | } | ||||||
1184 | } | ||||||
1185 | |||||||
1186 | 0 | my $old_pretty = $elt->set_pretty_print ('none'); | |||||
1187 | ### $old_pretty | ||||||
1188 | 0 | my $ret = $elt->xml_string; | |||||
1189 | 0 | $elt->set_pretty_print ($old_pretty); | |||||
1190 | 0 | return $ret; | |||||
1191 | |||||||
1192 | } | ||||||
1193 | |||||||
1194 | # elt_content_type() returns 'text', 'html', 'xhtml' or a mime type. | ||||||
1195 | # If no type="" attribute the default is 'text', except for RSS | ||||||
1196 | # |
||||||
1197 | # | ||||||
1198 | # RSS http://www.debian.org/News/weekly/dwn.en.rdf circa Feb 2010 had some | ||||||
1199 | # html in its |
||||||
1200 | # plain text) and that RSS is all plain text outside |
||||||
1201 | # | ||||||
1202 | # |
||||||
1203 | # not the formatting as html vs text. | ||||||
1204 | # | ||||||
1205 | @known{'/channel/item/dc:type'} = undef; | ||||||
1206 | # | ||||||
1207 | sub elt_content_type { | ||||||
1208 | 0 | 0 | 0 | my ($elt) = @_; | |||
1209 | 0 | 0 | if (! defined $elt) { return undef; } | ||||
0 | |||||||
1210 | |||||||
1211 | 0 | 0 | 0 | if (defined (my $type = ($elt->att('atom:type') // $elt->att('type')))) { | |||
1212 | # type="application/xhtml+xml" at http://xmltwig.com/blog/index.atom, | ||||||
1213 | # dunno if it should be just "xhtml", but recognise it anyway | ||||||
1214 | 0 | 0 | if ($type eq 'application/xhtml+xml') { return 'xhtml'; } | ||||
0 | |||||||
1215 | 0 | return $type; | |||||
1216 | } | ||||||
1217 | 0 | 0 | if ($elt->root->tag eq 'feed') { | ||||
1218 | 0 | return 'text'; # Atom |
|||||
1219 | } | ||||||
1220 | 0 | my $tag = $elt->tag; | |||||
1221 | 0 | 0 | if ($tag =~ /^itunes:/) { | ||||
1222 | # itunes spec is for text-only, no html markup | ||||||
1223 | 0 | return 'text'; | |||||
1224 | } | ||||||
1225 | 0 | 0 | 0 | if ($tag eq 'description' # RSS |
|||
1226 | || $tag eq 'content:encoded') { # same in content:encoded | ||||||
1227 | 0 | return 'html'; | |||||
1228 | } | ||||||
1229 | # other RSS is text | ||||||
1230 | 0 | return 'text'; | |||||
1231 | } | ||||||
1232 | |||||||
1233 | # $elt is an XML::Twig::Elt of an RSS or Atom text element. | ||||||
1234 | # Atom has a type="" attribute, RSS is html. Html or xhtml are rendered to | ||||||
1235 | # a single long line of plain text. | ||||||
1236 | # | ||||||
1237 | sub elt_to_rendered_line { | ||||||
1238 | 0 | 0 | 0 | my ($elt) = @_; | |||
1239 | 0 | 0 | defined $elt or return; | ||||
1240 | |||||||
1241 | 0 | my $str; | |||||
1242 | 0 | my $type = elt_content_type ($elt); | |||||
1243 | 0 | 0 | if ($type eq 'xhtml') { | ||||
1244 | 0 | $str = elt_xhtml_to_html ($elt); | |||||
1245 | 0 | $type = 'html'; | |||||
1246 | } else { | ||||||
1247 | 0 | $str = elt_subtext($elt); | |||||
1248 | } | ||||||
1249 | 0 | 0 | if ($type eq 'html') { | ||||
1250 | 0 | $str = html_to_rendered_line($str); | |||||
1251 | } | ||||||
1252 | # plain 'text' or anything unrecognised collapsed too | ||||||
1253 | 0 | return non_empty(collapse_whitespace($str)); | |||||
1254 | } | ||||||
1255 | |||||||
1256 | sub html_to_rendered_line { | ||||||
1257 | 0 | 0 | 0 | my ($html) = @_; | |||
1258 | 0 | require HTML::FormatText; | |||||
1259 | 0 | return collapse_whitespace | |||||
1260 | (HTML::FormatText->format_string ($html, | ||||||
1261 | leftmargin => 0, | ||||||
1262 | rightmargin => 999)); | ||||||
1263 | } | ||||||
1264 | |||||||
1265 | |||||||
1266 | #------------------------------------------------------------------------------ | ||||||
1267 | # XML::RSS::Timing | ||||||
1268 | |||||||
1269 | sub twig_to_timingfields { | ||||||
1270 | 0 | 0 | 0 | my ($self, $twig) = @_; | |||
1271 | 0 | 0 | return if ! defined $twig; | ||||
1272 | 0 | my $root = $twig->root; | |||||
1273 | 0 | my %timingfields; | |||||
1274 | |||||||
1275 | 0 | 0 | if (my $ttl = $root->first_descendant('ttl')) { | ||||
1276 | 0 | $timingfields{'ttl'} = $ttl->trimmed_text; | |||||
1277 | } | ||||||
1278 | 0 | 0 | if (my $skipHours = $root->first_descendant('skipHours')) { | ||||
1279 | 0 | $timingfields{'skipHours'} = [map {$_->trimmed_text} $skipHours->children('hour')]; | |||||
0 | |||||||
1280 | } | ||||||
1281 | 0 | 0 | if (my $skipDays = $root->first_descendant('skipDays')) { | ||||
1282 | 0 | $timingfields{'skipDays'} = [map {$_->trimmed_text} $skipDays->children('day')]; | |||||
0 | |||||||
1283 | } | ||||||
1284 | |||||||
1285 | # "syn:updatePeriod" etc | ||||||
1286 | 0 | foreach my $key (qw(updatePeriod updateFrequency updateBase)) { | |||||
1287 | 0 | 0 | if (my $update = $root->first_descendant("syn:$key")) { | ||||
1288 | 0 | $timingfields{$key} = $update->trimmed_text; | |||||
1289 | } | ||||||
1290 | } | ||||||
1291 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
1292 | 0 | require Data::Dumper; | |||||
1293 | 0 | $self->verbose (2, | |||||
1294 | Data::Dumper->new([\%timingfields],['timingfields']) | ||||||
1295 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
1296 | } | ||||||
1297 | 0 | 0 | if (! %timingfields) { | ||||
1298 | 0 | return; # no info | |||||
1299 | } | ||||||
1300 | |||||||
1301 | # if XML::RSS::Timing doesn't like the values then don't record them | ||||||
1302 | 0 | 0 | return unless $self->timingfields_to_timing(\%timingfields); | ||||
1303 | |||||||
1304 | 0 | return \%timingfields; | |||||
1305 | } | ||||||
1306 | @known{qw(/channel/skipDays | ||||||
1307 | /channel/skipDays/day | ||||||
1308 | /channel/skipHours | ||||||
1309 | /channel/skipHours/hour | ||||||
1310 | /channel/ttl | ||||||
1311 | /channel/syn:updateBase | ||||||
1312 | /channel/syn:updatePeriod | ||||||
1313 | /channel/syn:updateFrequency)} = (); | ||||||
1314 | |||||||
1315 | # return an XML::RSS::Timing object, or undef | ||||||
1316 | sub timingfields_to_timing { | ||||||
1317 | 0 | 0 | 0 | my ($self, $timingfields) = @_; | |||
1318 | 0 | 0 | $timingfields // return undef; | ||||
1319 | |||||||
1320 | 0 | 0 | eval { require XML::RSS::Timing } || return undef; | ||||
0 | |||||||
1321 | 0 | my $timing = XML::RSS::Timing->new; | |||||
1322 | 0 | $timing->use_exceptions(0); | |||||
1323 | 0 | while (my ($key, $value) = each %$timingfields) { | |||||
1324 | 0 | 0 | if (ref $value) { | ||||
1325 | 0 | $timing->$key (@$value); | |||||
1326 | } else { | ||||||
1327 | 0 | $timing->$key ($value); | |||||
1328 | } | ||||||
1329 | } | ||||||
1330 | 0 | 0 | if (my @complaints = $timing->complaints) { | ||||
1331 | 0 | say __x('XML::RSS::Timing complains about {url}', | |||||
1332 | url => $self->{'uri'}); | ||||||
1333 | 0 | foreach my $complaint (@complaints) { | |||||
1334 | 0 | say " $complaint"; | |||||
1335 | } | ||||||
1336 | 0 | return undef; | |||||
1337 | } | ||||||
1338 | 0 | return $timing; | |||||
1339 | } | ||||||
1340 | |||||||
1341 | |||||||
1342 | #------------------------------------------------------------------------------ | ||||||
1343 | # rss2leafnode.status file | ||||||
1344 | |||||||
1345 | # $self->{'global_status'} is a hashref containing entries URL => STATUS, | ||||||
1346 | # where URL is a string and STATUS is a sub-hashref of information | ||||||
1347 | |||||||
1348 | 1 | 1 | 9 | use constant STATUS_EXPIRE_DAYS => 45; | |||
1 | 2 | ||||||
1 | 4770 | ||||||
1349 | |||||||
1350 | # read $status_filename into $self->{'global_status'} | ||||||
1351 | sub status_read { | ||||||
1352 | 0 | 0 | 0 | my ($self) = @_; | |||
1353 | 0 | $self->{'global_status'} = {}; | |||||
1354 | 0 | my $status_filename = $self->status_filename; | |||||
1355 | 0 | $self->verbose (2, 'read status: ', $status_filename); | |||||
1356 | |||||||
1357 | 0 | $! = 0; | |||||
1358 | 0 | my $global_status = do $status_filename; | |||||
1359 | 0 | 0 | if (! defined $global_status) { | ||||
1360 | 0 | 0 | if ($! == POSIX::ENOENT()) { | ||||
1361 | 0 | $self->verbose (2, "status file doesn't exist"); | |||||
1362 | } else { | ||||||
1363 | 0 | say "rss2leafnode: error in $status_filename\n$@"; | |||||
1364 | 0 | say "ignoring that file"; | |||||
1365 | } | ||||||
1366 | 0 | $global_status = {}; | |||||
1367 | } | ||||||
1368 | 0 | $self->{'global_status'} = $global_status; | |||||
1369 | } | ||||||
1370 | |||||||
1371 | # delete old entries from $self->{'global_status'} | ||||||
1372 | sub status_prune { | ||||||
1373 | 0 | 0 | 0 | my ($self) = @_; | |||
1374 | 0 | 0 | my $global_status = $self->{'global_status'} // return; | ||||
1375 | 0 | my $pruned = 0; | |||||
1376 | 0 | my $old_time = time() - STATUS_EXPIRE_DAYS * 86400; | |||||
1377 | 0 | foreach my $key (keys %$global_status) { | |||||
1378 | 0 | 0 | if ($global_status->{$key}->{'status-time'} < $old_time) { | ||||
1379 | 0 | $self->verbose (2, __x("discard old status {url}\n", url => $key)); | |||||
1380 | 0 | delete $global_status->{$key}; | |||||
1381 | 0 | $pruned++; | |||||
1382 | } | ||||||
1383 | } | ||||||
1384 | 0 | 0 | if ($pruned) { | ||||
1385 | 0 | $self->verbose (1, __xn("discard {count} old status entry\n", | |||||
1386 | "discard {count} old status entries\n", | ||||||
1387 | $pruned, | ||||||
1388 | count => $pruned)); | ||||||
1389 | } | ||||||
1390 | } | ||||||
1391 | |||||||
1392 | # save $self->{'global_status'} into the $status_filename | ||||||
1393 | sub status_save { | ||||||
1394 | 0 | 0 | 0 | my ($self, $status) = @_; | |||
1395 | 0 | $status->{'status-time'} = time(); | |||||
1396 | 0 | 0 | if ($status->{'timingfields'}) { | ||||
1397 | 0 | $status->{'timingfields'}->{'lastPolled'} = $status->{'status-time'}; | |||||
1398 | } | ||||||
1399 | |||||||
1400 | 0 | $self->status_prune; | |||||
1401 | |||||||
1402 | 0 | require Data::Dumper; | |||||
1403 | 0 | my $str = Data::Dumper->new([$self->{'global_status'}],['global_status']) | |||||
1404 | ->Indent(1)->Sortkeys(1)->Terse(1)->Useqq(1)->Dump; | ||||||
1405 | 0 | $str = <<"HERE"; | |||||
1406 | # rss2leafnode status file -- automatically generated -- DO NOT EDIT | ||||||
1407 | # | ||||||
1408 | # (If there seems to be something very wrong then you can delete this file | ||||||
1409 | # and it'll be started afresh on the next run.) | ||||||
1410 | |||||||
1411 | $str | ||||||
1412 | |||||||
1413 | |||||||
1414 | # Local variables: | ||||||
1415 | # mode: perl-mode | ||||||
1416 | # End: | ||||||
1417 | HERE | ||||||
1418 | |||||||
1419 | 0 | my $status_filename = $self->status_filename; | |||||
1420 | 0 | my $out; | |||||
1421 | 0 | 0 | 0 | (open $out, '>', $status_filename | |||
0 | |||||||
1422 | and print $out $str | ||||||
1423 | and close $out) | ||||||
1424 | or croak "rss2leafnode: cannot write to $status_filename: $!\n"; | ||||||
1425 | } | ||||||
1426 | |||||||
1427 | # return a hashref which has status information about $url, or undef if | ||||||
1428 | # nothing recorded about $url | ||||||
1429 | sub status_geturl { | ||||||
1430 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
1431 | 0 | 0 | $self->status_read if ! $self->{'global_status'}; | ||||
1432 | 0 | 0 | if (! $self->{'global_status'}->{$url}) { | ||||
1433 | 0 | $self->{'global_status'}->{$url} = { 'status-time' => time() }; | |||||
1434 | } | ||||||
1435 | 0 | return $self->{'global_status'}->{$url}; | |||||
1436 | } | ||||||
1437 | |||||||
1438 | # $resp is a HTTP::Response object from retrieving $url. | ||||||
1439 | # Optional $twig is an XML::Twig. | ||||||
1440 | # Record against $url any ETag, Last-Modified and ttl from $resp and $twig. | ||||||
1441 | # If $resp is an error return, or is undef, then do nothing. | ||||||
1442 | sub status_etagmod_resp { | ||||||
1443 | 0 | 0 | 0 | my ($self, $url, $resp, $twig) = @_; | |||
1444 | 0 | 0 | 0 | if ($resp && $resp->is_success) { | |||
1445 | 0 | my $status = $self->status_geturl ($url); | |||||
1446 | 0 | $status->{'Last-Modified'} = $resp->header('Last-Modified'); | |||||
1447 | 0 | $status->{'ETag'} = $resp->header('ETag'); | |||||
1448 | 0 | $status->{'timingfields'} = $self->twig_to_timingfields ($twig); | |||||
1449 | |||||||
1450 | 0 | 0 | 0 | if (! defined $status->{'ETag'} && ! defined $status->{'Last-Modified'}) { | |||
1451 | 0 | $self->verbose (1, " no ETag or Last-Modified"); | |||||
1452 | } | ||||||
1453 | 0 | 0 | if (defined (my $comments_count = $self->{'comments_count'})) { | ||||
1454 | 0 | $status->{'comments_count'} = $comments_count; | |||||
1455 | } | ||||||
1456 | |||||||
1457 | 0 | 0 | if ($twig) { | ||||
1458 | # record previously applied newest option | ||||||
1459 | 0 | $status->{'rss_newest_only'} = $self->{'rss_newest_only'}; | |||||
1460 | |||||||
1461 | # if (rss_newest_cmp($self,$status) > 0) { | ||||||
1462 | # # the newest number increases | ||||||
1463 | # } | ||||||
1464 | } | ||||||
1465 | 0 | foreach my $key (keys %$status) { | |||||
1466 | 0 | 0 | if (! defined $status->{$key}) { delete $status->{$key} } | ||||
0 | |||||||
1467 | } | ||||||
1468 | 0 | $self->status_save($status); | |||||
1469 | } | ||||||
1470 | } | ||||||
1471 | |||||||
1472 | # update recorded status for a $url with unchanged contents | ||||||
1473 | sub status_unchanged { | ||||||
1474 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
1475 | 0 | $self->verbose (1, ' ', __('unchanged')); | |||||
1476 | 0 | $self->status_save ($self->status_geturl ($url)); | |||||
1477 | } | ||||||
1478 | |||||||
1479 | # $req is a HTTP::Request object. | ||||||
1480 | # Add "If-None-Match" and/or "If-Modified-Since" headers to it based on what | ||||||
1481 | # the status file has recorded from when we last fetched the url in $req. | ||||||
1482 | # Return 1 to download, 0 if nothing expected yet by RSS timing fields | ||||||
1483 | # | ||||||
1484 | sub status_etagmod_req { | ||||||
1485 | 0 | 0 | 0 | my ($self, $req, $for_rss) = @_; | |||
1486 | 0 | 0 | $self->{'global_status'} or $self->status_read; | ||||
1487 | |||||||
1488 | 0 | my $url = $req->uri->as_string; | |||||
1489 | my $status = $self->{'global_status'}->{$url} | ||||||
1490 | 0 | 0 | // do { | ||||
1491 | 0 | $self->verbose (2, __x("no status info for {url}\n", url => $url)); | |||||
1492 | 0 | return 1; # want download | |||||
1493 | }; | ||||||
1494 | |||||||
1495 | 0 | 0 | if ($for_rss) { | ||||
1496 | # if status says the last download was for only a certain number of | ||||||
1497 | # newest, then force a re-download if that option now different | ||||||
1498 | 0 | 0 | if (! str_equal($self->{'rss_newest_only'}, | ||||
1499 | $status->{'rss_newest_only'})) { | ||||||
1500 | 0 | return 1; # want download | |||||
1501 | } | ||||||
1502 | } | ||||||
1503 | |||||||
1504 | 0 | 0 | if (my $timing = $self->timingfields_to_timing ($status->{'timingfields'})) { | ||||
1505 | 0 | my $next = $timing->nextUpdate; | |||||
1506 | 0 | my $now = time(); | |||||
1507 | 0 | 0 | if ($next > $now) { | ||||
1508 | 0 | $self->verbose (1, ' ', | |||||
1509 | __x('timing: next update {time} (local time)', | ||||||
1510 | time => POSIX::strftime ("%H:%M:%S %a %d %b %Y", | ||||||
1511 | localtime($next)))); | ||||||
1512 | 0 | 0 | 0 | if (eval 'use Time::Duration::Locale; 1' | |||
1513 | || eval 'use Time::Duration; 1') { | ||||||
1514 | 0 | $self->verbose (1, ' ', __x('which is {duration} from now', | |||||
1515 | duration => duration($next-$now))); | ||||||
1516 | } | ||||||
1517 | 0 | return 0; # no update yet | |||||
1518 | } | ||||||
1519 | } | ||||||
1520 | 0 | 0 | if (defined (my $lastmod = $status->{'Last-Modified'})) { | ||||
1521 | 0 | $req->header('If-Modified-Since' => $lastmod); | |||||
1522 | } | ||||||
1523 | 0 | 0 | if (defined (my $etag = $status->{'ETag'})) { | ||||
1524 | 0 | $req->header('If-None-Match' => $etag); | |||||
1525 | } | ||||||
1526 | 0 | return 1; | |||||
1527 | } | ||||||
1528 | |||||||
1529 | # return -1 if x |
||||||
1530 | # sub rss_newest_cmp { | ||||||
1531 | # my ($x, $y) = @_; | ||||||
1532 | # if ($x->{'rss_newest_only'}) { | ||||||
1533 | # if (! $y->{'rss_newest_only'}) { | ||||||
1534 | # return -1; # x finite, y infinite | ||||||
1535 | # } | ||||||
1536 | # # x and y finite | ||||||
1537 | # return ($x->{'rss_newest_only'} <=> $y->{'rss_newest_only'}); | ||||||
1538 | # } else { | ||||||
1539 | # # x infinite, so 1 if y finite, 0 if y infinite too | ||||||
1540 | # return !! $y->{'rss_newest_only'}; | ||||||
1541 | # } | ||||||
1542 | # } | ||||||
1543 | sub str_equal { | ||||||
1544 | 0 | 0 | 0 | my ($x, $y) = @_; | |||
1545 | 0 | 0 | return ((defined $x && defined $y && $x eq $y) | ||||
1546 | || (! defined $x && ! defined $y)); | ||||||
1547 | } | ||||||
1548 | |||||||
1549 | #------------------------------------------------------------------------------ | ||||||
1550 | # render html | ||||||
1551 | |||||||
1552 | # $content_type is a string like "text/html" or "text/plain". | ||||||
1553 | # $content is data as raw bytes. | ||||||
1554 | # $charset is the character set of those bytes, eg. "utf-8". | ||||||
1555 | # | ||||||
1556 | # If the $render option is set, and $content_type is 'text/html', then | ||||||
1557 | # render $content down to 'text/plain', using either HTML::FormatText or | ||||||
1558 | # Lynx. | ||||||
1559 | # The return is a new triplet ($content, $content_type, $charset). | ||||||
1560 | # | ||||||
1561 | sub render_maybe { | ||||||
1562 | 0 | 0 | 0 | my ($self, $content, $content_type, $charset, $base_url) = @_; | |||
1563 | 0 | my $rendered = 0; | |||||
1564 | 0 | 0 | 0 | if ($self->{'render'} && $content_type eq 'text/html') { | |||
1565 | |||||||
1566 | 0 | my $class = $self->{'render'}; | |||||
1567 | 0 | 0 | if ($class !~ /^HTML::/) { $class = "HTML::FormatText::\u$class"; } | ||||
0 | |||||||
1568 | 0 | $class =~ s/::1$//; # "::1" is $render=1 for plain HTML::FormatText | |||||
1569 | 0 | require Module::Load; | |||||
1570 | 0 | Module::Load::load ($class); | |||||
1571 | |||||||
1572 | 0 | 0 | if ($class =~ /^HTML::FormatText($|::WithLinks)/) { | ||||
1573 | # Believe HTML::FormatText (as of version 2.04) doesn't know much | ||||||
1574 | # about input or output charsets, but it can be tricked into getting | ||||||
1575 | # pretty reasonable results by putting wide chars through it. | ||||||
1576 | # Likewise HTML::FormatText::WithLinks (as of version 0.11). | ||||||
1577 | # | ||||||
1578 | # FIXME: decode() can error out on bad charset | ||||||
1579 | 0 | $content = Encode::decode ($charset, $content); | |||||
1580 | 0 | local $SIG{'__WARN__'} = \&_warn_suppress_unknown_base; | |||||
1581 | 0 | $content = $class->format_string | |||||
1582 | ($content, | ||||||
1583 | base => $base_url, | ||||||
1584 | leftmargin => 0, | ||||||
1585 | rightmargin => $self->{'render_width'}); | ||||||
1586 | 0 | $content = Encode::encode_utf8 ($content); | |||||
1587 | |||||||
1588 | } else { | ||||||
1589 | # HTML::FormatExternal style charset specs | ||||||
1590 | 0 | $content = $class->format_string | |||||
1591 | ($content, | ||||||
1592 | base => $base_url, | ||||||
1593 | leftmargin => 0, | ||||||
1594 | rightmargin => $self->{'render_width'}, | ||||||
1595 | input_charset => $charset, | ||||||
1596 | output_charset => 'utf-8'); | ||||||
1597 | } | ||||||
1598 | 0 | $charset = 'UTF-8'; | |||||
1599 | 0 | $content_type = 'text/plain'; | |||||
1600 | 0 | $rendered = 1; | |||||
1601 | } | ||||||
1602 | 0 | return ($content, $content_type, $charset, $rendered); | |||||
1603 | } | ||||||
1604 | sub _warn_suppress_unknown_base { | ||||||
1605 | 0 | 0 | my ($msg) = @_; | ||||
1606 | 0 | 0 | $msg =~ /^Unknown configure option 'base'/ | ||||
1607 | or warn $msg; | ||||||
1608 | } | ||||||
1609 | |||||||
1610 | # $str is a wide-char string of text | ||||||
1611 | sub text_wrap { | ||||||
1612 | 0 | 0 | 0 | my ($self, $str, $prefix) = @_; | |||
1613 | 0 | 0 | if (! defined $prefix) { $prefix = ''; } | ||||
0 | |||||||
1614 | 0 | require Text::WrapI18N; | |||||
1615 | 0 | local $Text::WrapI18N::columns = $self->{'render_width'} + 1; | |||||
1616 | 0 | local $Text::WrapI18N::unexpand = 0; # no tabs in output | |||||
1617 | 0 | local $Text::WrapI18N::huge = 'overflow'; # don't break long words | |||||
1618 | 0 | $str =~ tr/\n/ /; | |||||
1619 | 0 | 0 | my $second_prefix = (length($prefix) ? ' 'x(length($prefix)+2) : ''); | ||||
1620 | 0 | return Text::WrapI18N::wrap($prefix, $second_prefix, $str); | |||||
1621 | } | ||||||
1622 | |||||||
1623 | #------------------------------------------------------------------------------ | ||||||
1624 | # Face icons | ||||||
1625 | |||||||
1626 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1627 | # return a string value for the Face: header, or undef if no icon | ||||||
1628 | sub item_to_face { | ||||||
1629 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
1630 | 0 | 0 | $self->{'get_icon'} || return; | ||||
1631 | 0 | 0 | my ($uri, $width, $height) = $self->item_image_uwh ($item) | ||||
1632 | or return; | ||||||
1633 | 0 | 0 | $self->face_wh_ok ($width, $height) || return; | ||||
1634 | 0 | return $self->download_face ($uri, $width, $height); | |||||
1635 | } | ||||||
1636 | |||||||
1637 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1638 | # return values ($uri, $width, $height) of the |
||||||
1639 | # | ||||||
1640 | sub item_image_uwh { | ||||||
1641 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
1642 | ### item_image_uwh() ... | ||||||
1643 | |||||||
1644 | 0 | foreach my $where ($item, | |||||
1645 | item_to_channel($item)) { | ||||||
1646 | ### image text: $where->first_child_text('image') | ||||||
1647 | |||||||
1648 | # identi.ca | ||||||
1649 | 0 | 0 | if (my $actor = $where->first_child('activity:actor')) { | ||||
1650 | 0 | my ($url, $width, $height); | |||||
1651 | 0 | foreach my $link_elt ($actor->children('link')) { | |||||
1652 | 0 | 0 | 0 | ($link_elt->att('rel')||$link_elt->att('atom:rel')||'') | |||
1653 | eq 'avatar' or next; | ||||||
1654 | 0 | 0 | $url = $link_elt->att('href') // $link_elt->att('atom:href') // next; | ||||
0 | |||||||
1655 | 0 | my $this_width = $link_elt->att('media:width'); | |||||
1656 | 0 | 0 | 0 | next if (defined $width | |||
0 | |||||||
1657 | && defined $this_width | ||||||
1658 | && $width < $this_width); # prefer smallest | ||||||
1659 | 0 | $url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($link_elt, $url); | |||||
1660 | 0 | 0 | $width = ($this_width || 0); | ||||
1661 | 0 | 0 | $height = ($link_elt->att('media:height') || 0); | ||||
1662 | ### $url | ||||||
1663 | ### $width | ||||||
1664 | ### $height | ||||||
1665 | } | ||||||
1666 | 0 | 0 | if (defined $url) { | ||||
1667 | 0 | return ($url, $width, $height); | |||||
1668 | } | ||||||
1669 | } | ||||||
1670 | |||||||
1671 | # RSS | ||||||
1672 | # |
||||||
1673 | # |
||||||
1674 | # |
||||||
1675 | # |
||||||
1676 | # | ||||||
1677 | 0 | 0 | if (my $image_elt = $where->first_child('image')) { | ||||
1678 | 0 | my $url_elt; # XML::Twig::Elt where the url came from | |||||
1679 | my $url; # url string | ||||||
1680 | 0 | 0 | if ($url_elt = $image_elt->first_child('url')) { | ||||
1681 | 0 | $url = $url_elt->trimmed_text; | |||||
1682 | } else { | ||||||
1683 | # Cooper Hewitt museum http://blog.cooperhewitt.org/rss/?limit=10 | ||||||
1684 | # item |
||||||
1685 | # |
||||||
1686 | # ]]> | ||||||
1687 | # | ||||||
1688 | # don't want to encourage dodginess like this, but picking it out | ||||||
1689 | # isn't too hard | ||||||
1690 | 0 | 0 | if ($image_elt->text =~ / |
||||
1691 | ### image from html: $1 | ||||||
1692 | 0 | $url_elt = $image_elt; | |||||
1693 | 0 | $url = $1; | |||||
1694 | } | ||||||
1695 | } | ||||||
1696 | 0 | 0 | if (is_non_empty ($url)) { | ||||
1697 | 0 | my $width = $image_elt->first_child_text('width'); | |||||
1698 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
1699 | 0 | $width = 0; | |||||
1700 | } | ||||||
1701 | 0 | my $height = $image_elt->first_child_text('height'); | |||||
1702 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
1703 | 0 | $height = 0; | |||||
1704 | } | ||||||
1705 | ### item_image_uwh() RSS: $url | ||||||
1706 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($url_elt, $url), | |||||
1707 | $width, $height); | ||||||
1708 | } | ||||||
1709 | } | ||||||
1710 | |||||||
1711 | # Atom channel |
||||||
1712 | # or channel |
||||||
1713 | # | ||||||
1714 | # |
||||||
1715 | # is bigger than the RSS 48x48, so would probably need shrinking. Rate | ||||||
1716 | # it below |
||||||
1717 | # | ||||||
1718 | # |
||||||
1719 | # Is it better to show the channel icon, being the From person? | ||||||
1720 | { | ||||||
1721 | 0 | my $elt; | |||||
0 | |||||||
1722 | 0 | my ($width, $height); | |||||
1723 | my $url = ((($elt = $where->first_child('icon')) | ||||||
1724 | && non_empty ($elt->text)) | ||||||
1725 | || (($elt = $where->first_child('logo')) | ||||||
1726 | && non_empty ($elt->text)) | ||||||
1727 | || (($elt = $where->first_child('itunes:image')) | ||||||
1728 | && non_empty ($elt->att('href'))) | ||||||
1729 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1730 | && is_non_empty ($elt->att('url')) | ||||||
1731 | && do { | ||||||
1732 | $width = $elt->att('width'); | ||||||
1733 | $height = $elt->att('height'); | ||||||
1734 | $elt->att('url') }) | ||||||
1735 | # seen att('atom:url' rather than plain 'url' ... | ||||||
1736 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1737 | && is_non_empty ($elt->att('atom:url')) | ||||||
1738 | 0 | 0 | && do { | ||||
1739 | $width = $elt->att('width'); | ||||||
1740 | $height = $elt->att('height'); | ||||||
1741 | $elt->att('atom:url') })); | ||||||
1742 | ### $url | ||||||
1743 | 0 | 0 | if ($url) { | ||||
1744 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
1745 | 0 | $width = 0; | |||||
1746 | } | ||||||
1747 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
1748 | 0 | $height = 0; | |||||
1749 | } | ||||||
1750 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
1751 | $width, | ||||||
1752 | $height); | ||||||
1753 | } | ||||||
1754 | } | ||||||
1755 | |||||||
1756 | # status.net for rss 1.0 | ||||||
1757 | # |
||||||
1758 | 0 | 0 | if (my $elt = $where->first_child('statusnet:postIcon')) { | ||||
1759 | 0 | 0 | if (is_non_empty (my $url = $elt->att('rdf:resource'))) { | ||||
1760 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
1761 | 0, 0); # unknown size | ||||||
1762 | } | ||||||
1763 | } | ||||||
1764 | |||||||
1765 | # |
||||||
1766 | # eg. from blogger.com | ||||||
1767 | # |
||||||
1768 | { | ||||||
1769 | 0 | my $elt; | |||||
0 | |||||||
1770 | 0 | 0 | 0 | if (($elt = $where->first_child('author')) | |||
0 | |||||||
0 | |||||||
1771 | && ($elt = $elt->first_child('gd:image')) | ||||||
1772 | && (is_non_empty (my $url = $elt->att('src') // $elt->att('atom:src')))) { | ||||||
1773 | ### $url | ||||||
1774 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | ||||
0 | |||||||
1775 | $elt->att('width') || $elt->att('atom:width') || 0, | ||||||
1776 | $elt->att('height') || $elt->att('atom:height') || 0); | ||||||
1777 | } | ||||||
1778 | } | ||||||
1779 | } | ||||||
1780 | 0 | return; | |||||
1781 | } | ||||||
1782 | @known{qw(/channel/logo | ||||||
1783 | /channel/icon | ||||||
1784 | /channel/image | ||||||
1785 | /channel/image/url | ||||||
1786 | /channel/image/width | ||||||
1787 | /channel/image/height | ||||||
1788 | /channel/image/title | ||||||
1789 | /channel/image/link | ||||||
1790 | /channel/image/description | ||||||
1791 | /channel/itunes:image | ||||||
1792 | /channel/statusnet:postIcon | ||||||
1793 | |||||||
1794 | /channel/item/image | ||||||
1795 | /channel/item/media:thumbnail | ||||||
1796 | /channel/item/statusnet:postIcon | ||||||
1797 | )} = (); | ||||||
1798 | |||||||
1799 | # $resp is a HTTP::Response | ||||||
1800 | # return a string value for the Face: header, or undef if no icon | ||||||
1801 | sub http_resp_to_face { | ||||||
1802 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
1803 | 0 | 0 | $self->{'get_icon'} || return; | ||||
1804 | |||||||
1805 | 0 | 0 | my $uri = http_resp_favicon_uri($resp) || return; | ||||
1806 | 0 | $self->verbose (2, ' response favicon URI: ', $uri); | |||||
1807 | 0 | return $self->download_face ($uri, 0, 0); | |||||
1808 | } | ||||||
1809 | |||||||
1810 | # $resp is a HTTP::Response | ||||||
1811 | # if it's a html with a favicon link return a URI object of that image | ||||||
1812 | # | ||||||
1813 | # http://www.w3.org/2005/10/howto-favicon | ||||||
1814 | # | ||||||
1815 | sub http_resp_favicon_uri { | ||||||
1816 | 0 | 0 | 0 | my ($resp) = @_; | |||
1817 | 0 | 0 | $resp->headers->content_is_html || return; | ||||
1818 | 0 | require HTML::Parser; | |||||
1819 | 0 | my $href; | |||||
1820 | my $p; | ||||||
1821 | $p = HTML::Parser->new (api_version => 3, | ||||||
1822 | start_h => [ sub { | ||||||
1823 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
1824 | 0 | 0 | 0 | if ($tagname eq 'link' | |||
1825 | && $attr->{'rel'} eq 'icon') { | ||||||
1826 | 0 | $href = $attr->{'href'}; | |||||
1827 | 0 | $p->eof; | |||||
1828 | } | ||||||
1829 | 0 | }, "tagname, attr"]); | |||||
1830 | 0 | $resp->decode; | |||||
1831 | 0 | $p->parse ($resp->content); | |||||
1832 | 0 | 0 | return $href && URI->new_abs ($href, $resp->base); | ||||
1833 | } | ||||||
1834 | |||||||
1835 | # return base64 string value for "Face:" header | ||||||
1836 | # $width and $height are from attributes if known, or 0 if not | ||||||
1837 | sub download_face { | ||||||
1838 | 0 | 0 | 0 | my ($self, $uri, $width, $height) = @_; | |||
1839 | 0 | my $key = $uri->canonical->as_string; | |||||
1840 | 0 | 0 | if (! exists $self->{'download_face'}->{$key}) { | ||||
1841 | 0 | $self->{'download_face'}->{$key} | |||||
1842 | = $self->download_face_uncached ($uri, $width, $height); | ||||||
1843 | } | ||||||
1844 | 0 | return $self->{'download_face'}->{$key}; | |||||
1845 | } | ||||||
1846 | sub download_face_uncached { | ||||||
1847 | 0 | 0 | 0 | my ($self, $url, $width, $height) = @_; | |||
1848 | |||||||
1849 | 0 | $self->{'download_face_uncached'} = $url; | |||||
1850 | 0 | $self->verbose (1, ' image download: ', $url); | |||||
1851 | |||||||
1852 | 0 | require HTTP::Request; | |||||
1853 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
1854 | 0 | my $resp = $self->ua->request($req); | |||||
1855 | 0 | 0 | if (! $resp->is_success) { | ||||
1856 | 0 | print __x(" no image: {status}\n", | |||||
1857 | status => $resp->status_line); | ||||||
1858 | 0 | return; | |||||
1859 | } | ||||||
1860 | |||||||
1861 | 0 | my $type = $resp->content_type; | |||||
1862 | ### $type | ||||||
1863 | # FIXME: is mime=>$type the right way? could give it a look at the url | ||||||
1864 | # basename or server's suggested filename too, for Read() to use the | ||||||
1865 | # extension. | ||||||
1866 | 0 | 0 | 0 | if ($type eq 'image/vnd.microsoft.icon' || $type eq 'image/x-icon') { | |||
0 | |||||||
1867 | # mime.xml of imagemagick 6.6.0 only has "image/x-ico", and nothing for | ||||||
1868 | # ico in magic.xml | ||||||
1869 | 0 | $type = 'ico'; | |||||
1870 | } elsif ($type =~ m{^image/(.*)$}i) { | ||||||
1871 | 0 | $type = $1; | |||||
1872 | } else { | ||||||
1873 | 0 | $self->verbose (2, 'ignore non-image icon type: ',$type); | |||||
1874 | 0 | return; | |||||
1875 | } | ||||||
1876 | |||||||
1877 | 0 | $resp->decode; | |||||
1878 | 0 | my $data = $resp->content; | |||||
1879 | 0 | 0 | 0 | if ($type ne 'png' | |||
0 | |||||||
0 | |||||||
0 | |||||||
1880 | || $width == 0 || $height == 0 | ||||||
1881 | || $width > 48 || $height > 48) { | ||||||
1882 | 0 | 0 | $data = $self->imagemagick_to_png($type,$data) // return; | ||||
1883 | } | ||||||
1884 | 0 | $self->verbose (2, " image for Face ",length($data)," bytes"); | |||||
1885 | |||||||
1886 | # use a space as a separator since MIME::Entity will collapse out a | ||||||
1887 | # newline and make an enormous long word which then can't be split across | ||||||
1888 | # header lines and will likely exceed the nntp 998 char single-line limit | ||||||
1889 | 0 | require MIME::Base64; | |||||
1890 | 0 | $data = MIME::Base64::encode_base64($data, " "); | |||||
1891 | ### $data | ||||||
1892 | |||||||
1893 | 0 | return $data; | |||||
1894 | } | ||||||
1895 | |||||||
1896 | sub face_wh_ok { | ||||||
1897 | 0 | 0 | 0 | my ($self, $width, $height) = @_; | |||
1898 | |||||||
1899 | 0 | 0 | 0 | if ($width > 0 && $width > 2*$height) { | |||
1900 | # some obnoxious banner | ||||||
1901 | 0 | $self->verbose (1, ' ', | |||||
1902 | __x('image is a banner ({width}x{height}), ignore', | ||||||
1903 | width => $width, height => $height)); | ||||||
1904 | 0 | return 0; | |||||
1905 | } | ||||||
1906 | 0 | return 1; | |||||
1907 | } | ||||||
1908 | |||||||
1909 | #------------------------------------------------------------------------------ | ||||||
1910 | # ImageMagick bits | ||||||
1911 | |||||||
1912 | # $type is "gif", "ico" etc, $data is an image in a byte string | ||||||
1913 | # return a byte string of png, or undef if $data unrecognised | ||||||
1914 | sub imagemagick_to_png { | ||||||
1915 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
1916 | ### $type | ||||||
1917 | 0 | 0 | my $image = $self->imagemagick_from_data($type,$data) // return; | ||||
1918 | |||||||
1919 | 0 | my $width = $image->Get('width'); | |||||
1920 | 0 | my $height = $image->Get('height'); | |||||
1921 | ### compress: $image->Get('compression') | ||||||
1922 | 0 | $self->verbose (2, " image ${width}x${height}"); | |||||
1923 | 0 | 0 | 0 | if ($width == 0 || $height == 0) { | |||
1924 | 0 | return; | |||||
1925 | } | ||||||
1926 | 0 | 0 | 0 | if ($width <= 48 && $height <= 48 && $type eq 'png') { | |||
0 | |||||||
1927 | 0 | return $data; | |||||
1928 | } | ||||||
1929 | |||||||
1930 | # having downloaded the image is it better to keep a banner but shrink it, | ||||||
1931 | # or discard as no good? | ||||||
1932 | # | ||||||
1933 | # $self->face_wh_ok ($width, $height) || return; | ||||||
1934 | |||||||
1935 | 0 | 0 | 0 | if ($width > 48 || $height > 48) { | |||
1936 | 0 | my $factor; | |||||
1937 | 0 | 0 | 0 | if ($width <= 2*48 && $height <= 2*48) { | |||
1938 | 0 | $factor = 0.5; | |||||
1939 | } else { | ||||||
1940 | 0 | $factor = min (48 / $width, 48 / $height); | |||||
1941 | } | ||||||
1942 | 0 | $width = POSIX::ceil ($width * $factor); | |||||
1943 | 0 | $height = POSIX::ceil ($height * $factor); | |||||
1944 | 0 | $self->verbose (2, " image shrink by $factor to ${width}x${height}"); | |||||
1945 | # cf LiquidResize() or plain Resize() | ||||||
1946 | 0 | $image->AdaptiveResize (width => $width, height => $height); | |||||
1947 | } | ||||||
1948 | |||||||
1949 | 0 | my $ret = $image->Set (magick => 'PNG8'); | |||||
1950 | ### ret: "$ret" | ||||||
1951 | ### ret: $ret+0 | ||||||
1952 | 0 | 0 | if ($ret != 0) { | ||||
1953 | 0 | print "oops, imagemagick doesn't like PNG8: $ret\n"; | |||||
1954 | 0 | return; | |||||
1955 | } | ||||||
1956 | ### compress: $image->Get('compression') | ||||||
1957 | |||||||
1958 | # $image->Write ('/tmp/x.png'); | ||||||
1959 | 0 | ($data) = $image->ImageToBlob (); | |||||
1960 | 0 | return $data; | |||||
1961 | } | ||||||
1962 | |||||||
1963 | |||||||
1964 | # $type is "png", "ico" etc, $data is an image in a byte string | ||||||
1965 | # return a Image::Magick object, or undef if Perl-Magick not available | ||||||
1966 | sub imagemagick_from_data { | ||||||
1967 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
1968 | ### imagemagick_from_data(): $type | ||||||
1969 | 0 | 0 | eval { require Image::Magick } or return; | ||||
0 | |||||||
1970 | |||||||
1971 | 0 | my $image = Image::Magick->new (magick=>$type); | |||||
1972 | # $image->Set(debug=>'All'); | ||||||
1973 | 0 | my $ret = $image->BlobToImage ($data); | |||||
1974 | ### ret: "$ret" | ||||||
1975 | ### ret: $ret+0 | ||||||
1976 | 0 | 0 | if ($ret == 1) { | ||||
1977 | 0 | return $image; | |||||
1978 | } | ||||||
1979 | |||||||
1980 | # try again without the $type forced, in case bad Content-Type from http | ||||||
1981 | 0 | $image = Image::Magick->new; | |||||
1982 | # $image->Set(debug=>'All'); | ||||||
1983 | 0 | $ret = $image->BlobToImage ($data); | |||||
1984 | ### ret: "$ret" | ||||||
1985 | ### ret: $ret+0 | ||||||
1986 | 0 | 0 | if ($ret == 1) { | ||||
1987 | 0 | return $image; | |||||
1988 | } | ||||||
1989 | |||||||
1990 | 0 | print __x(" imagemagick doesn't like image data ({length} bytes) from {url}: {error}\n", | |||||
1991 | length => length($data), | ||||||
1992 | url => $self->{'download_face_uncached'}, | ||||||
1993 | error => $ret); | ||||||
1994 | 0 | return undef; | |||||
1995 | } | ||||||
1996 | |||||||
1997 | |||||||
1998 | #------------------------------------------------------------------------------ | ||||||
1999 | # XML::Liberal | ||||||
2000 | |||||||
2001 | use constant::defer have_xml_liberal => sub { | ||||||
2002 | 0 | 0 | my ($self) = @_; | ||||
2003 | 0 | 0 | 0 | if (eval { require XML::Liberal; 1 }) { | |||
0 | 0 | ||||||
0 | 0 | ||||||
2004 | 0 | 0 | return 1; | ||||
2005 | } | ||||||
2006 | 0 | 0 | $self->verbose (3, __x('XML::Liberal not available: {error}', error => $@)); | ||||
2007 | 0 | 0 | return 0; | ||||
2008 | 1 | 1 | 14 | }; | |||
1 | 2 | ||||||
1 | 13 | ||||||
2009 | |||||||
2010 | # try to correct $xmlstr | ||||||
2011 | # if successful return a new xml string, otherwise return undef | ||||||
2012 | sub xml_liberal_correction { | ||||||
2013 | 0 | 0 | 0 | my ($self, $xmlstr) = @_; | |||
2014 | 0 | 0 | $self->have_xml_liberal or return; | ||||
2015 | |||||||
2016 | ### try XML-Liberal ... | ||||||
2017 | 0 | my $liberal = XML::Liberal->new('LibXML'); | |||||
2018 | 0 | 0 | if (my $doc = eval { $liberal->parse_string($xmlstr) }) { | ||||
0 | |||||||
2019 | 0 | return $doc->toString; | |||||
2020 | } else { | ||||||
2021 | 0 | $self->verbose (2, __x('XML::Liberal parse error: {error}', error => $@)); | |||||
2022 | 0 | return undef; | |||||
2023 | } | ||||||
2024 | } | ||||||
2025 | |||||||
2026 | |||||||
2027 | #------------------------------------------------------------------------------ | ||||||
2028 | # error as news message | ||||||
2029 | |||||||
2030 | sub error_message { | ||||||
2031 | 0 | 0 | 0 | my ($self, $subject, $message, $attach_bytes) = @_; | |||
2032 | |||||||
2033 | 0 | require Encode; | |||||
2034 | 0 | my $charset = 'utf-8'; | |||||
2035 | 0 | $message = str_ensure_newline ($message); | |||||
2036 | 0 | $message = Encode::encode ($charset, $message, Encode::FB_DEFAULT()); | |||||
2037 | |||||||
2038 | 0 | my $date = rfc822_time_now(); | |||||
2039 | 0 | require Digest::MD5; | |||||
2040 | 0 | my $msgid = $self->url_to_msgid | |||||
2041 | ('http://localhost', | ||||||
2042 | Digest::MD5::md5_base64 ($date.$subject.$message)); | ||||||
2043 | |||||||
2044 | 0 | my $top = $self->mime_build | |||||
2045 | ({ | ||||||
2046 | 'Path:' => 'localhost', | ||||||
2047 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
2048 | From => __('RSS2Leafnode').' |
||||||
2049 | Subject => $subject, | ||||||
2050 | Date => $date, | ||||||
2051 | 'Message-ID' => $msgid, | ||||||
2052 | }, | ||||||
2053 | Top => 1, | ||||||
2054 | Type => 'text/plain', | ||||||
2055 | Charset => $charset, | ||||||
2056 | Data => $message); | ||||||
2057 | |||||||
2058 | 0 | 0 | if (defined $attach_bytes) { | ||||
2059 | 0 | $top->make_multipart; | |||||
2060 | 0 | my $part = $self->mime_build | |||||
2061 | ({}, | ||||||
2062 | Charset => 'none', | ||||||
2063 | Type => 'application/octet-stream', | ||||||
2064 | Data => $attach_bytes); | ||||||
2065 | 0 | $top->add_part ($part); | |||||
2066 | } | ||||||
2067 | |||||||
2068 | 0 | mime_entity_lines($top); | |||||
2069 | 0 | 0 | $self->nntp_post($top) || return; | ||||
2070 | 0 | say __x('{group} 1 new article', group => $self->{'nntp_group'}); | |||||
2071 | } | ||||||
2072 | |||||||
2073 | |||||||
2074 | #------------------------------------------------------------------------------ | ||||||
2075 | # fetch HTML | ||||||
2076 | |||||||
2077 | sub http_resp_to_from { | ||||||
2078 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2079 | ### http_resp_to_from() | ||||||
2080 | 0 | 0 | return $self->http_resp_exiftool_author($resp) | ||||
2081 | // 'nobody@'.$self->uri_to_host; | ||||||
2082 | } | ||||||
2083 | sub http_resp_exiftool_author { | ||||||
2084 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2085 | # PNG Author field, or HTML author | ||||||
2086 | 0 | 0 | my $author = resp_exiftool_info($resp)->{'Author'} // return; | ||||
2087 | 0 | return $self->email_format_maybe (Encode::decode_utf8($author), '', undef); | |||||
2088 | } | ||||||
2089 | |||||||
2090 | sub http_resp_to_copyright { | ||||||
2091 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2092 | ### http_http_resp_to_copyright() ... | ||||||
2093 | |||||||
2094 | 0 | my @copyrights = non_empty($resp->header('X-Meta-Copyright')); | |||||
2095 | 0 | 0 | unless ($resp->content_type =~ m{^text/}) { | ||||
2096 | # PNG Copyright field, perhaps other formats | ||||||
2097 | 0 | push @copyrights, non_empty(resp_exiftool_info($resp)->{'Copyright'}); | |||||
2098 | } | ||||||
2099 | 0 | return \@copyrights; | |||||
2100 | } | ||||||
2101 | |||||||
2102 | # return a "Keywords:" string, or undef if nothing | ||||||
2103 | sub http_resp_to_keywords { | ||||||
2104 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2105 | ### http_resp_to_keywords() ... | ||||||
2106 | |||||||
2107 | 0 | my @keywords = $resp->header('X-Meta-Keywords'); | |||||
2108 | |||||||
2109 | 0 | 0 | if ($resp->headers->content_is_html) { | ||||
2110 | 0 | $resp->decode; | |||||
2111 | 0 | require HTML::Parser; | |||||
2112 | my $p = HTML::Parser->new | ||||||
2113 | (api_version => 3, | ||||||
2114 | report_tags => ['meta'], | ||||||
2115 | start_h => [ sub { | ||||||
2116 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
2117 | # facebook thing | ||||||
2118 | 0 | 0 | 0 | if ($tagname eq 'meta' | |||
0 | |||||||
2119 | && lc($attr->{'property'}||'') eq 'og:type') { | ||||||
2120 | 0 | push @keywords, $attr->{'content'}; | |||||
2121 | } | ||||||
2122 | |||||||
2123 | 0 | }, "tagname, attr" ]); | |||||
2124 | 0 | $p->parse ($resp->decoded_content); | |||||
2125 | } | ||||||
2126 | ### @keywords | ||||||
2127 | |||||||
2128 | 0 | return join_non_empty | |||||
2129 | 0 | (', ', List::MoreUtils::uniq(map {collapse_whitespace($_)} | |||||
2130 | @keywords)); | ||||||
2131 | } | ||||||
2132 | |||||||
2133 | sub fetch_html { | ||||||
2134 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
2135 | ### fetch_html() ... | ||||||
2136 | |||||||
2137 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
0 | |||||||
2138 | 0 | $self->verbose (1, __x('page: {url}', url => $url)); | |||||
2139 | |||||||
2140 | 0 | my $group_uri = URI->new($group,'news'); | |||||
2141 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
2142 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
2143 | 0 | 0 | $self->nntp_group_check($group) or return; | ||||
2144 | |||||||
2145 | 0 | require HTTP::Request; | |||||
2146 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
2147 | 0 | $self->status_etagmod_req ($req); | |||||
2148 | 0 | my $resp = $self->ua->request($req); | |||||
2149 | 0 | 0 | if ($resp->code == 304) { | ||||
2150 | 0 | $self->status_unchanged ($url); | |||||
2151 | 0 | return; | |||||
2152 | } | ||||||
2153 | 0 | 0 | if (! $resp->is_success) { | ||||
2154 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
2155 | url => $url, | ||||||
2156 | status => $resp->status_line); | ||||||
2157 | 0 | return; | |||||
2158 | } | ||||||
2159 | 0 | $self->verbose (2, $resp->headers->as_string); | |||||
2160 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
2161 | |||||||
2162 | # message id is either the etag if present, or an md5 of the content if not | ||||||
2163 | my $msgid = $self->url_to_msgid | ||||||
2164 | ($url, | ||||||
2165 | 0 | 0 | $resp->header('ETag') // do { | ||||
2166 | 0 | require Digest::MD5; | |||||
2167 | 0 | $resp->decode; | |||||
2168 | 0 | my $content = $resp->content; | |||||
2169 | 0 | Digest::MD5::md5_base64($content) | |||||
2170 | }); | ||||||
2171 | 0 | 0 | return 0 if $self->nntp_message_id_exists ($msgid); | ||||
2172 | |||||||
2173 | 0 | 0 | my $subject = (html_title($resp) | ||||
0 | |||||||
2174 | // $resp->filename | ||||||
2175 | # show original url in subject, not anywhere redirected | ||||||
2176 | // __x('RSS2Leafnode {url}', url => $url)); | ||||||
2177 | |||||||
2178 | 0 | my $from = $self->http_resp_to_from($resp); | |||||
2179 | 0 | my $date = $resp->header('Last-Modified'); | |||||
2180 | 0 | my $face = $self->http_resp_to_face($resp); | |||||
2181 | 0 | my $copyright = $self->http_resp_to_copyright($resp); | |||||
2182 | 0 | my $keywords = $self->http_resp_to_keywords($resp); | |||||
2183 | |||||||
2184 | 0 | my $part = $self->http_resp_extract_main($resp); | |||||
2185 | |||||||
2186 | 0 | my $top = $self->mime_part_from_response | |||||
2187 | ($resp, | ||||||
2188 | Top => 1, | ||||||
2189 | 'Path:' => scalar($self->uri_to_host), | ||||||
2190 | 'Newsgroups:' => $group, | ||||||
2191 | From => $from, | ||||||
2192 | Subject => $subject, | ||||||
2193 | Date => $date, | ||||||
2194 | 'Message-ID' => $msgid, | ||||||
2195 | Keywords => $keywords, | ||||||
2196 | 'Face:' => $face, | ||||||
2197 | 'X-Copyright:' => $copyright); | ||||||
2198 | 0 | 0 | if ($part) { | ||||
2199 | ### attach full part ... | ||||||
2200 | 0 | $top->make_multipart; | |||||
2201 | 0 | $top->add_part ($part); | |||||
2202 | } | ||||||
2203 | |||||||
2204 | 0 | mime_entity_lines($top); | |||||
2205 | 0 | 0 | $self->nntp_post($top) || return; | ||||
2206 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
2207 | 0 | say __x("{group} 1 new article", group => $group); | |||||
2208 | } | ||||||
2209 | |||||||
2210 | # $resp is a HTTP::Response | ||||||
2211 | # If the $self->{'html_extract_main'} option is true and $resp is html then | ||||||
2212 | # resplace the $resp content with HTML::ExtractMain extracted part. | ||||||
2213 | # | ||||||
2214 | sub http_resp_extract_main { | ||||||
2215 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2216 | |||||||
2217 | 0 | 0 | $self->{'html_extract_main'} or return; | ||||
2218 | 0 | 0 | $resp->headers->content_is_html() or return; | ||||
2219 | |||||||
2220 | 0 | 0 | my $full_part | ||||
2221 | = (defined $self->{'html_extract_main'} | ||||||
2222 | && $self->{'html_extract_main'} eq 'attach_full' | ||||||
2223 | && $self->mime_part_from_response($resp, | ||||||
2224 | Disposition => "attachment")); | ||||||
2225 | |||||||
2226 | 0 | require HTML::ExtractMain; | |||||
2227 | 0 | HTML::ExtractMain->VERSION(0.63); # for output_type=>'html' | |||||
2228 | 0 | $resp->decode; # expand any compression | |||||
2229 | 0 | my $content = $resp->decoded_content; # as wide-chars | |||||
2230 | |||||||
2231 | # Output type 'html' differs from the default xhtml by a few entities, in | ||||||
2232 | # particular it avoids ' which is an xml-ism not in the html standards. | ||||||
2233 | # Various browsers support &apos anyway, but not for example by w3m. | ||||||
2234 | 0 | $content = HTML::ExtractMain::extract_main_html($content, | |||||
2235 | output_type => 'html'); | ||||||
2236 | 0 | 0 | if (! defined $content) { | ||||
2237 | 0 | $self->verbose(1, __(" HTML::ExtractMain no main part found, posting whole")); | |||||
2238 | 0 | return; | |||||
2239 | } | ||||||
2240 | ### main extracted: $content | ||||||
2241 | 0 | $resp->remove_header('Content-MD5'); # since changed content | |||||
2242 | 0 | my $charset = $resp->content_charset; | |||||
2243 | 0 | $content = Encode::encode ($charset, $content); | |||||
2244 | 0 | $resp->content($content); | |||||
2245 | |||||||
2246 | 0 | return $full_part; | |||||
2247 | } | ||||||
2248 | |||||||
2249 | #------------------------------------------------------------------------------ | ||||||
2250 | # RSS hacks | ||||||
2251 | |||||||
2252 | # This is a hack for Yahoo Finance feed uniqification. | ||||||
2253 | # $item is a feed hashref. If it has 'link' field with a yahoo.com | ||||||
2254 | # redirection like | ||||||
2255 | # | ||||||
2256 | # http://au.rd.yahoo.com/finance/news/rss/financenews/*http://au.biz.yahoo.com/071003/30/1fdvx.html | ||||||
2257 | # | ||||||
2258 | # then return the last target url part. Otherwise return false. | ||||||
2259 | # | ||||||
2260 | # This allows the item to be identified by its final target link, so as to | ||||||
2261 | # avoid duplication when the item appears in multiple yahoo feeds with a | ||||||
2262 | # different leading part. (There's no guid in yahoo feeds, as of Oct 2007.) | ||||||
2263 | # | ||||||
2264 | sub item_yahoo_permalink { | ||||||
2265 | 0 | 0 | 0 | my ($item) = @_; | |||
2266 | 0 | 0 | my $url = $item->first_child_text('link') | ||||
2267 | // return undef; | ||||||
2268 | 0 | 0 | $url =~ m{^http://[^/]*yahoo\.com/.*\*(http://.*yahoo\.com.*)$} | ||||
2269 | or return undef; | ||||||
2270 | 0 | return $1; | |||||
2271 | } | ||||||
2272 | |||||||
2273 | # This is a special case for Google Groups RSS feeds. | ||||||
2274 | # The arguments are link elements [$name,$uri]. If there's a google groups | ||||||
2275 | # like "http://groups.google.com/group/cfcdev/msg/445d4ccfdabf086b" then | ||||||
2276 | # return a mailing list address like "cfcdev@googlegroups.com". If not in | ||||||
2277 | # that form then return undef. | ||||||
2278 | # | ||||||
2279 | sub googlegroups_link_email { | ||||||
2280 | ## no critic (RequireInterpolationOfMetachars) | ||||||
2281 | 0 | 0 | 0 | foreach my $l (@_) { | |||
2282 | 0 | 0 | 0 | if ($l->{'uri'} | |||
2283 | && $l->{'uri'}->canonical =~ m{^http://groups\.google\.com/group/([^/]+)/}) { | ||||||
2284 | 0 | return ($1 . '@googlegroups.com'); | |||||
2285 | } | ||||||
2286 | } | ||||||
2287 | 0 | return undef; | |||||
2288 | } | ||||||
2289 | |||||||
2290 | # This is a nasty hack for http://www.aireview.com.au/rss.php | ||||||
2291 | # $url is a link url string just fetched, $resp is a HTTP::Response. The | ||||||
2292 | # return is a possibly new HTTP::Response object. | ||||||
2293 | # | ||||||
2294 | # The first fetch of an item link from aireview gives back content like | ||||||
2295 | # | ||||||
2296 | # | ||||||
2297 | # | ||||||
2298 | # plus some cookies in the headers. The URL "zz=1" in that line seems very | ||||||
2299 | # dodgy, it ends up going to the home page with mozilla. In any case a | ||||||
2300 | # fresh fetch of the link url with the cookies provided is enough to get the | ||||||
2301 | # actual content. | ||||||
2302 | # | ||||||
2303 | # The LWP::UserAgent::FramesReady module on cpan has a similar match of a | ||||||
2304 | # Refresh, for use with frames. It works by turning the response into a | ||||||
2305 | # "302 Moved temporarily" for LWP to follow. urlcheck.pl at | ||||||
2306 | # http://www.cpan.org/authors/id/P/PH/PHILMI/urlcheck-1.00.pl likewise | ||||||
2307 | # follows. But alas both obey the URL given in the , which is no good | ||||||
2308 | # here. | ||||||
2309 | # | ||||||
2310 | sub aireview_follow { | ||||||
2311 | 0 | 0 | 0 | my ($self, $url, $resp) = @_; | |||
2312 | |||||||
2313 | 0 | 0 | if ($resp->is_success) { | ||||
2314 | 0 | $resp->decode; | |||||
2315 | 0 | my $content = $resp->content; | |||||
2316 | 0 | 0 | if ($content =~ /]*Refresh[^>]*checkForCookies/i) { | ||||
2317 | 0 | $self->verbose (1, ' following aireview META Refresh with cookies'); | |||||
2318 | 0 | require HTTP::Request; | |||||
2319 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
2320 | 0 | $resp = $self->ua->request($req); | |||||
2321 | } | ||||||
2322 | } | ||||||
2323 | 0 | return $resp; | |||||
2324 | } | ||||||
2325 | |||||||
2326 | |||||||
2327 | #------------------------------------------------------------------------------ | ||||||
2328 | # RSS links | ||||||
2329 | |||||||
2330 | |||||||
2331 | # WordPress (http://wordpress.org/) pre 2.5 had a bug | ||||||
2332 | # http://core.trac.wordpress.org/ticket/6579 where it gave | ||||||
2333 | # type="appication/atom+xml" missing the "l" in "application/". | ||||||
2334 | # Don't want to workaround every bad generator, but this one is GPL | ||||||
2335 | # free and the past versions still found for instance in the | ||||||
2336 | # language log http://languagelog.ldc.upenn.edu/nll/ in Feb 2011 | ||||||
2337 | # | ||||||
2338 | sub mime_type_is_rss { | ||||||
2339 | 0 | 0 | 0 | my ($self, $type) = @_; | |||
2340 | 0 | return ($type =~ m{^appl?ication/atom\+xml$}); | |||||
2341 | } | ||||||
2342 | sub atom_link_is_rss { | ||||||
2343 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
2344 | 0 | 0 | my $type = $elt->att('atom:type') // $elt->att('type') // return 0; | ||||
0 | |||||||
2345 | 0 | return $self->mime_type_is_rss($type); | |||||
2346 | } | ||||||
2347 | |||||||
2348 | # $str is a string like "doi:10.1000/182" or "10.1000/182". | ||||||
2349 | # Return a url string like "http://doi.org/10.1000/182". | ||||||
2350 | sub doi_to_uri { | ||||||
2351 | 0 | 0 | 0 | my ($str) = @_; | |||
2352 | 0 | $str =~ s/^doi://; | |||||
2353 | |||||||
2354 | # DOI numbers can potentially include URI reserved characters, let URI.pm | ||||||
2355 | # percent encode them when necessary. | ||||||
2356 | 0 | my $uri = URI->new('http://doi.org/'); | |||||
2357 | 0 | $uri->path($str); | |||||
2358 | 0 | return $uri; | |||||
2359 | } | ||||||
2360 | |||||||
2361 | # return list of hashrefs, each being | ||||||
2362 | # { name => $str, | ||||||
2363 | # uri => $uri_object, | ||||||
2364 | # download => $boolean, | ||||||
2365 | # priority => $number, | ||||||
2366 | # } | ||||||
2367 | # | ||||||
2368 | # Links are listed from highest to lowest priority. The current priority | ||||||
2369 | # levels are | ||||||
2370 | # 0 plain links | ||||||
2371 | # -10 comment RSS | ||||||
2372 | # -20 author home page | ||||||
2373 | # -100 geo location text-only | ||||||
2374 | # -101 statusnet geo location | ||||||
2375 | # -200 |
||||||
2376 | # | ||||||
2377 | sub item_to_links { | ||||||
2378 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2379 | |||||||
2380 | # |
||||||
2381 | # something has been expanded into the item, or should it be shown? | ||||||
2382 | |||||||
2383 | # FIXME: |
||||||
2384 | # |
||||||
2385 | # Can have a |
||||||
2386 | # | ||||||
2387 | # ENHANCE-ME: |
||||||
2388 | # | ||||||
2389 | # FIXME: |
||||||
2390 | # formats etc. Have seen this from archive.org just duplicating | ||||||
2391 | # |
||||||
2392 | # | ||||||
2393 | # |
||||||
2394 | # case. | ||||||
2395 | # | ||||||
2396 | 0 | my @elts = $item->children (qr/^(link | |||||
2397 | |enclosure | ||||||
2398 | |content | ||||||
2399 | |wiki:diff | ||||||
2400 | |wiki:history | ||||||
2401 | |comments | ||||||
2402 | |wfw:comment | ||||||
2403 | |wfw:commentRss | ||||||
2404 | |foaf:maker | ||||||
2405 | |sioc:has_creator | ||||||
2406 | |sioc:has_discussion | ||||||
2407 | |sioc:links_to | ||||||
2408 | |sioc:reply_of | ||||||
2409 | |statusnet:origin | ||||||
2410 | |dc:source | ||||||
2411 | |prism:url | ||||||
2412 | )$/ix); | ||||||
2413 | ### link elts: "@elts" | ||||||
2414 | |||||||
2415 | 0 | my @links; | |||||
2416 | 0 | foreach my $elt (@elts) { | |||||
2417 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
2418 | 0 | require Text::Wrap; | |||||
2419 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long URLs etc | |||||
2420 | 0 | $self->verbose (2, "link\n", Text::Trim::trim($elt->sprint)); | |||||
2421 | } | ||||||
2422 | |||||||
2423 | 0 | my $tag = lc($elt->tag); | |||||
2424 | ### $tag | ||||||
2425 | 0 | 0 | 0 | if ($tag eq 'content' && atom_content_flavour($elt) ne 'link') { | |||
2426 | 0 | next; | |||||
2427 | } | ||||||
2428 | 0 | my $l = { download => 1 }; | |||||
2429 | |||||||
2430 | 0 | foreach my $name ('hreflang', 'title', 'type') { | |||||
2431 | 0 | 0 | $l->{$name} = ($elt->att("atom:$name") // $elt->att($name)); | ||||
2432 | } | ||||||
2433 | |||||||
2434 | 0 | 0 | my $rel = non_empty($elt->att('atom:rel') // $elt->att('rel')); | ||||
2435 | 0 | 0 | if (defined $rel) { | ||||
2436 | # Atom rel="..." | ||||||
2437 | # Maybe: if ($rel eq 'next') ... # not sure about "next" link | ||||||
2438 | |||||||
2439 | 0 | 0 | 0 | if ($rel eq 'self' # the feed itself (in the channel normally) | |||
0 | |||||||
0 | |||||||
2440 | || $rel eq 'edit' # to edit the item, maybe | ||||||
2441 | || $rel eq 'service.edit' # to edit the item | ||||||
2442 | || $rel eq 'license' # probably only in the channel part normally | ||||||
2443 | ) { | ||||||
2444 | 0 | $self->verbose (1, ' ', __x('skip link "{type}"', type => $rel)); | |||||
2445 | 0 | next; | |||||
2446 | } | ||||||
2447 | 0 | 0 | if ($rel eq 'alternate') { | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2448 | # "alternate" is supposed to be the content as the entry, but in a | ||||||
2449 | # web page or something. Not sure that's always quite true, so show | ||||||
2450 | # it as a plain link. If no |
||||||
2451 | # supposed to be mandatory. | ||||||
2452 | |||||||
2453 | } elsif ($rel eq 'enclosure') { | ||||||
2454 | 0 | $l->{'name'} = __('Encl'); | |||||
2455 | |||||||
2456 | } elsif ($rel eq 'ostatus:conversation') { | ||||||
2457 | 0 | $l->{'name'} = __('Conversation'); | |||||
2458 | 0 | $l->{'download'} = 0; | |||||
2459 | |||||||
2460 | } elsif ($rel eq 'ostatus:attention') { | ||||||
2461 | 0 | $l->{'name'} = __('Attention'); | |||||
2462 | 0 | $l->{'download'} = 0; | |||||
2463 | |||||||
2464 | } elsif ($rel eq 'related') { | ||||||
2465 | 0 | $l->{'name'} = __('Related'); | |||||
2466 | |||||||
2467 | } elsif ($rel eq 'replies') { | ||||||
2468 | # Atom | ||||||
2469 | 0 | my $count = $self->item_elt_comments_count($item,$elt); | |||||
2470 | 0 | 0 | if ($self->atom_link_is_rss($elt)) { | ||||
2471 | 0 | 0 | $l->{'name'} = (defined $count | ||||
2472 | ? __x('RSS Replies({count})', count => $count) | ||||||
2473 | : __('RSS Replies')); | ||||||
2474 | 0 | $l->{'priority'} = -10; | |||||
2475 | } else { | ||||||
2476 | 0 | 0 | $l->{'name'} = (defined $count | ||||
2477 | ? __x('Replies({count})', count => $count) | ||||||
2478 | : __('Replies')); | ||||||
2479 | } | ||||||
2480 | 0 | $l->{'download'} = 0; | |||||
2481 | |||||||
2482 | } elsif ($rel eq 'service.post') { | ||||||
2483 | 0 | $l->{'name'} = __('Comments'); | |||||
2484 | 0 | $l->{'download'} = 0; | |||||
2485 | |||||||
2486 | } elsif ($rel eq 'via') { | ||||||
2487 | 0 | $l->{'name'} = __('Via'); | |||||
2488 | 0 | $l->{'download'} = 0; | |||||
2489 | |||||||
2490 | } else { | ||||||
2491 | 0 | $l->{'name'} = __x('{linkrel}', linkrel => $rel); | |||||
2492 | } | ||||||
2493 | |||||||
2494 | } else { # ! defined $rel | ||||||
2495 | # tags without rel="" attribute | ||||||
2496 | # | ||||||
2497 | 0 | 0 | if ($tag eq 'enclosure') { | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2498 | 0 | $l->{'name'} = __('Encl'); | |||||
2499 | |||||||
2500 | } elsif ($tag eq 'dc:source') { | ||||||
2501 | # might be free form text or might be url or other formal system | ||||||
2502 | 0 | $l->{'name'} = __('Source'); | |||||
2503 | 0 | $l->{'download'} = 0; | |||||
2504 | 0 | $l->{'maybe_text'} = 1; | |||||
2505 | 0 | $l->{'priority'} = -10; | |||||
2506 | |||||||
2507 | } elsif ($tag eq 'wiki:diff') { | ||||||
2508 | 0 | $l->{'name'} = __('Diff'); | |||||
2509 | |||||||
2510 | } elsif ($tag eq 'wiki:history') { | ||||||
2511 | 0 | $l->{'name'} = __('History'); | |||||
2512 | 0 | $l->{'download'} = 0; | |||||
2513 | |||||||
2514 | } elsif ($tag =~ /foaf:maker|sioc:has_creator/) { | ||||||
2515 | 0 | $l->{'name'} = __('Author'); | |||||
2516 | 0 | $l->{'download'} = 0; | |||||
2517 | 0 | $l->{'priority'} = -20; # low | |||||
2518 | |||||||
2519 | } elsif ($tag eq 'statusnet:origin') { | ||||||
2520 | 0 | $l->{'name'} = __('Geo location'); | |||||
2521 | 0 | $l->{'download'} = 0; | |||||
2522 | 0 | $l->{'priority'} = -101; # just after Geo location | |||||
2523 | |||||||
2524 | } elsif ($tag eq 'sioc:has_discussion') { | ||||||
2525 | 0 | $l->{'name'} = __('Discussion'); | |||||
2526 | 0 | $l->{'download'} = 0; | |||||
2527 | |||||||
2528 | } elsif ($tag eq 'wfw:commentrss') { | ||||||
2529 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
2530 | 0 | $l->{'name'} = __x('RSS Comments({count})', count => $count); | |||||
2531 | } else { | ||||||
2532 | 0 | $l->{'name'} = __('RSS Comments'); | |||||
2533 | } | ||||||
2534 | 0 | $l->{'download'} = 0; | |||||
2535 | 0 | $l->{'priority'} = -10; | |||||
2536 | |||||||
2537 | } elsif ($tag =~ /comment/) { # |
||||||
2538 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
2539 | 0 | $l->{'name'} = __x('Comments({count})', count => $count); | |||||
2540 | } else { | ||||||
2541 | 0 | $l->{'name'} = __('Comments'); | |||||
2542 | } | ||||||
2543 | 0 | $l->{'download'} = 0; | |||||
2544 | } | ||||||
2545 | } | ||||||
2546 | |||||||
2547 | # Atom | ||||||
2548 | # RSS http:.. | ||||||
2549 | # RSS |
||||||
2550 | 0 | 0 | $l->{'uri'} //= (non_empty ($elt->att('atom:href')) # Atom | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2551 | // non_empty ($elt->att('href')) # Atom | ||||||
2552 | // non_empty ($elt->att('atom:src')) # Atom |
||||||
2553 | // non_empty ($elt->att('src')) # Atom |
||||||
2554 | // non_empty ($elt->att('url')) # RSS |
||||||
2555 | # |
||||||
2556 | // non_empty ($elt->att('rdf:resource'))); | ||||||
2557 | 0 | 0 | if (defined $l->{'ur'}) { | ||||
2558 | 0 | $l->{'maybe_text'} = 0; # above are definite urls | |||||
2559 | } | ||||||
2560 | 0 | 0 | $l->{'uri'} //= (non_empty ($elt->trimmed_text) # RSS | ||||
0 | |||||||
2561 | // next); # no contents | ||||||
2562 | 0 | 0 | 0 | if ($l->{'maybe_text'} && $l->{'uri'} !~ m{^[a-z]+://}) { | |||
2563 | 0 | $l->{'name'} .= ': ' . delete $l->{'uri'}; | |||||
2564 | } else { | ||||||
2565 | 0 | $l->{'uri'} = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $l->{'uri'}); | |||||
2566 | } | ||||||
2567 | |||||||
2568 | 0 | 0 | $l->{'name'} //= __('Link'); | ||||
2569 | |||||||
2570 | 0 | my @paren; | |||||
2571 | # show length if biggish, often provided on enclosures but not plain | ||||||
2572 | # links | ||||||
2573 | 0 | 0 | 0 | if (defined (my $length = ($elt->att('atom:length') | |||
2574 | // $elt->att('length')))) { | ||||||
2575 | 0 | push @paren, $self->format_size_in_bytes($length); | |||||
2576 | } | ||||||
2577 | # |
||||||
2578 | # seconds, otherwise MM:SS or HH:MM:SS. | ||||||
2579 | 0 | 0 | 0 | if ($tag eq 'enclosure' | |||
2580 | && defined (my $duration = non_empty ($item->first_child_text('itunes:duration')))) { | ||||||
2581 | 0 | 0 | if ($duration !~ /:/) { | ||||
2582 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
2583 | duration => $duration); | ||||||
2584 | } | ||||||
2585 | 0 | push @paren, collapse_whitespace($duration); | |||||
2586 | } | ||||||
2587 | 0 | 0 | if (@paren) { | ||||
2588 | 0 | $l->{'name'} .= '('.join(', ',@paren). ')'; | |||||
2589 | } | ||||||
2590 | |||||||
2591 | ### push link: $l | ||||||
2592 | 0 | push @links, $l; | |||||
2593 | } | ||||||
2594 | |||||||
2595 | 0 | 0 | 0 | if (! $item->first_child('prism:url') | |||
2596 | && (my $elt = $item->first_child('prism:doi'))) { | ||||||
2597 | # Eg. http://www.nature.com/nature/current_issue/rss | ||||||
2598 | # which also has dc:identifier as the same DOI | ||||||
2599 | 0 | push @links, { name => __('DOI'), | |||||
2600 | uri => doi_to_uri($elt->trimmed_text), | ||||||
2601 | }; | ||||||
2602 | } | ||||||
2603 | |||||||
2604 | # eg. RSS |
||||||
2605 | # Atom |
||||||
2606 | # |
||||||
2607 | # |
||||||
2608 | # | ||||||
2609 | # |
||||||
2610 | # | ||||||
2611 | # | ||||||
2612 | # | ||||||
2613 | 0 | foreach my $elt ($item->children('source')) { | |||||
2614 | 0 | 0 | my $str = non_empty (elt_to_rendered_line($elt->first_child('title'))) | ||||
2615 | // non_empty ($elt->trimmed_text); | ||||||
2616 | 0 | 0 | if (defined $str) { | ||||
2617 | ### source: $str | ||||||
2618 | 0 | push @links, { name => __('Source') . ": $str", | |||||
2619 | download => 0, | ||||||
2620 | priority => -200, | ||||||
2621 | }; | ||||||
2622 | } | ||||||
2623 | 0 | foreach my $subelt | |||||
0 | |||||||
2624 | ($elt, | ||||||
2625 | grep {$self->atom_link_is_rss($_)} $elt->children('link') | ||||||
2626 | ) { | ||||||
2627 | 0 | 0 | 0 | if (defined $subelt | |||
0 | |||||||
0 | |||||||
2628 | && defined (my $url = non_empty ($subelt->att('url')) | ||||||
2629 | // non_empty ($subelt->att('href')) | ||||||
2630 | // non_empty ($subelt->att('atom:href')))) { | ||||||
2631 | 0 | push @links, { name => __('Source RSS'), | |||||
2632 | uri => App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri($subelt,$url), | ||||||
2633 | download => 0, | ||||||
2634 | priority => -200, | ||||||
2635 | }; | ||||||
2636 | } | ||||||
2637 | } | ||||||
2638 | } | ||||||
2639 | |||||||
2640 | # Merge together duplicate urls, so as not to download two copies as | ||||||
2641 | # attachments, and so as to make it clear when there's only one | ||||||
2642 | # destination for two things. | ||||||
2643 | # | ||||||
2644 | # Have seen same url under and |
||||||
2645 | # http://sourceforge.net/export/rss2_keepsake.php?group_id=203650 | ||||||
2646 | # or same url under and |
||||||
2647 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2648 | { | ||||||
2649 | 0 | my %seen; | |||||
0 | |||||||
2650 | 0 | @links = grep { | |||||
2651 | 0 | my $l = $_; | |||||
2652 | 0 | my $want = 1; | |||||
2653 | 0 | 0 | if (my $uri = $l->{'uri'}) { | ||||
2654 | 0 | my $canonical = $uri->canonical; | |||||
2655 | 0 | $canonical->fragment(undef); # ignore #foo anchor for uniqueness | |||||
2656 | 0 | 0 | if (my $prev_l = $seen{$canonical}) { | ||||
2657 | 0 | $want = 0; | |||||
2658 | 0 | 0 | $prev_l->{'download'} ||= $l->{'download'}; | ||||
2659 | 0 | 0 | $l->{'priority'} = max ($l->{'priority'}||0, | ||||
0 | |||||||
2660 | $prev_l->{'priority'}||0); | ||||||
2661 | |||||||
2662 | # prefer no anchor if have both with and without | ||||||
2663 | 0 | 0 | if (is_empty($l->{'uri'}->fragment)) { | ||||
2664 | 0 | $prev_l->{'uri'} = $l->{'uri'}; | |||||
2665 | } | ||||||
2666 | |||||||
2667 | 0 | 0 | if ($prev_l->{'name'} eq __('Link')) { | ||||
0 | |||||||
0 | |||||||
2668 | # name "Link" doesn't say much, prefer the other over "Link" | ||||||
2669 | 0 | $prev_l->{'name'} = $l->{'name'}; | |||||
2670 | } elsif ($l->{'name'} eq __('Link')) { | ||||||
2671 | # don't append "Link" to the previous | ||||||
2672 | } elsif ($l->{'name'} eq $prev_l->{'name'}) { | ||||||
2673 | # don't double the same name | ||||||
2674 | } else { | ||||||
2675 | 0 | $prev_l->{'name'} .= ", $l->{'name'}"; | |||||
2676 | } | ||||||
2677 | } | ||||||
2678 | 0 | $seen{$canonical} = $l; | |||||
2679 | } | ||||||
2680 | $want | ||||||
2681 | 0 | } @links; | |||||
2682 | } | ||||||
2683 | 0 | foreach my $l (@links) { | |||||
2684 | 0 | 0 | if ($l->{'uri'}) { | ||||
2685 | 0 | $l->{'name'} .= ':'; | |||||
2686 | } | ||||||
2687 | } | ||||||
2688 | |||||||
2689 | 0 | 0 | if (my $elt = $item->first_child('hlxcd:helex-company-data')) { | ||||
2690 | # Eg. http://www.helex.gr/rss-feeds | ||||||
2691 | # http://www.helex.gr/web/guest/rss-feeds/-/asset_publisher/companiesrss/custom-rss | ||||||
2692 | 0 | my $str = join_non_empty | |||||
2693 | (' ', | ||||||
2694 | $elt->first_child_text('hlxcd:company-ticker-symbol'), | ||||||
2695 | $elt->first_child_text('hlxcd:company-name')); | ||||||
2696 | 0 | 0 | if (is_non_empty($str)) { | ||||
2697 | 0 | push @links, { name => __('Company:').' '.$str, | |||||
2698 | download => 0, | ||||||
2699 | priority => -100, | ||||||
2700 | }; | ||||||
2701 | } | ||||||
2702 | } | ||||||
2703 | |||||||
2704 | 0 | 0 | if (defined (my $str = $self->item_to_lat_long_alt_str ($item))) { | ||||
2705 | 0 | push @links, { name => $str, | |||||
2706 | download => 0, | ||||||
2707 | priority => -100, # lat/long low priority | ||||||
2708 | }; | ||||||
2709 | } | ||||||
2710 | |||||||
2711 | # re:rank as for example from stackexchange.com | ||||||
2712 | # What does label="" usually show? Are parens like this good? | ||||||
2713 | 0 | foreach my $elt ($item->children('re:rank')) { | |||||
2714 | 0 | my $label = $elt->att('label'); | |||||
2715 | 0 | my $value = elt_to_rendered_line($elt); | |||||
2716 | 0 | 0 | push @links, { name => (defined $label | ||||
2717 | ? __x('Rank: {value} ({label})', value => $value, label => $label) | ||||||
2718 | : __x('Rank: {value}', value => $value)), | ||||||
2719 | download => 0, | ||||||
2720 | priority => -200, # low priority | ||||||
2721 | }; | ||||||
2722 | } | ||||||
2723 | |||||||
2724 | # eg. |
||||||
2725 | # is there any value in the role="" part? | ||||||
2726 | 0 | foreach my $elt ($item->children('media:credit')) { | |||||
2727 | 0 | push @links, { name => __x('Credit: {who}', | |||||
2728 | who => scalar(elt_to_rendered_line($elt))), | ||||||
2729 | download => 0, | ||||||
2730 | priority => -200, # very low priority | ||||||
2731 | }; | ||||||
2732 | } | ||||||
2733 | |||||||
2734 | # |
||||||
2735 | # | ||||||
2736 | # Allow for empty |
||||||
2737 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2738 | # | ||||||
2739 | 0 | foreach my $elt ($item->children('itunes:explicit')) { | |||||
2740 | 0 | 0 | my $line = elt_to_rendered_line($elt) | ||||
2741 | // next; # skip empty |
||||||
2742 | 0 | push @links, { name => __x('Explicit: {value}', value => $line), | |||||
2743 | download => 0, | ||||||
2744 | priority => -200, # very low priority | ||||||
2745 | }; | ||||||
2746 | } | ||||||
2747 | |||||||
2748 | # |
||||||
2749 | # a fun kind of commentary thing | ||||||
2750 | 0 | foreach my $elt ($item->children('slash:department')) { | |||||
2751 | 0 | push @links, { name => __x('Department: {department}', | |||||
2752 | department => scalar(elt_to_rendered_line($elt))), | ||||||
2753 | download => 0, | ||||||
2754 | priority => -200, # very low priority | ||||||
2755 | }; | ||||||
2756 | } | ||||||
2757 | |||||||
2758 | 0 | return @links; | |||||
2759 | } | ||||||
2760 | @known{qw( | ||||||
2761 | /channel/item/pheedo:origLink | ||||||
2762 | /channel/item/feedburner:origLink | ||||||
2763 | |||||||
2764 | /channel/item/link | ||||||
2765 | /channel/item/enclosure | ||||||
2766 | /channel/item/dc:source | ||||||
2767 | /channel/item/dc:identifier | ||||||
2768 | /channel/item/comments | ||||||
2769 | /channel/item/wfw:comment | ||||||
2770 | /channel/item/wfw:commentRss | ||||||
2771 | /channel/item/slash:comments | ||||||
2772 | /channel/item/slash:hit_parade | ||||||
2773 | /channel/item/slash:department | ||||||
2774 | /channel/item/thr:total | ||||||
2775 | /channel/item/content --atom | ||||||
2776 | /channel/item/wiki:diff | ||||||
2777 | /channel/item/itunes:duration | ||||||
2778 | /channel/item/re:rank | ||||||
2779 | |||||||
2780 | /channel/wiki:interwiki | ||||||
2781 | /channel/wiki:interwiki/rdf:Description | ||||||
2782 | /channel/wiki:interwiki/rdf:Description/rdf:value | ||||||
2783 | /channel/item/wiki:version | ||||||
2784 | /channel/item/wiki:status | ||||||
2785 | /channel/item/wiki:history | ||||||
2786 | /channel/item/foaf:maker | ||||||
2787 | /channel/item/sioc:has_creator | ||||||
2788 | /channel/item/sioc:has_discussion | ||||||
2789 | /channel/item/sioc:links_to | ||||||
2790 | /channel/item/sioc:reply_of | ||||||
2791 | /channel/item/media:credit | ||||||
2792 | /channel/item/itunes:explicit | ||||||
2793 | /channel/item/itunes:block | ||||||
2794 | /channel/item/hlxcd:helex-company-data | ||||||
2795 | |||||||
2796 | --believed-to-be-duplicate-of-description | ||||||
2797 | /channel/item/media:content | ||||||
2798 | /channel/item/media:text | ||||||
2799 | |||||||
2800 | /channel/item/prism:doi | ||||||
2801 | /channel/item/prism:publicationName | ||||||
2802 | /channel/item/prism:publicationDate | ||||||
2803 | /channel/item/prism:url | ||||||
2804 | /channel/item/prism:volume | ||||||
2805 | /channel/item/prism:number | ||||||
2806 | /channel/item/prism:section | ||||||
2807 | /channel/item/prism:startingPage | ||||||
2808 | /channel/item/prism:endingPage | ||||||
2809 | )} = (); | ||||||
2810 | |||||||
2811 | # sub any_link_replies_nonfeed { | ||||||
2812 | # foreach my $elt (@_) { | ||||||
2813 | # # ### any_link_replies_nonfeed(): ref $elt, "$elt", $elt->tag | ||||||
2814 | # # my $rel = ($elt->att('atom:rel') // $elt->att('rel') // ''); | ||||||
2815 | # # my $type = ($elt->att('atom:type') // $elt->att('type') // ''); | ||||||
2816 | # # ### $rel | ||||||
2817 | # # ### $type | ||||||
2818 | # if ($elt->tag eq 'link' | ||||||
2819 | # && (($elt->att('atom:rel') // $elt->att('rel') // '') | ||||||
2820 | # eq 'replies') | ||||||
2821 | # && (($elt->atom_link_is_rss($elt))) { | ||||||
2822 | # return 1; | ||||||
2823 | # } | ||||||
2824 | # } | ||||||
2825 | # return 0; | ||||||
2826 | # } | ||||||
2827 | |||||||
2828 | # Return a string which is the latitude, longitude and possibly altitude | ||||||
2829 | # from the item. If no location in the item then return undef. | ||||||
2830 | # | ||||||
2831 | sub item_to_lat_long_alt_str { | ||||||
2832 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2833 | 0 | 0 | my ($lat, $long, $alt) = $self->item_to_lat_long_alt_values ($item) | ||||
2834 | or return; | ||||||
2835 | ### $lat | ||||||
2836 | ### $long | ||||||
2837 | ### $alt | ||||||
2838 | |||||||
2839 | 0 | 0 | if (Scalar::Util::looks_like_number($lat)) { | ||||
2840 | 0 | 0 | $lat = ($lat >= 0 | ||||
2841 | # TRANSLATORS: the latin1/unicode degree symbol can be used here | ||||||
2842 | # instead of " deg", if it will be recognised in translation, | ||||||
2843 | # etc. | ||||||
2844 | ? __x('{latitude} deg N', latitude => $lat) | ||||||
2845 | : __x('{latitude} deg S', latitude => -$lat)); | ||||||
2846 | } | ||||||
2847 | 0 | 0 | if (Scalar::Util::looks_like_number($long)) { | ||||
2848 | 0 | 0 | $long = ($long >= 0 | ||||
2849 | ? __x('{longitude} deg E', longitude => $long) | ||||||
2850 | : __x('{longitude} deg W', longitude => -$long)); | ||||||
2851 | } | ||||||
2852 | |||||||
2853 | 0 | 0 | if (is_non_empty ($alt)) { | ||||
2854 | 0 | return __x('Geo location: {latitude}, {longitude}, alt {altitude}m', | |||||
2855 | latitude => $lat, | ||||||
2856 | longitude => $long, | ||||||
2857 | altitude => $alt); | ||||||
2858 | } else { | ||||||
2859 | 0 | return __x('Geo location: {latitude}, {longitude}', | |||||
2860 | latitude => $lat, | ||||||
2861 | longitude => $long); | ||||||
2862 | } | ||||||
2863 | } | ||||||
2864 | |||||||
2865 | # Return a list of values which are the latitude, longitude and possibly | ||||||
2866 | # altitude extracted from $item. | ||||||
2867 | # | ||||||
2868 | # ($latitude, $longitude, $altitude) | ||||||
2869 | # ($latitude, $longitude) | ||||||
2870 | # () | ||||||
2871 | # | ||||||
2872 | # If no location then return an empty list. Some of the values returned | ||||||
2873 | # might be empty strings if say there's a |
||||||
2874 | # | ||||||
2875 | # Latitude is degrees North, or negative for South. Longitude is degrees | ||||||
2876 | # East, or negative for West. Both possibly with decimal places. | ||||||
2877 | # | ||||||
2878 | sub item_to_lat_long_alt_values { | ||||||
2879 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2880 | |||||||
2881 | # per-item |
||||||
2882 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
2883 | # |
||||||
2884 | # |
||||||
2885 | # |
||||||
2886 | # | ||||||
2887 | # or under geo:Point, maybe, eg. http://www.gdacs.org/xml/RSSTC.xml | ||||||
2888 | # |
||||||
2889 | # |
||||||
2890 | # |
||||||
2891 | # |
||||||
2892 | # | ||||||
2893 | 0 | foreach my $elt ($item, $item->children(qr/^geo:point$/i)) { | |||||
2894 | 0 | my $lat = $elt->first_child_trimmed_text('geo:lat'); | |||||
2895 | 0 | 0 | if (is_non_empty ($lat)) { | ||||
2896 | 0 | return ($lat, | |||||
2897 | $elt->first_child_trimmed_text('geo:long'), | ||||||
2898 | non_empty ($elt->first_child_trimmed_text('geo:alt'))); | ||||||
2899 | } | ||||||
2900 | } | ||||||
2901 | |||||||
2902 | # |
||||||
2903 | # |
||||||
2904 | # space separator per http://www.georss.org/Encodings | ||||||
2905 | { | ||||||
2906 | 0 | my $str = $item->first_child_trimmed_text ('georss:point'); | |||||
0 | |||||||
2907 | 0 | 0 | if (is_non_empty ($str)) { | ||||
2908 | 0 | return split(/\s+/, $str, 2); # no altitude | |||||
2909 | } | ||||||
2910 | } | ||||||
2911 | |||||||
2912 | # |
||||||
2913 | # | ||||||
2914 | # rdf:resource="http://sws.geonames.org/2638077/"> | ||||||
2915 | # | ||||||
2916 | 0 | 0 | if (my $elt = $item->first_child ('statusnet:origin')) { | ||||
2917 | 0 | 0 | if (defined (my $lat = $elt->att('geo:lat'))) { | ||||
2918 | 0 | my $long = $elt->att('geo:long'); | |||||
2919 | 0 | return ($lat, $long); | |||||
2920 | } | ||||||
2921 | } | ||||||
2922 | |||||||
2923 | 0 | return; # not found | |||||
2924 | } | ||||||
2925 | @known{qw(/channel/item/geo:lat | ||||||
2926 | /channel/item/geo:long | ||||||
2927 | /channel/item/geo:alt | ||||||
2928 | /channel/item/geo:Point | ||||||
2929 | /channel/item/geo:Point/geo:lat | ||||||
2930 | /channel/item/geo:Point/geo:long | ||||||
2931 | /channel/item/georss:point | ||||||
2932 | /channel/item/statusnet:origin | ||||||
2933 | )} = (); | ||||||
2934 | |||||||
2935 | |||||||
2936 | sub links_to_html { | ||||||
2937 | 0 | 0 | 0 | 0 | @_ or return ''; | ||
2938 | |||||||
2939 | # |
||||||
2940 | # up by a line-wrap, which can make it hard to cut and paste. can |
||||||
2941 | # prevent a line wrap, but it ends up treated as starting a paragraph, | ||||||
2942 | # separate from the 'name' part. | ||||||
2943 | # | ||||||
2944 | 0 | my $str = ''; | |||||
2945 | 0 | my $sep = "\n\n \n"; |
|||||
2946 | 0 | foreach my $l (@_) { | |||||
2947 | 0 | $str .= "$sep |
|||||
2948 | 0 | $sep = " \n"; |
|||||
2949 | |||||||
2950 | 0 | 0 | if (defined (my $uri = $l->{'uri'})) { | ||||
2951 | 0 | $str .= " | |||||
2952 | 0 | 0 | if (defined (my $hreflang = $l->{'hreflang'})) { | ||||
2953 | 0 | $str .= " hreflang=\"$Entitize{$hreflang}\""; | |||||
2954 | } | ||||||
2955 | 0 | 0 | if (defined (my $type = $l->{'type'})) { | ||||
2956 | 0 | $str .= " type=\"$Entitize{$type}\""; | |||||
2957 | } | ||||||
2958 | 0 | $uri = $Entitize{$uri}; | |||||
2959 | 0 | $str .= " href=\"$uri\">$uri"; | |||||
2960 | } | ||||||
2961 | 0 | $str .= "\n"; | |||||
2962 | } | ||||||
2963 | 0 | return "$str\n"; | |||||
2964 | } | ||||||
2965 | |||||||
2966 | sub links_to_text { | ||||||
2967 | 0 | 0 | 0 | return join ('', map { join_non_empty (' ', | |||
0 | |||||||
2968 | $_->{'name'}, | ||||||
2969 | $_->{'uri'}) . "\n" } @_); | ||||||
2970 | } | ||||||
2971 | |||||||
2972 | |||||||
2973 | #------------------------------------------------------------------------------ | ||||||
2974 | # "From:" and email addresses | ||||||
2975 | |||||||
2976 | 1 | 1 | 6680 | use constant DUMMY_EMAIL_ADDRESS => 'nobody@rss2leafnode.dummy'; | |||
1 | 3 | ||||||
1 | 9427 | ||||||
2977 | |||||||
2978 | { | ||||||
2979 | my %tag_to_link_name | ||||||
2980 | = (author => __('Author:'), | ||||||
2981 | creator => __('Creator:'), | ||||||
2982 | contributor => __('Contributor:'), | ||||||
2983 | managingEditor => __('Managing Editor:'), | ||||||
2984 | webMaster => __('Webmaster:'), | ||||||
2985 | publisher => __('Publisher:'), | ||||||
2986 | owner => __('Owner:'), | ||||||
2987 | username => __('User:'), | ||||||
2988 | ); | ||||||
2989 | |||||||
2990 | # Return ($from, $linkhash,$linkhash,...). | ||||||
2991 | # $from is a string like "foo@example.com". | ||||||
2992 | # Multiple authors are for example "foo@example.com, quux@example.com" as | ||||||
2993 | # per RFC5322 email, though currently no Sender: is picked out from among | ||||||
2994 | # them. | ||||||
2995 | # | ||||||
2996 | # Eg. |
||||||
2997 | # each author of an article http://www.nature.com/nature/current_issue/rss | ||||||
2998 | # | ||||||
2999 | sub item_to_from { | ||||||
3000 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3001 | ### item_to_from() ... | ||||||
3002 | 0 | my $channel = item_to_channel($item); | |||||
3003 | |||||||
3004 | # |
||||||
3005 | # looser. The RSS recommendation is |
||||||
3006 | # and |
||||||
3007 | # | ||||||
3008 | # |
||||||
3009 | # attribute. | ||||||
3010 | # | ||||||
3011 | # |
||||||
3012 | # | ||||||
3013 | # |
||||||
3014 | # to show just the primary author or authors. | ||||||
3015 | # | ||||||
3016 | 0 | my @from; | |||||
3017 | my @links; | ||||||
3018 | 0 | foreach my $try ([$item, 'author'], | |||||
3019 | [$item, 'jf:author'], | ||||||
3020 | [$item, 'slate:author'], | ||||||
3021 | [$item, 'dc:creator'], | ||||||
3022 | [$item, 'dc:contributor'], | ||||||
3023 | [$item, 'wiki:username'], | ||||||
3024 | [$item, 'itunes:author'], | ||||||
3025 | |||||||
3026 | [$channel, 'author'], | ||||||
3027 | [$channel, 'dc:creator'], | ||||||
3028 | [$channel, 'itunes:author'], | ||||||
3029 | [$channel, 'managingEditor'], | ||||||
3030 | [$channel, 'webMaster'], | ||||||
3031 | |||||||
3032 | [$item, 'dc:publisher'], | ||||||
3033 | [$channel, 'dc:publisher'], | ||||||
3034 | [$channel, 'itunes:owner'], | ||||||
3035 | ) { | ||||||
3036 | 0 | my ($where, $tag) = @$try; | |||||
3037 | ### $tag | ||||||
3038 | |||||||
3039 | 0 | 0 | if (my @elts = $item->children($tag)) { | ||||
3040 | 0 | foreach my $elt (@elts) { | |||||
3041 | ### elt for From: $elt->sprint | ||||||
3042 | 0 | push @from, $self->elt_to_email($elt); | |||||
3043 | |||||||
3044 | # author's home page etc as a link | ||||||
3045 | 0 | 0 | 0 | if (my $uri = | |||
0 | |||||||
3046 | (# Atom | ||||||
3047 | # |
||||||
3048 | # |
||||||
3049 | # |
||||||
3050 | # | ||||||
3051 | # | ||||||
3052 | non_empty ($elt->first_child_text('uri')) | ||||||
3053 | |||||||
3054 | # slate.com | ||||||
3055 | # |
||||||
3056 | // non_empty ($elt->att('url')) | ||||||
3057 | |||||||
3058 | # ModWiki dc:contributor example | ||||||
3059 | # |
||||||
3060 | # |
||||||
3061 | # | ||||||
3062 | # The text shows rss:link= and the example just link=. | ||||||
3063 | # | ||||||
3064 | // non_empty (do { | ||||||
3065 | 0 | 0 | 0 | my $child; ($child = $elt->first_child('rdf:Description')) | |||
0 | |||||||
3066 | && ($child->att('link') // $child->att('rss:link')) | ||||||
3067 | }))) { | ||||||
3068 | 0 | my $tag = $elt->tag; | |||||
3069 | 0 | $tag =~ s/.*?://; | |||||
3070 | 0 | 0 | push @links, { uri => URI->new($uri), | ||||
3071 | name => ($tag_to_link_name{$tag} // "\u$tag:"), | ||||||
3072 | download => 0, | ||||||
3073 | priority => -20 }; | ||||||
3074 | } | ||||||
3075 | } | ||||||
3076 | } | ||||||
3077 | 0 | 0 | last if @from; | ||||
3078 | } | ||||||
3079 | 0 | 0 | if (! @from) { | ||||
3080 | # Atom |
||||||
3081 | # Hope the channel title is different from the item title. | ||||||
3082 | 0 | @from = ($self->email_format (elt_to_rendered_line | |||||
3083 | ($channel->first_child('title')))); | ||||||
3084 | } | ||||||
3085 | 0 | 0 | if (! @from) { | ||||
3086 | 0 | @from = ('nobody@'.$self->uri_to_host); | |||||
3087 | } | ||||||
3088 | |||||||
3089 | ### @from | ||||||
3090 | 0 | return (join(', ',@from), | |||||
3091 | @links); | ||||||
3092 | } | ||||||
3093 | @known{qw(/channel/author | ||||||
3094 | /channel/author/name --atom | ||||||
3095 | /channel/author/uri --atom | ||||||
3096 | /channel/author/url --atom-typo-maybe | ||||||
3097 | /channel/author/email --atom | ||||||
3098 | /channel/managingEditor | ||||||
3099 | /channel/webMaster | ||||||
3100 | /channel/dc:publisher | ||||||
3101 | /channel/dc:creator | ||||||
3102 | /channel/itunes:author | ||||||
3103 | |||||||
3104 | /channel/item/author | ||||||
3105 | /channel/item/author/name --atom | ||||||
3106 | /channel/item/author/uri --atom | ||||||
3107 | /channel/item/author/url --atom-typo-maybe | ||||||
3108 | /channel/item/author/email --atom | ||||||
3109 | /channel/item/author/gd:extendedProperty --good-dinner | ||||||
3110 | /channel/item/dc:creator | ||||||
3111 | /channel/item/dc:publisher | ||||||
3112 | /channel/item/wiki:username | ||||||
3113 | /channel/item/itunes:author | ||||||
3114 | /channel/item/dc:contributor | ||||||
3115 | /channel/item/dc:contributor/rdf:Description | ||||||
3116 | /channel/item/dc:contributor/rdf:Description/rdf:value | ||||||
3117 | /channel/item/jf:author | ||||||
3118 | /channel/item/slate:author | ||||||
3119 | |||||||
3120 | /channel/item/contributor --atom | ||||||
3121 | /channel/item/contributor/name | ||||||
3122 | /channel/item/contributor/uri | ||||||
3123 | /channel/item/contributor/url --atom-typo-maybe | ||||||
3124 | /channel/item/contributor/email | ||||||
3125 | |||||||
3126 | /channel/item/activity:actor | ||||||
3127 | /channel/item/activity:verb --usually-post-or-something | ||||||
3128 | /channel/item/activity:object-type --is-this-anything | ||||||
3129 | )} = (); | ||||||
3130 | } | ||||||
3131 | |||||||
3132 | # $elt is an XML::Twig::Elt | ||||||
3133 | # Return an email address, either just the text part of $elt or Atom | ||||||
3134 | # sub-elements |
||||||
3135 | # If $elt is empty then return an empty list. | ||||||
3136 | # | ||||||
3137 | sub elt_to_email { | ||||||
3138 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
3139 | 0 | 0 | return unless defined $elt; | ||||
3140 | |||||||
3141 | # |
||||||
3142 | # |
||||||
3143 | 0 | my $email = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?email$/)); | |||||
3144 | |||||||
3145 | # |
||||||
3146 | # |
||||||
3147 | 0 | 0 | my $display = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?name$/)) | ||||
3148 | // ''; | ||||||
3149 | |||||||
3150 | my $maybe = join | ||||||
3151 | (' ', | ||||||
3152 | non_empty ($elt->text_only), | ||||||
3153 | 0 | non_empty (do { | |||||
3154 | # |
||||||
3155 | 0 | 0 | my $rdfdesc; ($rdfdesc = $elt->first_child('rdf:Description')) | ||||
0 | |||||||
3156 | && $rdfdesc->first_child_text('rdf:value') | ||||||
3157 | })); | ||||||
3158 | |||||||
3159 | 0 | return $self->email_format_maybe ($maybe, $display, $email); | |||||
3160 | } | ||||||
3161 | |||||||
3162 | # $mailbox_re is a mailbox with domain, like "foo@example.com" | ||||||
3163 | # Allows no dots like "foo@localhost". | ||||||
3164 | # Allows dashes like "www-something@example.com". | ||||||
3165 | # | ||||||
3166 | # $mailbox_with_comment_re allows an optional paren comment part like | ||||||
3167 | # "foo@example.com (Foo)" | ||||||
3168 | # | ||||||
3169 | # cf Email::Address $addr_spec, but its version 1.890 loosened to allow a | ||||||
3170 | # domain-less bare "foo", which is no good | ||||||
3171 | # | ||||||
3172 | my $words_with_dots_re = qr/[[:word:]-]+(\.[[:word:]-]+)*/; | ||||||
3173 | my $mailbox_re = qr/$words_with_dots_re\@$words_with_dots_re/o; | ||||||
3174 | my $mailbox_with_comment_re = qr/$mailbox_re(\s*\([^\)]*\))?/os; | ||||||
3175 | |||||||
3176 | # $maybe is some free-form author part possibly including a foo@example.com | ||||||
3177 | # $display is a display part for the email like "Foo", possibly empty "" | ||||||
3178 | # $email is a mailbox "foo@example.com", or undef | ||||||
3179 | # return an rfc822 "Foo |
||||||
3180 | # | ||||||
3181 | sub email_format_maybe { | ||||||
3182 | 0 | 0 | 0 | my ($self, $maybe, $display, $email) = @_; | |||
3183 | ### email_format_maybe() start | ||||||
3184 | ### $maybe | ||||||
3185 | ### $display | ||||||
3186 | |||||||
3187 | |||||||
3188 | # look also at $display the same in case Atom no |
||||||
3189 | # which is a mailbox and can be corrected, | ||||||
3190 | # eg. http://www.weather.gov/alerts-beta/hi.php?x=0 | ||||||
3191 | # | ||||||
3192 | # Or $maybe full email like | ||||||
3193 | 0 | 0 | if (is_empty($email)) { | ||||
3194 | 0 | foreach ($maybe, $display) { | |||||
3195 | |||||||
3196 | 0 | 0 | 0 | if (/^\s*(mailto:)?($mailbox_with_comment_re)\s*$/o) { | |||
0 | |||||||
3197 | ### maybe or display is a mailbox | ||||||
3198 | # "foo@example.com" | ||||||
3199 | # "mailto:foo@example.com" | ||||||
3200 | # "foo@example.com (Foo)" | ||||||
3201 | 0 | $email = $2; | |||||
3202 | 0 | undef $_; | |||||
3203 | 0 | last; | |||||
3204 | |||||||
3205 | } elsif (/(.*)\((mailto:)?($mailbox_re)\)\s*$/o | ||||||
3206 | || /(.*)<(mailto:)?($mailbox_re)>\s*$/o) { | ||||||
3207 | ### maybe or display part is display plus mailbox | ||||||
3208 | # "Foo (mailto:foo@example.com)" | ||||||
3209 | # "Foo (foo@example.com)" | ||||||
3210 | # "Foo |
||||||
3211 | # | ||||||
3212 | 0 | $_ = $1; | |||||
3213 | 0 | $email = $3; | |||||
3214 | 0 | last; | |||||
3215 | } | ||||||
3216 | } | ||||||
3217 | } | ||||||
3218 | |||||||
3219 | 0 | 0 | $display .= ' '.($maybe//''); | ||||
3220 | 0 | my $ret; | |||||
3221 | 0 | 0 | 0 | if (is_empty($email) && $display =~ /^$mailbox_re$/o) { | |||
3222 | # display or maybe is a "foo@example.com" or "foo@example.com (Foo)", | ||||||
3223 | # return it as-is, in particular leave it in "(Foo)" style comment | ||||||
3224 | 0 | $ret = $display; | |||||
3225 | } else { | ||||||
3226 | 0 | $ret = $self->email_format ($display, $email); | |||||
3227 | } | ||||||
3228 | |||||||
3229 | # Collapse whitespace against possible tabs and newlines in an |
||||||
3230 | # from googlegroups for instance. MIME::Entity seems to collapse | ||||||
3231 | # newlines, but not tabs. | ||||||
3232 | 0 | return non_empty (collapse_whitespace ($ret)); | |||||
3233 | } | ||||||
3234 | |||||||
3235 | # $display is a display part for the email "Foo", possibly empty "" | ||||||
3236 | # $email is a mailbox "foo@example.com", or undef or empty "" | ||||||
3237 | # return an rfc822 "Foo |
||||||
3238 | # | ||||||
3239 | sub email_format { | ||||||
3240 | 0 | 0 | 0 | my ($self, $display, $email) = @_; | |||
3241 | ### $display | ||||||
3242 | 0 | $display = Text::Trim::trim($display); | |||||
3243 | 0 | $email = Text::Trim::trim($email); | |||||
3244 | 0 | 0 | if (is_empty($display)) { | ||||
3245 | 0 | 0 | if (is_empty($email)) { | ||||
3246 | 0 | return; | |||||
3247 | } else { | ||||||
3248 | 0 | return $email; | |||||
3249 | } | ||||||
3250 | } | ||||||
3251 | 0 | 0 | if (is_empty($email)) { | ||||
3252 | # think can't have empty <> or omitted, otherwise the quoted part is | ||||||
3253 | # still parsed as an address, certainly it's not rfc822 compliant to | ||||||
3254 | # omit | ||||||
3255 | 0 | $email = 'nobody@'.$self->uri_to_host; | |||||
3256 | } else { | ||||||
3257 | 0 | $email = $email; | |||||
3258 | } | ||||||
3259 | 0 | return email_phrase_quote_maybe($display) . " <$email>"; | |||||
3260 | } | ||||||
3261 | |||||||
3262 | # return $str with quotes like "Foo Bar" if it needs them to go in an email | ||||||
3263 | # display part | ||||||
3264 | sub email_phrase_quote_maybe { | ||||||
3265 | 0 | 0 | 0 | my ($str) = @_; | |||
3266 | 0 | 0 | return if ! defined $str; | ||||
3267 | |||||||
3268 | # RFC2822 "atext" characters, with "-" last | ||||||
3269 | 0 | 0 | if ($str =~ m<[^[:alnum:][:space:]!#\$%&'*+/=?^_`{|}~-]>) { | ||||
3270 | # strange chars, need to quote | ||||||
3271 | 0 | return email_phrase_quote($str); | |||||
3272 | } else { | ||||||
3273 | # alphanumeric and whitespace, no quotes | ||||||
3274 | 0 | return $str; | |||||
3275 | } | ||||||
3276 | } | ||||||
3277 | sub email_phrase_quote { | ||||||
3278 | 0 | 0 | 0 | my ($str) = @_; | |||
3279 | 0 | 0 | return if ! defined $str; | ||||
3280 | 0 | $str =~ s/^"(.*)"$/$1/; # strip existing quotes | |||||
3281 | 0 | $str =~ s/(["\\])/\\$1/g; # escape internal quotes and backslashes | |||||
3282 | 0 | return "\"$str\""; | |||||
3283 | } | ||||||
3284 | |||||||
3285 | |||||||
3286 | #------------------------------------------------------------------------------ | ||||||
3287 | # rss_newest_only | ||||||
3288 | |||||||
3289 | { | ||||||
3290 | my %multiplier = (minute => 60, | ||||||
3291 | hour => 3600, | ||||||
3292 | day => 86400, | ||||||
3293 | week => 86400 * 7, | ||||||
3294 | month => 365.25 * 86400 / 12, | ||||||
3295 | year => 365.25 * 86400, | ||||||
3296 | ); | ||||||
3297 | # return a target time_t, or undef | ||||||
3298 | sub rss_newest_only_timet { | ||||||
3299 | 0 | 0 | 0 | my ($self) = @_; | |||
3300 | |||||||
3301 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3302 | 0 | 0 | if ($str =~ /^\s*(\d+)\s*(minute|hour|day|week|month|year)s?\s*$/) { | ||||
3303 | 0 | return time() - $1*$multiplier{$2}; | |||||
3304 | } | ||||||
3305 | } | ||||||
3306 | 0 | return undef; | |||||
3307 | } | ||||||
3308 | } | ||||||
3309 | |||||||
3310 | # return a number, or undef | ||||||
3311 | sub rss_newest_only_count { | ||||||
3312 | 0 | 0 | 0 | my ($self) = @_; | |||
3313 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3314 | 0 | 0 | if (Scalar::Util::looks_like_number($str)) { | ||||
3315 | ### rss_newest_only number: $str | ||||||
3316 | 0 | return $str; | |||||
3317 | } | ||||||
3318 | } | ||||||
3319 | 0 | return undef; | |||||
3320 | } | ||||||
3321 | |||||||
3322 | # return @items restricted or filtered by rss_newest_only | ||||||
3323 | sub rss_newest_only_items { | ||||||
3324 | 0 | 0 | 0 | my ($self, @items) = @_; | |||
3325 | |||||||
3326 | 0 | 0 | if (defined (my $count = $self->rss_newest_only_count)) { | ||||
3327 | 0 | 0 | if ($count == 0) { | ||||
3328 | # rss_newest_only=>0 means don't apply a newest | ||||||
3329 | 0 | return @items; | |||||
3330 | } | ||||||
3331 | 0 | my $before = scalar(@items); | |||||
3332 | 0 | require Sort::Key::Top; | |||||
3333 | 0 | 0 | @items = Sort::Key::Top::rnkeytop (sub { $self->item_to_timet($_) }, | ||||
3334 | 0 | $count, @items); | |||||
3335 | |||||||
3336 | 0 | my $after = scalar(@items); | |||||
3337 | 0 | 0 | if ($before != $after) { | ||||
3338 | 0 | $self->verbose (1, " rss_newest_only reduce by count from $before items to $after items"); | |||||
3339 | } | ||||||
3340 | 0 | return @items; | |||||
3341 | } | ||||||
3342 | |||||||
3343 | 0 | 0 | if (defined (my $target_timet = $self->rss_newest_only_timet)) { | ||||
3344 | 0 | my $before = scalar(@items); | |||||
3345 | 0 | @items = grep { my $got_timet = $self->item_to_timet($_); | |||||
0 | |||||||
3346 | 0 | 0 | ! defined $got_timet || $got_timet >= $target_timet } | ||||
3347 | @items; | ||||||
3348 | 0 | my $after = scalar(@items); | |||||
3349 | 0 | 0 | if ($before != $after) { | ||||
3350 | 0 | $self->verbose (1, " rss_newest_only reduce by age from $before to $after items"); | |||||
3351 | } | ||||||
3352 | 0 | return @items; | |||||
3353 | } | ||||||
3354 | |||||||
3355 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3356 | 0 | die "rss2leafnode: unrecognised rss_newest_only: ",$str; | |||||
3357 | } | ||||||
3358 | 0 | return @items; | |||||
3359 | } | ||||||
3360 | |||||||
3361 | |||||||
3362 | #------------------------------------------------------------------------------ | ||||||
3363 | # fetch RSS | ||||||
3364 | |||||||
3365 | my $map_xmlns | ||||||
3366 | = { | ||||||
3367 | 'http://purl.org/rss/1.0/' => 'rss', | ||||||
3368 | 'http://www.w3.org/2005/Atom' => 'atom', | ||||||
3369 | 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf', | ||||||
3370 | 'http://purl.org/rss/1.0/modules/content/' => 'content', | ||||||
3371 | 'http://purl.org/rss/1.0/modules/slash/' => 'slash', | ||||||
3372 | 'http://purl.org/rss/1.0/modules/syndication/' => 'syn', | ||||||
3373 | 'http://purl.org/syndication/thread/1.0' => 'thr', | ||||||
3374 | 'http://wellformedweb.org/CommentAPI/' => 'wfw', | ||||||
3375 | 'http://www.w3.org/1999/xhtml' => 'xhtml', | ||||||
3376 | 'http://www.itunes.com/dtds/podcast-1.0.dtd' => 'itunes', | ||||||
3377 | 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner', | ||||||
3378 | 'http://www.helex.gr/helex-schemas/xsd/CompanyDataAtomAttributes-v1.xsd' | ||||||
3379 | => 'hlxcd', | ||||||
3380 | |||||||
3381 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
3382 | 'http://prismstandard.org/namespaces/basic/2.0/' => 'prism', | ||||||
3383 | |||||||
3384 | # http://www.rssboard.org/media-rss | ||||||
3385 | 'http://search.yahoo.com/mrss' => 'media', | ||||||
3386 | |||||||
3387 | 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo', | ||||||
3388 | 'http://www.georss.org/georss' => 'georss', | ||||||
3389 | 'http://www.pheedo.com/namespace/pheedo' => 'pheedo', | ||||||
3390 | 'http://api.twitter.com' => 'twitter', | ||||||
3391 | 'http://xmlns.com/foaf/0.1/' => 'foaf', | ||||||
3392 | 'http://status.net/ont/' => 'statusnet', | ||||||
3393 | 'http://rdfs.org/sioc/ns#' => 'sioc', | ||||||
3394 | 'http://www.slate.com' => 'slate', | ||||||
3395 | 'http://activitystrea.ms/spec/1.0/' => 'activity', | ||||||
3396 | 'http://ostatus.org/schema/1.0' => 'ostatus', | ||||||
3397 | |||||||
3398 | # http://tools.ietf.org/html/draft-snell-atompub-feed-index-10 | ||||||
3399 | 'http://purl.org/atompub/rank/1.0' => 're', | ||||||
3400 | |||||||
3401 | # per http://docs.jivesoftware.com/latest/documentation/rss.html#output | ||||||
3402 | 'http://www.jivesoftware.com/xmlns/jiveforums/rss' => 'jf', | ||||||
3403 | |||||||
3404 | # these two are different, but treat the same for now | ||||||
3405 | 'http://backend.userland.com/creativeCommonsRssModule'=>'creativeCommons', | ||||||
3406 | 'http://creativecommons.org/ns#' =>'creativeCommons', | ||||||
3407 | |||||||
3408 | # Common Alerts Protocol | ||||||
3409 | 'urn:oasis:names:tc:emergency:cap:1.1' => 'cap', | ||||||
3410 | |||||||
3411 | # central bank exchange rates format, | ||||||
3412 | # spec http://www.cbwiki.net/wiki/index.php/RSS-CBMain | ||||||
3413 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
3414 | 'http://www.cbwiki.net/wiki/index.php/Specification_1.1' => 'cb', | ||||||
3415 | |||||||
3416 | # earthquakes | ||||||
3417 | # eg. http://earthquake.usgs.gov/earthquakes/shakemap/rss.xml | ||||||
3418 | 'http://earthquake.usgs.gov/rss/1.0/' => 'eq', | ||||||
3419 | |||||||
3420 | 'http://purl.org/dc/elements/1.1/' => 'dc', | ||||||
3421 | 'http://purl.org/dc/terms/' => 'dcterms', | ||||||
3422 | |||||||
3423 | # purl.org might be supposed to be the home for wiki:, but it's a 404 | ||||||
3424 | # and usemod.com suggests its page instead | ||||||
3425 | # Spec at http://www.meatballwiki.org/wiki/ModWiki | ||||||
3426 | 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki', | ||||||
3427 | 'http://www.usemod.com/cgi-bin/mb.pl?ModWiki' => 'wiki', | ||||||
3428 | |||||||
3429 | # not sure if this is supposed to be necessary, but without it | ||||||
3430 | # "xml:lang" attributes are turned into "lang" | ||||||
3431 | 'http://www.w3.org/XML/1998/namespace' => 'xml', | ||||||
3432 | }; | ||||||
3433 | |||||||
3434 | sub twig_parse { | ||||||
3435 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
3436 | ### twig_parse() ... | ||||||
3437 | |||||||
3438 | # default "discard_spaces" chucks leading and trailing space on content, | ||||||
3439 | # which is usually a good thing | ||||||
3440 | # | ||||||
3441 | 0 | require XML::Twig; | |||||
3442 | 0 | XML::Twig->VERSION('3.34'); # for att_exists() | |||||
3443 | 0 | my $twig = XML::Twig->new (map_xmlns => $map_xmlns, | |||||
3444 | pretty_print => 'wrapped'); | ||||||
3445 | 0 | $twig->safe_parse ($xml); | |||||
3446 | 0 | my $err = $@; | |||||
3447 | ### $err | ||||||
3448 | |||||||
3449 | # Try to fix bad non-ascii chars by putting it through Encode::from_to(). | ||||||
3450 | # Encode::FB_DEFAULT substitutes U+FFFD when going to unicode, or question | ||||||
3451 | # mark "?" going to non-unicode. Mozilla does some sort of similar | ||||||
3452 | # liberal byte interpretation so as to at least display something from a | ||||||
3453 | # dodgy feed. | ||||||
3454 | # | ||||||
3455 | 0 | 0 | 0 | if ($err && $err =~ /not well-formed \(invalid token\) at (line \d+, column \d+, byte (\d+))/) { | |||
3456 | 0 | my $where = $1; | |||||
3457 | 0 | my $byte = ord(substr($xml,$2,1)); | |||||
3458 | 0 | 0 | if ($byte >= 128) { | ||||
3459 | 0 | 0 | my $charset = $twig->encoding // 'utf-8'; | ||||
3460 | 0 | $self->verbose (1, sprintf ("parse error, attempt re-code $charset for byte 0x%02X\n", $byte)); | |||||
3461 | 0 | require Encode; | |||||
3462 | 0 | my $recoded_xml = $xml; | |||||
3463 | 0 | Encode::from_to($recoded_xml, $charset, $charset, Encode::FB_DEFAULT()); | |||||
3464 | |||||||
3465 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3466 | 0 | 0 | if ($twig->safe_parse ($recoded_xml)) { | ||||
3467 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3468 | "Recoded bad bytes to charset $charset"); | ||||||
3469 | 0 | print __x("Feed {url}\n recoded {charset} to parse, expect substitutions for bad non-ascii\n ({where})\n", | |||||
3470 | url => $self->{'uri'}, | ||||||
3471 | charset => $charset, | ||||||
3472 | where => $where); | ||||||
3473 | 0 | undef $err; | |||||
3474 | } | ||||||
3475 | } | ||||||
3476 | } | ||||||
3477 | |||||||
3478 | # Or attempt to put it through XML::Liberal, if available. | ||||||
3479 | # | ||||||
3480 | 0 | 0 | if ($err) { | ||||
3481 | 0 | my $liberal_xml = $self->xml_liberal_correction($xml); | |||||
3482 | 0 | 0 | if (defined $liberal_xml) { | ||||
3483 | ### reparse xml liberal fixup with twig ... | ||||||
3484 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3485 | 0 | 0 | if ($twig->safe_parse ($liberal_xml)) { | ||||
3486 | ### now ok ... | ||||||
3487 | 0 | $err = Text::Trim::trim($err); | |||||
3488 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3489 | "XML::Liberal fixed: {error}", | ||||||
3490 | error => $err); | ||||||
3491 | 0 | print __x("Feed {url}\n parse error: {error}\n continuing with repairs by XML::Liberal\n", | |||||
3492 | url => $self->{'uri'}, | ||||||
3493 | error => $err); | ||||||
3494 | 0 | undef $err; | |||||
3495 | } | ||||||
3496 | } | ||||||
3497 | ### now err: $err | ||||||
3498 | } | ||||||
3499 | |||||||
3500 | 0 | 0 | if ($err) { | ||||
3501 | # XML::Parser seems to stick some spurious leading whitespace on the error | ||||||
3502 | 0 | $err = Text::Trim::trim($err); | |||||
3503 | |||||||
3504 | 0 | $self->verbose (1, __x("Parse error on URL {url}\n{error}", | |||||
3505 | url => $self->{'uri'}, | ||||||
3506 | error => $err)); | ||||||
3507 | 0 | return (undef, $err); | |||||
3508 | } | ||||||
3509 | |||||||
3510 | # Strip any explicit "rss:" or "atom:" namespace down to bare part. | ||||||
3511 | # Should be unambiguous and is easier than giving tag names both with and | ||||||
3512 | # without the namespace. Undocumented set_ns_as_default() might do this | ||||||
3513 | # ... or might not. | ||||||
3514 | # | ||||||
3515 | 0 | my $root = $twig->root; | |||||
3516 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'atom'); | |||||
3517 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'rss'); | |||||
3518 | |||||||
3519 | # somehow map_xmlns mangles default attributes like "decimals=...", prefer | ||||||
3520 | # to see them without rss: or atom: -- maybe | ||||||
3521 | # foreach my $child ($root->descendants_or_self) { | ||||||
3522 | # foreach my $attname ($child->att_names) { | ||||||
3523 | # if ($attname =~ /^(atom|rss):(.*)/) { | ||||||
3524 | # $child->change_att_name($attname, $2); | ||||||
3525 | # } | ||||||
3526 | # } | ||||||
3527 | # } | ||||||
3528 | |||||||
3529 | ### add xml base | ||||||
3530 | 0 | 0 | 0 | if (defined $self->{'uri'} && ! $root->att_exists('xml:base')) { | |||
3531 | 0 | $root->set_att ('xml:base', $self->{'uri'}); | |||||
3532 | } | ||||||
3533 | |||||||
3534 | ### success | ||||||
3535 | 0 | return ($twig, undef); | |||||
3536 | } | ||||||
3537 | |||||||
3538 | sub item_to_channel { | ||||||
3539 | 0 | 0 | 0 | my ($item) = @_; | |||
3540 | # parent for RSS or Atom, but sibling "channel" for RDF | ||||||
3541 | 0 | my $channel = $item->parent; | |||||
3542 | 0 | 0 | return ($channel->first_child('channel') | ||||
3543 | // $channel); | ||||||
3544 | } | ||||||
3545 | |||||||
3546 | # return a Message-ID string for this $item coming from $self->{'uri'} | ||||||
3547 | # | ||||||
3548 | sub item_to_msgid { | ||||||
3549 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3550 | |||||||
3551 | 0 | 0 | if (is_non_empty (my $id = $item->first_child_text('id'))) { | ||||
3552 | # Atom |
||||||
3553 | 0 | return $self->url_to_msgid ($id, $item->first_child_text('updated')); | |||||
3554 | } | ||||||
3555 | |||||||
3556 | 0 | my $guid; | |||||
3557 | 0 | my $isPermaLink = 0; | |||||
3558 | 0 | 0 | if (my $elt = $item->first_child('guid')) { | ||||
3559 | # ignore empty |
||||||
3560 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
3561 | 0 | 0 | if (is_non_empty (my $str = collapse_whitespace ($elt->text))) { | ||||
3562 | 0 | $guid = $str; | |||||
3563 | 0 | 0 | $isPermaLink = (lc($elt->att('isPermaLink') // 'true') eq 'true'); | ||||
3564 | } | ||||||
3565 | } | ||||||
3566 | |||||||
3567 | 0 | 0 | if ($isPermaLink) { # |
||||
3568 | 0 | return $self->url_to_msgid ($guid); | |||||
3569 | } | ||||||
3570 | 0 | 0 | if (my $link = item_yahoo_permalink ($item)) { | ||||
3571 | 0 | return $self->url_to_msgid ($link); | |||||
3572 | } | ||||||
3573 | 0 | 0 | if (defined $guid) { # |
||||
3574 | 0 | return $self->url_to_msgid ($self->{'uri'}, $guid); | |||||
3575 | } | ||||||
3576 | |||||||
3577 | # nothing in the item, use the feed url and MD5 of some fields which | ||||||
3578 | # will hopefully distinguish it from other items at this url | ||||||
3579 | 0 | $self->verbose (2, ' msgid from MD5'); | |||||
3580 | 0 | return $self->url_to_msgid | |||||
3581 | ($self->{'uri'}, | ||||||
3582 | md5_of_utf8 (join_non_empty ('', | ||||||
3583 | 0 | map {$item->first_child_text($_)} | |||||
3584 | qw(title | ||||||
3585 | author | ||||||
3586 | dc:creator | ||||||
3587 | description | ||||||
3588 | content | ||||||
3589 | link | ||||||
3590 | pubDate | ||||||
3591 | published | ||||||
3592 | updated | ||||||
3593 | )))); | ||||||
3594 | } | ||||||
3595 | # FIXME: is |
||||||
3596 | # |
||||||
3597 | @known{qw(/channel/item/guid | ||||||
3598 | /channel/item/id | ||||||
3599 | /channel/item/wordzilla:id | ||||||
3600 | /channel/item/slate:id | ||||||
3601 | )} = (); | ||||||
3602 | |||||||
3603 | # Return an "In-Reply-To:" value for $item, being a space-separated list of | ||||||
3604 | # Message-ID strings including angles <>, or undef if nothing. The message | ||||||
3605 | # ids match up to an Atom |
||||||
3606 | # | ||||||
3607 | # RFC 4685 has |
||||||
3608 | # reply to multiple originals. | ||||||
3609 | # | ||||||
3610 | # Eg. comment feeds under | ||||||
3611 | # http://wickedgooddinner.blogspot.com/feeds/posts/default | ||||||
3612 | # | ||||||
3613 | sub item_to_in_reply_to { | ||||||
3614 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3615 | |||||||
3616 | 0 | my @ids; | |||||
3617 | 0 | foreach my $elt ($item->children('thr:in-reply-to')) { | |||||
3618 | 0 | 0 | my $ref = ($elt->att('thr:ref') | ||||
0 | |||||||
0 | |||||||
3619 | // $elt->att('ref') | ||||||
3620 | // $elt->att('atom:ref') # comes out atom: under map_xmlns ... | ||||||
3621 | // next); | ||||||
3622 | # probably shouldn't be relative actually ... | ||||||
3623 | 0 | $ref = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $ref); | |||||
3624 | 0 | push @ids, $self->url_to_msgid ($ref); | |||||
3625 | } | ||||||
3626 | 0 | 0 | if (@ids) { | ||||
3627 | 0 | return join (' ', @ids); | |||||
3628 | } else { | ||||||
3629 | 0 | return undef; | |||||
3630 | } | ||||||
3631 | } | ||||||
3632 | @known{qw(/channel/item/thr:in-reply-to | ||||||
3633 | )} = (); | ||||||
3634 | |||||||
3635 | # Return a string of comma separated keywords per RFC1036 and RFC2822. | ||||||
3636 | # | ||||||
3637 | # RSS |
||||||
3638 | # that in as a bit of a fallback, being better than nothing for | ||||||
3639 | # classification. | ||||||
3640 | # | ||||||
3641 | # Atom |
||||||
3642 | # attribute being the displayable part. Have seen only the "term" attribute | ||||||
3643 | # though. | ||||||
3644 | # | ||||||
3645 | # |
||||||
3646 | # it in for more classification for now. Can have child |
||||||
3647 | # elements as sub-categories, but don't worry about them, haven't seen any | ||||||
3648 | # real ones, only the sample at | ||||||
3649 | # http://www.apple.com/itunes/podcasts/specs.html#example | ||||||
3650 | # | ||||||
3651 | # |
||||||
3652 | # |
||||||
3653 | # particularly informative. | ||||||
3654 | # | ||||||
3655 | # |
||||||
3656 | # should be in the keywords if it's also in the body text, but at least | ||||||
3657 | # offers a bit of classification in the headers. | ||||||
3658 | # | ||||||
3659 | # |
||||||
3660 | # want a bit of decoding. Not much used, but for instance | ||||||
3661 | # http://www.gdacs.org/xml/RSSTC.xml | ||||||
3662 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
3663 | # | ||||||
3664 | # How much value is there in the channel keywords? | ||||||
3665 | # | ||||||
3666 | { | ||||||
3667 | my $re = qr/^(category | ||||||
3668 | |itunes:category | ||||||
3669 | |cap:category | ||||||
3670 | |itunes:keywords | ||||||
3671 | |media:keywords | ||||||
3672 | |dc:subject | ||||||
3673 | |slash:section | ||||||
3674 | |slate:topic | ||||||
3675 | )$/x; | ||||||
3676 | sub item_to_keywords { | ||||||
3677 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3678 | 0 | my $channel = item_to_channel($item); | |||||
3679 | |||||||
3680 | 0 | return join_non_empty | |||||
3681 | (', ', | ||||||
3682 | List::MoreUtils::uniq | ||||||
3683 | 0 | (map { collapse_whitespace($_) } | |||||
3684 | 0 | 0 | map { split /,/ } | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
3685 | 0 | map { ($_->att('text') # itunes:category | |||||
3686 | // $_->att('itunes:text') # itunes:category | ||||||
3687 | // $_->att('atom:label') # atom |
||||||
3688 | // $_->att('label') # atom |
||||||
3689 | // $_->att('atom:term') # atom |
||||||
3690 | // $_->att('term') # atom |
||||||
3691 | // $_->text) } # other | ||||||
3692 | ($item->children($re), | ||||||
3693 | $channel->children($re), | ||||||
3694 | # |
||||||
3695 | 0 | map {$_->children('cb:keyword')} $item->children, | |||||
3696 | ))); | ||||||
3697 | } | ||||||
3698 | # maybe could show |
||||||
3699 | # too, for now just omit | ||||||
3700 | @known{qw(/channel/category | ||||||
3701 | /channel/itunes:category | ||||||
3702 | /channel/itunes:category/itunes:category | ||||||
3703 | |||||||
3704 | /channel/item/category | ||||||
3705 | /channel/item/itunes:keywords | ||||||
3706 | /channel/item/media:keywords | ||||||
3707 | /channel/item/slash:section | ||||||
3708 | /channel/item/slate:topic | ||||||
3709 | /channel/item/slate:section | ||||||
3710 | )} = (); | ||||||
3711 | } | ||||||
3712 | |||||||
3713 | { | ||||||
3714 | # Feturn a string for the "Importance:" header of RFC 1911, RFC 2156 | ||||||
3715 | # voice and X.400 messaging. Possible values 'high', 'normal', 'low'. | ||||||
3716 | # 'normal' is the header default, return undef in that case in the | ||||||
3717 | # interests of not junking up headers with defaults | ||||||
3718 | # | ||||||
3719 | my %cap_severity_high = (extreme => 1, | ||||||
3720 | severe => 1); | ||||||
3721 | my %cap_severity_normal = (moderate => 1); | ||||||
3722 | my %cap_severity_low = (minor => 1); | ||||||
3723 | |||||||
3724 | sub item_to_importance { | ||||||
3725 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3726 | |||||||
3727 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3728 | // ''); | ||||||
3729 | 0 | 0 | my $wiki_importance = ($item->first_child_trimmed_text('wiki:importance') | ||||
3730 | // ''); | ||||||
3731 | 0 | 0 | if ($cap_severity) { | ||||
3732 | 0 | $self->verbose (2, " CAP severity: ",$cap_severity); | |||||
3733 | 0 | $self->verbose (2, " Wiki importance: ",$wiki_importance); | |||||
3734 | } | ||||||
3735 | |||||||
3736 | 0 | 0 | if ($cap_severity_high{$cap_severity}) { | ||||
3737 | 0 | return 'high'; | |||||
3738 | } | ||||||
3739 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3740 | 0 | return undef; # default "normal" | |||||
3741 | } | ||||||
3742 | 0 | 0 | 0 | if ($cap_severity_low{$cap_severity} | |||
3743 | || $wiki_importance eq 'minor') { | ||||||
3744 | 0 | return 'low'; | |||||
3745 | } | ||||||
3746 | 0 | return undef; # unknown | |||||
3747 | } | ||||||
3748 | @known{qw(/channel/item/wiki:importance | ||||||
3749 | )} = (); | ||||||
3750 | } | ||||||
3751 | { | ||||||
3752 | # Return a string for the "Priority:" header of RFC 1327, RFC 2156. | ||||||
3753 | # Possible values 'urgent', 'normal', 'non-urgent'. | ||||||
3754 | # 'normal' is the header default, return undef in that case in the | ||||||
3755 | # interests of not junking up headers with defaults | ||||||
3756 | # | ||||||
3757 | # |
||||||
3758 | # for when response action should be taken. Is the |
||||||
3759 | # better indicator of transmission priority? | ||||||
3760 | # | ||||||
3761 | my %cap_severity_urgent = (extreme => 1, | ||||||
3762 | severe => 1); | ||||||
3763 | my %cap_severity_normal = (moderate => 1); | ||||||
3764 | |||||||
3765 | sub item_to_priority { | ||||||
3766 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3767 | |||||||
3768 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3769 | // ''); | ||||||
3770 | |||||||
3771 | 0 | 0 | if ($cap_severity_urgent{$cap_severity}) { | ||||
3772 | 0 | return 'urgent'; | |||||
3773 | } | ||||||
3774 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3775 | 0 | return undef; # default "normal" | |||||
3776 | } | ||||||
3777 | 0 | if (0) { # nothing for this yet | |||||
3778 | return 'non-urgent'; | ||||||
3779 | } | ||||||
3780 | 0 | return undef; # unknown | |||||
3781 | } | ||||||
3782 | } | ||||||
3783 | |||||||
3784 | # return a string for the slightly unofficial "Precedence:" header | ||||||
3785 | # might be able to identify lists gatewayed to RSS and give "list" for them | ||||||
3786 | # maybe "bulk" would suit low priority stuff | ||||||
3787 | # for now nothing | ||||||
3788 | # | ||||||
3789 | # sub item_to_precedence { | ||||||
3790 | # my ($self, $item) = @_; | ||||||
3791 | # return undef; # nothing | ||||||
3792 | # } | ||||||
3793 | |||||||
3794 | # return the host part of $self->{'uri'}, or "localhost" if none | ||||||
3795 | sub uri_to_host { | ||||||
3796 | 0 | 0 | 0 | my ($self) = @_; | |||
3797 | 0 | my $uri = $self->{'uri'}; | |||||
3798 | ### uri_to_host(): $uri | ||||||
3799 | 0 | 0 | return (non_empty ($uri && $uri->can('host') && $uri->host) | ||||
0 | |||||||
3800 | // 'localhost'); | ||||||
3801 | } | ||||||
3802 | |||||||
3803 | sub item_to_subject { | ||||||
3804 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3805 | |||||||
3806 | # Atom |
||||||
3807 | return | ||||||
3808 | 0 | 0 | (elt_to_rendered_line ($item->first_child('title')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
3809 | |||||||
3810 | # |
||||||
3811 | # present without a plain |
||||||
3812 | # | ||||||
3813 | // elt_to_rendered_line ($item->first_child('dc:title')) | ||||||
3814 | |||||||
3815 | # eg. https://archive.org/services/collection-rss.php has |
||||||
3816 | # in addition to plain |
||||||
3817 | # |
||||||
3818 | # | ||||||
3819 | // elt_to_rendered_line ($item->first_child('media:title')) | ||||||
3820 | |||||||
3821 | # |
||||||
3822 | # better than nothing. Not sure have ever actually seen |
||||||
3823 | # without |
||||||
3824 | # | ||||||
3825 | // elt_to_rendered_line ($item->first_child('dc:subject')) | ||||||
3826 | |||||||
3827 | // __('no subject')); | ||||||
3828 | } | ||||||
3829 | @known{qw(/channel/title | ||||||
3830 | /channel/dc:subject | ||||||
3831 | /channel/subtitle | ||||||
3832 | /channel/itunes:subtitle | ||||||
3833 | |||||||
3834 | /channel/item/dc:subject | ||||||
3835 | /channel/item/title | ||||||
3836 | /channel/item/media:title | ||||||
3837 | /channel/item/dc:title | ||||||
3838 | /channel/item/itunes:title | ||||||
3839 | /channel/item/itunes:subtitle --not-using-this-as-yet | ||||||
3840 | /channel/item/slate:menuline --copy-of-subject-it-seems | ||||||
3841 | /channel/item/slate:rubric --blog-title | ||||||
3842 | /channel/item/slate:blog --blog-title | ||||||
3843 | /channel/item/slate:legacy_url --same-as-link-it-seems | ||||||
3844 | )} = (); | ||||||
3845 | |||||||
3846 | |||||||
3847 | # return language code string for Content-Language, or undef | ||||||
3848 | # return is per RFC 1766, RFC 3066, RFC 4646 | ||||||
3849 | # | ||||||
3850 | # xml:lang is defined to be per RFC 4646, no mangling needed | ||||||
3851 | # RSS |
||||||
3852 | # |
||||||
3853 | # cf. I18N::LangTags if mangling might be needed one day | ||||||
3854 | # | ||||||
3855 | sub item_to_language { | ||||||
3856 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3857 | 0 | my $lang; | |||||
3858 | |||||||
3859 | 0 | 0 | if (my $elt = $item->first_child('content')) { | ||||
3860 | 0 | $lang = non_empty ($elt->att('xml:lang')); | |||||
3861 | } | ||||||
3862 | # Either |
||||||
3863 | # xml:lang="" tag, in the item itself or in channel, and maybe xml:lang in | ||||||
3864 | # toplevel |
||||||
3865 | # xml:lang, not a |
||||||
3866 | 0 | for ( ; $item; $item = $item->parent) { | |||||
3867 | 0 | 0 | $lang //= (non_empty ($item->first_child_trimmed_text | ||||
0 | |||||||
0 | |||||||
3868 | (qr/^((dc:)?language|twitter:lang)$/)) | ||||||
3869 | // non_empty ($item->att('xml:lang')) | ||||||
3870 | // next); | ||||||
3871 | } | ||||||
3872 | 0 | 0 | return ($lang // $self->{'resp'}->content_language); | ||||
3873 | } | ||||||
3874 | @known{qw(/channel/language | ||||||
3875 | /channel/dc:language | ||||||
3876 | /channel/twitter:lang | ||||||
3877 | /channel/item/language | ||||||
3878 | /channel/item/dc:language | ||||||
3879 | /channel/item/twitter:lang | ||||||
3880 | )} = (); | ||||||
3881 | |||||||
3882 | # return arrayref of copyright strings | ||||||
3883 | # Keep all of multiple rights/license/etc in the interests of preserving all | ||||||
3884 | # statements. | ||||||
3885 | sub item_to_copyright { | ||||||
3886 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3887 | 0 | my $channel = item_to_channel($item); | |||||
3888 | |||||||
3889 | # |
||||||
3890 | # suppress the latter in the presence of the former (dcterms: collapsed to | ||||||
3891 | # dc: by the map_xmlns). | ||||||
3892 | # | ||||||
3893 | # Atom |
||||||
3894 | # always plain text | ||||||
3895 | # | ||||||
3896 | 0 | my $re = qr/^(rights # Atom | |||||
3897 | |copyright # RSS, don't think entity-encoded html allowed there | ||||||
3898 | |dcterms:license | ||||||
3899 | |dc:rights | ||||||
3900 | |creativeCommons:licen[cs]e | ||||||
3901 | )$/x; | ||||||
3902 | # Atom sub-elem |
||||||
3903 | 0 | my @parents = ($item, $channel, $item->children('source')); | |||||
3904 | |||||||
3905 | 0 | my @strings; | |||||
3906 | 0 | foreach my $elt (map {$_->children($re)} @parents) { | |||||
0 | |||||||
3907 | 0 | push @strings, | |||||
3908 | join_non_empty(' ', | ||||||
3909 | elt_to_rendered_line($elt), | ||||||
3910 | # eg. |
||||||
3911 | $elt->att('rdf:resource')); | ||||||
3912 | } | ||||||
3913 | |||||||
3914 | # | ||||||
3915 | 0 | foreach my $link (map {$_->children('link')} @parents) { | |||||
0 | |||||||
3916 | ### link for copyright: $link->sprint | ||||||
3917 | 0 | 0 | 0 | if (($link->att('atom:rel')//$link->att('rel')//'') eq 'license') { | |||
0 | |||||||
3918 | 0 | 0 | push @strings, $link->att('atom:href')//$link->att('href'); | ||||
3919 | } | ||||||
3920 | } | ||||||
3921 | ### @strings | ||||||
3922 | 0 | return [ List::MoreUtils::uniq(grep {defined} @strings) ]; | |||||
0 | |||||||
3923 | } | ||||||
3924 | @known{qw(/channel/copyright | ||||||
3925 | /channel/rights | ||||||
3926 | /channel/dc:rights | ||||||
3927 | /channel/dc:license | ||||||
3928 | /channel/creativeCommons:licence | ||||||
3929 | /channel/creativeCommons:license | ||||||
3930 | /channel/item/dc:rights | ||||||
3931 | /channel/item/dc:license | ||||||
3932 | /channel/item/creativeCommons:licence | ||||||
3933 | /channel/item/creativeCommons:license | ||||||
3934 | )} = (); | ||||||
3935 | # /channel/item/media:credit --nothing-much-in-this-one | ||||||
3936 | |||||||
3937 | |||||||
3938 | # return string or undef | ||||||
3939 | sub item_to_generator { | ||||||
3940 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3941 | 0 | my $channel = item_to_channel($item); | |||||
3942 | 0 | my @strings; | |||||
3943 | |||||||
3944 | # both RSS and Atom use |
||||||
3945 | # Atom can include version="" and uri="" | ||||||
3946 | 0 | 0 | if (my $generator = $channel->first_child('generator')) { | ||||
3947 | 0 | push @strings, join_non_empty (' ', | |||||
3948 | $generator->text, | ||||||
3949 | $generator->att('atom:version'), | ||||||
3950 | $generator->att('version'), | ||||||
3951 | $generator->att('atom:uri'), | ||||||
3952 | $generator->att('uri')); | ||||||
3953 | } | ||||||
3954 | |||||||
3955 | # FIXME: is this bit right? | ||||||
3956 | # | ||||||
3957 | # source="<a href="http://nongnu.org/identica-mode/" rel="nofollow">Emacs Identica-mode</a>" | ||||||
3958 | # source_link="http://nongnu.org/identica-mode/"> | ||||||
3959 | # | ||||||
3960 | 0 | 0 | if (my $notice = $item->first_child('statusnet:notice_info')) { | ||||
3961 | 0 | 0 | if (defined (my $html = $notice->att('atom:source'))) { | ||||
3962 | 0 | push @strings, join_non_empty (' ', | |||||
3963 | html_to_rendered_line($html), | ||||||
3964 | $notice->att('atom:source_link')); | ||||||
3965 | } | ||||||
3966 | } | ||||||
3967 | |||||||
3968 | 0 | return collapse_whitespace (join_non_empty (', ', @strings)); | |||||
3969 | } | ||||||
3970 | @known{qw(/channel/item/statusnet:notice_info | ||||||
3971 | )} = (); | ||||||
3972 | |||||||
3973 | # return URL string or undef/empty | ||||||
3974 | sub item_to_feedburner { | ||||||
3975 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3976 | 0 | my $channel = item_to_channel($item); | |||||
3977 | 0 | 0 | my $elt = $channel->first_child('feedburner:info') || return; | ||||
3978 | 0 | 0 | my $uri = $elt->att('uri') // return; | ||||
3979 | 0 | return URI->new_abs ($uri, 'http://feeds.feedburner.com/')->as_string; | |||||
3980 | } | ||||||
3981 | |||||||
3982 | # $elt is an Atom |
||||||
3983 | sub atom_content_flavour { | ||||||
3984 | 0 | 0 | 0 | my ($elt) = @_; | |||
3985 | 0 | 0 | if (! defined $elt) { return ''; } | ||||
0 | |||||||
3986 | 0 | 0 | my $type = ($elt->att('atom:type') // $elt->att('type')); | ||||
3987 | 0 | 0 | 0 | if ($elt->att('atom:src') || $elt->att('src')) { | |||
3988 | # |
||||||
3989 | 0 | return 'link'; | |||||
3990 | } | ||||||
3991 | 0 | 0 | 0 | if (! defined $type | |||
0 | |||||||
0 | |||||||
0 | |||||||
3992 | || $type eq 'html' | ||||||
3993 | || $type eq 'xhtml' | ||||||
3994 | || $type eq 'application/xhtml+xml' | ||||||
3995 | || $type =~ m{^text/}) { | ||||||
3996 | 0 | return 'body'; | |||||
3997 | } | ||||||
3998 | 0 | return 'attach'; | |||||
3999 | } | ||||||
4000 | |||||||
4001 | sub html_wrap_fragment { | ||||||
4002 | 0 | 0 | 0 | my ($item, $fragment, $language) = @_; | |||
4003 | 0 | 0 | my $charset = (is_ascii($fragment) ? 'us-ascii' : 'utf-8'); | ||||
4004 | 0 | my $base_uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base($item); | |||||
4005 | 0 | 0 | my $base_header = (defined $base_uri | ||||
4006 | ? " |
||||||
4007 | : ''); | ||||||
4008 | 0 | 0 | if (is_non_empty ($language)) { | ||||
4009 | 0 | $language = " lang=\"$Entitize{$language}\""; | |||||
4010 | } else { | ||||||
4011 | 0 | $language = ''; | |||||
4012 | } | ||||||
4013 | 0 | return (<<"HERE", $charset); | |||||
4014 | |||||||
4015 | |||||||
4016 | |||||||
4017 | |||||||
4018 | $base_header | ||||||
4019 | |||||||
4020 | $fragment | ||||||
4021 | |||||||
4022 | HERE | ||||||
4023 | } | ||||||
4024 | |||||||
4025 | # $self->{'rss_charset_override'}, if set, means the bytes are actually in | ||||||
4026 | # that charset. Enforce this by replacing the " | ||||||
4027 | # bytes. Do a decode() and re-encode() to cope with non-ascii like say | ||||||
4028 | # utf-16. | ||||||
4029 | # | ||||||
4030 | # XML::RSS::LibXML has an "encoding" option on its new(), but that's for | ||||||
4031 | # feed creation or something, a parse() still follows the tag. | ||||||
4032 | # | ||||||
4033 | sub enforce_rss_charset_override { | ||||||
4034 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
4035 | 0 | 0 | if (my $charset = $self->{'rss_charset_override'}) { | ||||
4036 | 0 | $xml = Encode::decode ($charset, $xml); | |||||
4037 | 0 | 0 | if ($xml =~ s/(<\?xml[^>]*encoding="?)([^">]+)/$1$charset/i) { | ||||
0 | |||||||
4038 | 0 | $self->verbose (2, "replace encoding=$2 tag with encoding=$charset"); | |||||
4039 | } elsif ($xml =~ s/(<\?xml[^?>]*)/$1 encoding="$charset"/i) { | ||||||
4040 | 0 | $self->verbose (2, "insert encoding=\"$charset\""); | |||||
4041 | } else { | ||||||
4042 | 0 | my $str = "\n"; | |||||
4043 | 0 | $self->verbose (2, "insert $str"); | |||||
4044 | 0 | $xml = $str . $xml; | |||||
4045 | } | ||||||
4046 | 0 | $self->verbose (3, "xml now:\n$xml\n"); | |||||
4047 | 0 | $xml = Encode::encode ($charset, $xml); | |||||
4048 | } | ||||||
4049 | 0 | return $xml; | |||||
4050 | } | ||||||
4051 | |||||||
4052 | # slightly experimental extract of "cap" fields as from | ||||||
4053 | # http://www.nws.noaa.gov/alerts-beta/ | ||||||
4054 | # http://www.weather.gov/alerts-beta/ca.php?x=0 | ||||||
4055 | sub item_common_alert_protocol { | ||||||
4056 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
4057 | 0 | my @fields; | |||||
4058 | 0 | foreach my $elt ($item->children(qr/^cap:/)) { | |||||
4059 | 0 | (my $field = $elt->name) =~ s/^cap://; | |||||
4060 | 0 | 0 | 0 | if ($field eq 'geocode' || $field eq 'parameter') { | |||
4061 | # dunno how to show these yet ... | ||||||
4062 | 0 | next; | |||||
4063 | } | ||||||
4064 | 0 | $known{'/channel/item/'.$elt->name} = undef; | |||||
4065 | |||||||
4066 | 0 | my $value = elt_to_rendered_line ($elt); | |||||
4067 | 0 | $value = Text::Trim::trim ($value); | |||||
4068 | 0 | 0 | if (is_non_empty ($value)) { | ||||
4069 | 0 | push @fields, [ "\u$field: ", $value ]; | |||||
4070 | } | ||||||
4071 | } | ||||||
4072 | 0 | 0 | if (! @fields) { | ||||
4073 | 0 | return ''; | |||||
4074 | } | ||||||
4075 | # FIXME: This $width padding doesn't come out in html, only in text. The | ||||||
4076 | # NOAA is Atom plain text, so that one is ok at least. | ||||||
4077 | 0 | my $width = max(map {length $_->[0]} @fields); | |||||
0 | |||||||
4078 | 0 | @fields = map { my $field = $_->[0]; | |||||
0 | |||||||
4079 | 0 | my $value = $_->[1]; | |||||
4080 | 0 | $field = sprintf ('%-*s', $width, $field); | |||||
4081 | 0 | $self->text_wrap ($value, $field) | |||||
4082 | } @fields; | ||||||
4083 | 0 | 0 | if ($want_html) { | ||||
4084 | 0 | return " \n" |
|||||
4085 | 0 | . join(" \n", map {$Entitize{$_}} @fields) |
|||||
4086 | . "\n\n"; | ||||||
4087 | } else { | ||||||
4088 | 0 | return "\n" | |||||
4089 | . join("\n", @fields) | ||||||
4090 | . "\n"; | ||||||
4091 | } | ||||||
4092 | } | ||||||
4093 | |||||||
4094 | sub item_unknowns { | ||||||
4095 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
4096 | ### item_unknowns() ... | ||||||
4097 | |||||||
4098 | 0 | my $xml = ''; | |||||
4099 | 0 | 0 | foreach my $elt (map {$_->tag eq 'media:group' # descend into media:group | ||||
0 | |||||||
4100 | ? $_->children : $_} | ||||||
4101 | $item->children) { | ||||||
4102 | 0 | 0 | next if $elt->tag =~ /^#/; # text | ||||
4103 | 0 | 0 | next if App::RSS2Leafnode::XML::Twig::Other::elt_is_empty($elt); | ||||
4104 | 0 | my $path = $elt->path; | |||||
4105 | 0 | $path =~ s{^/(rss|channel)/channel}{/channel}; | |||||
4106 | 0 | $path =~ s{^/(feed|rdf:RDF)}{/channel}; | |||||
4107 | 0 | $path =~ s{^/channel/entry}{/channel/item}; | |||||
4108 | 0 | 0 | next if $path =~ m{/xhtml}; | ||||
4109 | 0 | 0 | next if $path =~ m{^/channel/item/(description|content:encoded)/}; | ||||
4110 | 0 | 0 | next if exists $known{$path}; | ||||
4111 | ### unknown path: $path | ||||||
4112 | |||||||
4113 | 0 | require Text::Wrap; | |||||
4114 | 0 | my $part = do { | |||||
4115 | 0 | local $Text::Wrap::columns = $self->{'render_width'} + 1 + 4; | |||||
4116 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long words | |||||
4117 | 0 | local $Text::Wrap::unexpand = 0; # no tabs in output | |||||
4118 | 0 | $elt->sprint | |||||
4119 | }; | ||||||
4120 | 0 | $part =~ s/^ //mg; # indentation from element depth | |||||
4121 | 0 | $part =~ s/^\n+//; # leading blank lines | |||||
4122 | 0 | $xml .= $part; | |||||
4123 | } | ||||||
4124 | 0 | 0 | if ($xml eq '') { | ||||
4125 | 0 | return ''; | |||||
4126 | } | ||||||
4127 | ### $xml | ||||||
4128 | |||||||
4129 | 0 | 0 | if ($want_html) { | ||||
4130 | 0 | return "\n \n" . __('Further feed XML:') . " |
|||||
4131 | . "$Entitize{$xml}\n\n"; |
||||||
4132 | } else { | ||||||
4133 | 0 | return "\n" . __('Further feed XML:') . "\n" . $xml; | |||||
4134 | } | ||||||
4135 | } | ||||||
4136 | |||||||
4137 | @known{qw(/channel/item/media:group/media:title | ||||||
4138 | /channel/item/media:group/media:description | ||||||
4139 | /channel/item/media:group/media:credit | ||||||
4140 | /channel/item/media:group/media:player | ||||||
4141 | /channel/item/media:group/media:thumbnail | ||||||
4142 | /channel/item/media:group/media:content | ||||||
4143 | /channel/item/media:group/media:copyright | ||||||
4144 | |||||||
4145 | --ENHANCE-ME--nothing-for-these-yet | ||||||
4146 | /channel/item/media:group/media:category | ||||||
4147 | /channel/item/media:group/media:rating | ||||||
4148 | )} = (); # hash slice | ||||||
4149 | |||||||
4150 | sub media_group_to_html { | ||||||
4151 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
4152 | ### media_group_to_html(): "$group" | ||||||
4153 | |||||||
4154 | 0 | my $ret = " \n"; |
|||||
4155 | 0 | my @lines; | |||||
4156 | |||||||
4157 | 0 | foreach my $elt ($group->children('media:title'), | |||||
4158 | $group->children('media:description')) { | ||||||
4159 | 0 | push @lines, elt_to_html($elt); | |||||
4160 | } | ||||||
4161 | |||||||
4162 | 0 | foreach my $elt ($group->children('media:credit')) { | |||||
4163 | 0 | my $html = elt_to_html($elt); | |||||
4164 | 0 | 0 | if (defined (my $role = non_empty($elt->att('role')))) { | ||||
4165 | 0 | $html .= " ($Entitize{$role})"; | |||||
4166 | } | ||||||
4167 | 0 | push @lines, $html; | |||||
4168 | } | ||||||
4169 | 0 | foreach my $elt ($group->children('media:player'), | |||||
4170 | $group->children('media:thumbnail'), | ||||||
4171 | $group->children('media:content')) { | ||||||
4172 | 0 | 0 | my $url = $elt->att('url') // next; | ||||
4173 | 0 | my $abs_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri | |||||
4174 | ($group, $url); | ||||||
4175 | |||||||
4176 | 0 | my $html = " | |||||
4177 | 0 | 0 | if (defined (my $type = non_empty($elt->att('type')))) { | ||||
4178 | 0 | $html .= " type=\"$Entitize{$type}\""; | |||||
4179 | } | ||||||
4180 | 0 | 0 | if (defined (my $lang = non_empty($elt->att('lang')))) { | ||||
4181 | 0 | $html .= " hreflang=\"$Entitize{$lang}\""; | |||||
4182 | } | ||||||
4183 | 0 | $html .= ">$Entitize{$url}$url"; | |||||
4184 | { | ||||||
4185 | 0 | my @paren; | |||||
0 | |||||||
4186 | 0 | 0 | if (defined (my $size = non_empty($elt->att('fileSize')))) { | ||||
4187 | 0 | push @paren, $self->format_size_in_bytes($size); | |||||
4188 | } | ||||||
4189 | 0 | 0 | if (defined (my $duration = non_empty($elt->att('duration')))) { | ||||
4190 | 0 | 0 | if ($duration !~ /:/) { | ||||
4191 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
4192 | duration => $duration); | ||||||
4193 | } | ||||||
4194 | 0 | push @paren, $duration; | |||||
4195 | } | ||||||
4196 | 0 | 0 | if (@paren) { | ||||
4197 | 0 | $html .= $Entitize{' (' . join(', ',@paren). ')'}; | |||||
4198 | } | ||||||
4199 | } | ||||||
4200 | 0 | $html .= "\n"; | |||||
4201 | 0 | push @lines, $html; | |||||
4202 | } | ||||||
4203 | |||||||
4204 | 0 | foreach my $elt ($group->children('media:copyright')) { | |||||
4205 | 0 | push @lines, "Copyright: ".elt_to_html($elt); | |||||
4206 | } | ||||||
4207 | |||||||
4208 | ### total lines: scalar(@lines) | ||||||
4209 | 0 | return " \n" . join(" |
|||||
4210 | } | ||||||
4211 | |||||||
4212 | sub elt_to_html { | ||||||
4213 | 0 | 0 | 0 | my ($elt) = @_; | |||
4214 | 0 | 0 | defined $elt or return; | ||||
4215 | |||||||
4216 | 0 | my $type = elt_content_type ($elt); | |||||
4217 | 0 | 0 | if ($type eq 'xhtml') { | ||||
4218 | 0 | return elt_xhtml_to_html($elt); | |||||
4219 | } | ||||||
4220 | 0 | my $str = elt_subtext($elt); | |||||
4221 | 0 | 0 | if ($type eq 'html') { | ||||
4222 | 0 | return $str; | |||||
4223 | } else { | ||||||
4224 | 0 | return $Entitize{$str}; | |||||
4225 | } | ||||||
4226 | } | ||||||
4227 | |||||||
4228 | # $body construction below | ||||||
4229 | @known{qw(/channel/item/description | ||||||
4230 | /channel/item/dc:description | ||||||
4231 | /channel/item/itunes:summary | ||||||
4232 | /channel/item/content:encoded | ||||||
4233 | /channel/item/summary | ||||||
4234 | )} = (); | ||||||
4235 | |||||||
4236 | # $item is an XML::Twig::Elt | ||||||
4237 | # | ||||||
4238 | sub fetch_rss_process_one_item { | ||||||
4239 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4240 | 0 | my $subject = $self->item_to_subject ($item); | |||||
4241 | 0 | $self->verbose (1, ' ', __x('item: {subject}', subject => $subject)); | |||||
4242 | |||||||
4243 | 0 | my $msgid = $self->item_to_msgid ($item); | |||||
4244 | 0 | my $new = 0; | |||||
4245 | |||||||
4246 | 0 | 0 | if (! $self->nntp_message_id_exists ($msgid)) { | ||||
4247 | 0 | my $channel = item_to_channel($item); | |||||
4248 | 0 | my ($from, $sender, @from_links) = $self->item_to_from($item); | |||||
4249 | 0 | my @links = ($self->item_to_links ($item), | |||||
4250 | @from_links); | ||||||
4251 | |||||||
4252 | # For comments feeds show "Re: Foo" as the subject. Haven't seen a | ||||||
4253 | # comments feed with anything useful in the |
||||||
4254 | # including it at the start of the message body if it was any good. | ||||||
4255 | # | ||||||
4256 | # | ||||||
4257 | # http://www.netzpolitik.org/feed/ has |
||||||
4258 | # just "Von: Foo" where Foo is the poster's name. | ||||||
4259 | # | ||||||
4260 | # my $dummy = $self->DUMMY_EMAIL_ADDRESS; | ||||||
4261 | # if ($from =~ /(.*) <\Q$dummy\E>$/ | ||||||
4262 | # && $subject eq "Von: $1") { | ||||||
4263 | # $subject = $self->{'getting_rss_comments'}; | ||||||
4264 | # } | ||||||
4265 | # | ||||||
4266 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4267 | 0 | $subject = $self->{'getting_rss_comments'}; | |||||
4268 | } | ||||||
4269 | |||||||
4270 | 0 | my $list_post = googlegroups_link_email(@links); | |||||
4271 | 0 | 0 | my $precedence = (defined $list_post ? 'list' : undef); | ||||
4272 | 0 | my $language = $self->item_to_language($item); | |||||
4273 | |||||||
4274 | # RSS |
||||||
4275 | # http://www.w3.org/TR/REC-PICS-labels | ||||||
4276 | # ENHANCE-ME: Maybe transform |
||||||
4277 | # PICS too maybe, unless it only applies to the enclosure as such. Maybe | ||||||
4278 | # |
||||||
4279 | 0 | my $pics_label = collapse_whitespace ($channel->first_child_text('rating')); | |||||
4280 | |||||||
4281 | # Crib: an undef value for a header means omit that header, which is good | ||||||
4282 | # for say the merely optional "Content-Language" | ||||||
4283 | # | ||||||
4284 | # there can be multiple "feed" links from Atom ... | ||||||
4285 | # 'X-RSS-Feed-Link:' => $channel->{'link'}, | ||||||
4286 | # | ||||||
4287 | 0 | my %headers | |||||
4288 | = ('Path:' => scalar ($self->uri_to_host), | ||||||
4289 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
4290 | From => $from, | ||||||
4291 | Sender => $from, | ||||||
4292 | Subject => $subject, | ||||||
4293 | Keywords => scalar ($self->item_to_keywords($item)), | ||||||
4294 | Date => scalar ($self->item_to_date($item)), | ||||||
4295 | 'In-Reply-To:' => scalar ($self->item_to_in_reply_to($item)), | ||||||
4296 | References => $self->{'References:'}, | ||||||
4297 | 'Message-ID' => $msgid, | ||||||
4298 | 'Content-Language:' => $language, | ||||||
4299 | 'Importance:' => scalar ($self->item_to_importance($item)), | ||||||
4300 | 'Priority:' => scalar ($self->item_to_priority($item)), | ||||||
4301 | 'Face:' => scalar ($self->item_to_face($item)), | ||||||
4302 | 'List-Post:' => $list_post, | ||||||
4303 | 'Precedence:' => $precedence, | ||||||
4304 | 'PICS-Label:' => $pics_label, | ||||||
4305 | 'X-Copyright:' => scalar ($self->item_to_copyright($item)), | ||||||
4306 | 'X-RSS-URL:' => scalar ($self->{'uri'}->as_string), | ||||||
4307 | 'X-RSS-Feedburner:' => scalar ($self->item_to_feedburner($item)), | ||||||
4308 | 'X-RSS-Generator:' => scalar ($self->item_to_generator($item)), | ||||||
4309 | ); | ||||||
4310 | |||||||
4311 | 0 | my $attach_elt; | |||||
4312 | |||||||
4313 | # |
||||||
4314 | # a copy of |
||||||
4315 | # | ||||||
4316 | # ENHANCE-ME: |
||||||
4317 | # as well as |
||||||
4318 | # | ||||||
4319 | my $body = ( | ||||||
4320 | # |
||||||
4321 | # |
||||||
4322 | $item->first_child('content:encoded') | ||||||
4323 | || $item->first_child('description') | ||||||
4324 | || $item->first_child('dc:description') | ||||||
4325 | || $item->first_child('itunes:summary') | ||||||
4326 | 0 | 0 | || do { | ||||
4327 | # Atom spec is for no more than one |
||||||
4328 | # Exclude "link", and leave "attach" to code below. | ||||||
4329 | my $elt = $item->first_child('content'); | ||||||
4330 | my $flavour = atom_content_flavour($elt); | ||||||
4331 | ($flavour eq 'link' ? undef | ||||||
4332 | : $flavour eq 'attach' ? do { $attach_elt = $elt; undef } | ||||||
4333 | : $elt) | ||||||
4334 | } | ||||||
4335 | || $item->first_child('summary')); # Atom | ||||||
4336 | |||||||
4337 | 0 | my $body_type = elt_content_type ($body); | |||||
4338 | 0 | $self->verbose (3, ' body_type from elt: ', $body_type); | |||||
4339 | 0 | my $body_charset = 'utf-8'; | |||||
4340 | 0 | my $body_base_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base ($body); | |||||
4341 | 0 | 0 | if (! defined $body_type) { # no $body element at all | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4342 | 0 | $body = ''; | |||||
4343 | 0 | $body_type = 'text/plain'; | |||||
4344 | |||||||
4345 | } elsif ($body_type eq 'xhtml') { # Atom | ||||||
4346 | 0 | $body = elt_xhtml_to_html ($body); | |||||
4347 | 0 | $body_type = 'html'; | |||||
4348 | |||||||
4349 | } elsif ($body_type eq 'html') { # RSS or Atom | ||||||
4350 | 0 | $body = elt_subtext($body); | |||||
4351 | |||||||
4352 | } elsif ($body_type eq 'text') { # Atom 'text' to be flowed | ||||||
4353 | # should be text-only, no sub-elements, but extract sub-elements to | ||||||
4354 | # cope with dodgy feeds with improperly escaped html etc | ||||||
4355 | 0 | $body = $self->text_wrap (elt_subtext ($body)); | |||||
4356 | 0 | $body_type = 'text/plain'; | |||||
4357 | } elsif ($body_type =~ m{^text/}) { # Atom mime text type | ||||||
4358 | 0 | $body = elt_subtext ($body); | |||||
4359 | |||||||
4360 | } else { # Atom base64 something | ||||||
4361 | 0 | $body = MIME::Base64::decode ($body->text); | |||||
4362 | 0 | $body_charset = undef; | |||||
4363 | } | ||||||
4364 | 0 | 0 | $self->verbose (3, " body: $body_type charset=", | ||||
4365 | $body_charset//'undef', "\n", | ||||||
4366 | "$body\n"); | ||||||
4367 | |||||||
4368 | 0 | 0 | my $body_is_html = ($body_type eq 'html'|| $body_type eq 'text/html'); | ||||
4369 | 0 | 0 | my $links_want_html = ($body_is_html && ! $self->{'render'}); | ||||
4370 | 0 | 0 | $self->verbose (3, " links_want_html: ", | ||||
4371 | ($links_want_html ? "yes" : "no")); | ||||||
4372 | |||||||
4373 | # sort downloadables to the start, then by "priority" | ||||||
4374 | 1 | 1 | 1044 | use sort 'stable'; | |||
1 | 710 | ||||||
1 | 7 | ||||||
4375 | 0 | 0 | 0 | @links = sort {($b->{'download'}||0) <=> ($a->{'download'}||0) | |||
0 | 0 | ||||||
0 | |||||||
0 | |||||||
4376 | || ($b->{'priority'}||0) <=> ($a->{'priority'}||0)} | ||||||
4377 | @links; | ||||||
4378 | 0 | 0 | my $links_str = ($links_want_html | ||||
4379 | ? links_to_html(@links) | ||||||
4380 | : links_to_text(@links)); | ||||||
4381 | 0 | $links_str .= $self->item_common_alert_protocol($item, $links_want_html); | |||||
4382 | 0 | my @parts; | |||||
4383 | |||||||
4384 | # |
||||||
4385 | { | ||||||
4386 | 0 | my $content = join ("\n", | |||||
0 | |||||||
4387 | 0 | map {$self->media_group_to_html($_)} | |||||
4388 | $item->children('media:group')); | ||||||
4389 | 0 | 0 | if (is_non_empty($content)) { | ||||
4390 | 0 | ($content, my $charset) = html_wrap_fragment ($item, $content); | |||||
4391 | 0 | my $content_type = 'text/html'; | |||||
4392 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
4393 | = $self->render_maybe ($content, $content_type, $charset, | ||||||
4394 | $body_base_url); | ||||||
4395 | ### media group content: $content | ||||||
4396 | 0 | 0 | if ($content_type eq 'text/plain') { | ||||
4397 | 0 | $links_str .= $content; | |||||
4398 | } else { | ||||||
4399 | 0 | $content = Encode::encode ($charset, $content); | |||||
4400 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4401 | Type => $content_type, | ||||||
4402 | Charset => $charset, | ||||||
4403 | Data => $content); | ||||||
4404 | } | ||||||
4405 | } | ||||||
4406 | } | ||||||
4407 | |||||||
4408 | 0 | 0 | if (is_non_empty(my $content | ||||
4409 | = $self->item_unknowns($item, $links_want_html))) { | ||||||
4410 | 0 | 0 | my $content_type = ($links_want_html ? 'text/html' : 'text/plain'); | ||||
4411 | 0 | 0 | if (@parts) { | ||||
4412 | 0 | 0 | my $charset = (is_ascii($content) ? 'us-ascii' : 'utf-8'); | ||||
4413 | 0 | $content = Encode::encode ($charset, $content); | |||||
4414 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4415 | Type => $content_type, | ||||||
4416 | Charset => $charset, | ||||||
4417 | Data => $content); | ||||||
4418 | } else { | ||||||
4419 | 0 | $links_str .= $content; | |||||
4420 | } | ||||||
4421 | } | ||||||
4422 | |||||||
4423 | 0 | 0 | if ($self->{'rss_get_links'}) { | ||||
4424 | 0 | foreach my $l (@links) { | |||||
4425 | 0 | 0 | next if ! $l->{'download'}; | ||||
4426 | 0 | my $url = $l->{'uri'}; | |||||
4427 | 0 | $self->verbose (1, ' ', __x('link: "{name}" {url}', | |||||
4428 | name => $l->{'name'}, | ||||||
4429 | url => $url)); | ||||||
4430 | 0 | require HTTP::Request; | |||||
4431 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4432 | 0 | my $resp = $self->ua->request($req); | |||||
4433 | 0 | $resp = $self->aireview_follow ($url, $resp); | |||||
4434 | |||||||
4435 | 0 | 0 | if (! $resp->is_success) { | ||||
4436 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
4437 | url => $l->{'uri'}, | ||||||
4438 | status => $resp->status_line); | ||||||
4439 | 0 | my $msg = __x("Cannot download link {url}\n {status}", | |||||
4440 | url => $l->{'uri'}, | ||||||
4441 | status => $resp->status_line); | ||||||
4442 | 0 | 0 | if ($links_want_html) { | ||||
4443 | 0 | $msg = $Entitize{$msg}; | |||||
4444 | 0 | $msg =~ s/\n/ /; |
|||||
4445 | 0 | $links_str .= " $msg\n \n"; |
|||||
4446 | } else { | ||||||
4447 | 0 | $links_str .= "\n$msg\n"; | |||||
4448 | } | ||||||
4449 | 0 | next; | |||||
4450 | } | ||||||
4451 | |||||||
4452 | # suspect little value in a description when inlined | ||||||
4453 | # 'Content-Description:' => mimewords_non_ascii($l->{'title'}) | ||||||
4454 | # favicon used for Face if nothing in the item | ||||||
4455 | # | ||||||
4456 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
4457 | 0 | 0 | $headers{'Face:'} ||= $self->http_resp_to_face($resp); | ||||
4458 | 0 | $self->http_resp_extract_main($resp); | |||||
4459 | 0 | push @parts, $self->mime_part_from_response($resp); | |||||
4460 | } | ||||||
4461 | } | ||||||
4462 | 0 | 0 | 0 | if ($links_want_html && $body_type eq 'html') { | |||
4463 | # append to html fragment | ||||||
4464 | 0 | $body .= $links_str; | |||||
4465 | 0 | undef $links_str; | |||||
4466 | } | ||||||
4467 | |||||||
4468 | 0 | 0 | if ($body_type eq 'html') { | ||||
4469 | 0 | ($body, $body_charset) = html_wrap_fragment ($item, $body, $language); | |||||
4470 | 0 | $body_type = 'text/html'; | |||||
4471 | } | ||||||
4472 | 0 | 0 | if (defined $body_charset) { | ||||
4473 | 0 | $body = Encode::encode ($body_charset, $body); | |||||
4474 | } | ||||||
4475 | |||||||
4476 | 0 | ($body, $body_type, $body_charset) | |||||
4477 | = $self->render_maybe ($body, $body_type, $body_charset, $body_base_url); | ||||||
4478 | |||||||
4479 | 0 | 0 | if ($body_type eq 'text/plain') { | ||||
4480 | # remove trailing whitespace from any text | ||||||
4481 | 0 | $body =~ s/\s+$//; | |||||
4482 | 0 | $body .= "\n"; | |||||
4483 | |||||||
4484 | 0 | 0 | if (! $links_want_html) { | ||||
4485 | # append to text/plain, either atom type=text or rendered html | ||||||
4486 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4487 | 0 | $links_str = Encode::encode ($body_charset, $links_str); | |||||
4488 | 0 | $body .= "\n$links_str\n"; | |||||
4489 | } | ||||||
4490 | 0 | undef $links_str; | |||||
4491 | } | ||||||
4492 | } | ||||||
4493 | |||||||
4494 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4495 | 0 | my $links_type; | |||||
4496 | my $links_charset; | ||||||
4497 | 0 | 0 | if ($links_want_html) { | ||||
4498 | 0 | $links_type = 'text/html'; | |||||
4499 | 0 | ($links_str, $links_charset) = html_wrap_fragment ($item, $links_str); | |||||
4500 | } else { | ||||||
4501 | 0 | $links_type = 'text/plain'; | |||||
4502 | 0 | 0 | $links_charset = (is_ascii($links_str) ? 'us-ascii' : 'utf-8'); | ||||
4503 | } | ||||||
4504 | 0 | $links_str = Encode::encode ($links_charset, $links_str); | |||||
4505 | 0 | unshift @parts, $self->mime_build ({}, | |||||
4506 | Type => $links_type, | ||||||
4507 | Encoding => $links_charset, | ||||||
4508 | Data => $links_str); | ||||||
4509 | } | ||||||
4510 | |||||||
4511 | |||||||
4512 | 0 | my $top = $self->mime_build (\%headers, | |||||
4513 | Top => 1, | ||||||
4514 | Type => $body_type, | ||||||
4515 | Charset => $body_charset, | ||||||
4516 | Data => $body); | ||||||
4517 | |||||||
4518 | # Atom |
||||||
4519 | 0 | 0 | if ($attach_elt) { | ||||
4520 | # ENHANCE-ME: this decodes base64 from the xml and then re-encodes for | ||||||
4521 | # the mime, is it possible to pass straight in? | ||||||
4522 | 0 | 0 | unshift @parts, $self->mime_build | ||||
4523 | ({ 'Content-Location:' => $self->{'uri'}->as_string }, | ||||||
4524 | Type => scalar ($attach_elt->att('atom:type') | ||||||
4525 | // $attach_elt->att('type')), | ||||||
4526 | Encoding => 'base64', | ||||||
4527 | Data => MIME::Base64::decode($attach_elt->text)); | ||||||
4528 | } | ||||||
4529 | |||||||
4530 | 0 | $self->verbose (2, 'parts count: ',scalar(@parts)); | |||||
4531 | 0 | foreach my $part (@parts) { | |||||
4532 | 0 | $top->make_multipart; | |||||
4533 | 0 | $top->add_part ($part); | |||||
4534 | } | ||||||
4535 | |||||||
4536 | 0 | mime_entity_lines($top); | |||||
4537 | 0 | 0 | $self->nntp_post($top) || return 0; | ||||
4538 | 0 | $self->verbose (1, ' ', __('posted')); | |||||
4539 | 0 | $new++; | |||||
4540 | } | ||||||
4541 | |||||||
4542 | # ENHANCE-ME: check the replies count to see if more to fetch | ||||||
4543 | 0 | 0 | if ($self->{'rss_get_comments'}) { | ||||
4544 | 0 | my ($comments_rss_url, $comments_count) | |||||
4545 | = $self->item_to_comments_rss($item); | ||||||
4546 | ### rss_get_comments: $comments_rss_url, $comments_count | ||||||
4547 | 0 | 0 | if (defined $comments_rss_url) { | ||||
4548 | |||||||
4549 | # ENHANCE-ME: There's also a thr:updated in RFC 4685, but haven't seen | ||||||
4550 | # that ever actually used. | ||||||
4551 | 0 | my $status = $self->status_geturl ($comments_rss_url); | |||||
4552 | 0 | 0 | 0 | if (defined $status->{'comments_count'} | |||
0 | |||||||
4553 | && defined $comments_count | ||||||
4554 | && $status->{'comments_count'} == $comments_count) { | ||||||
4555 | 0 | $self->verbose (1, ' ', __x('comments count unchanged: {count}', | |||||
4556 | count => $comments_count)); | ||||||
4557 | |||||||
4558 | } else { | ||||||
4559 | 0 | local $self->{'rss_get_links'} = 0; | |||||
4560 | 0 | local $self->{'rss_get_comments'} = 0; | |||||
4561 | 0 | local $self->{'comments_count'} = $comments_count; | |||||
4562 | # "Re:" is not translated, variants of that are very annoying | ||||||
4563 | 0 | local $self->{'getting_rss_comments'} = "Re: $subject"; | |||||
4564 | 0 | local $self->{'References:'} = $msgid; | |||||
4565 | 0 | $new += fetch_rss ($self, $self->{'nntp_group'}, $comments_rss_url); | |||||
4566 | } | ||||||
4567 | } | ||||||
4568 | } | ||||||
4569 | 0 | return $new; | |||||
4570 | } | ||||||
4571 | |||||||
4572 | sub item_to_comments_rss { | ||||||
4573 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4574 | 0 | my ($url, $url_elt); | |||||
4575 | |||||||
4576 | # Atom | ||||||
4577 | # href='http:/...' /> | ||||||
4578 | 0 | foreach my $elt ($item->children('link')) { | |||||
4579 | 0 | 0 | my $rel = ($elt->att('rel') | ||||
0 | |||||||
4580 | // $elt->att('atom:rel') | ||||||
4581 | // next); | ||||||
4582 | 0 | 0 | $rel eq 'replies' or next; | ||||
4583 | 0 | 0 | $self->atom_link_is_rss($elt) or next; | ||||
4584 | 0 | 0 | my $href = ($elt->att('href') | ||||
4585 | // $elt->att('atom:href')); | ||||||
4586 | 0 | 0 | if (is_non_empty ($href)) { | ||||
4587 | 0 | $url = $href; | |||||
4588 | 0 | $elt = $url_elt; | |||||
4589 | } | ||||||
4590 | } | ||||||
4591 | |||||||
4592 | # |
||||||
4593 | # it appeared in the spec page as wfw:commentRSS, so ignore case | ||||||
4594 | 0 | 0 | if (! defined $url) { | ||||
4595 | 0 | my $u = $item->first_child_trimmed_text (qr/^wfw:commentRss$/i); | |||||
4596 | 0 | 0 | if (is_non_empty ($u)) { | ||||
4597 | 0 | $url = $u; | |||||
4598 | } | ||||||
4599 | } | ||||||
4600 | |||||||
4601 | 0 | 0 | return ($url, | ||||
4602 | (defined($url) && $self->item_elt_comments_count($item,$url_elt))); | ||||||
4603 | } | ||||||
4604 | |||||||
4605 | # |
||||||
4606 | # are which are replies, there's no comments link as such for it to refer | ||||||
4607 | # to, it seems | ||||||
4608 | sub item_elt_comments_count { | ||||||
4609 | 0 | 0 | 0 | my ($self, $item, $elt) = @_; | |||
4610 | 0 | 0 | return (($elt && $elt->att('thr:count')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4611 | // ($elt && $elt->att('count')) | ||||||
4612 | // ($elt && $elt->att('atom:count')) | ||||||
4613 | // non_empty ($item->first_child_trimmed_text('thr:total')) | ||||||
4614 | // non_empty ($item->first_child_trimmed_text('slash:comments'))); | ||||||
4615 | } | ||||||
4616 | @known{qw(/channel/item/jf:replyCount | ||||||
4617 | )} = (); | ||||||
4618 | |||||||
4619 | # $group is a string, the name of a local newsgroup | ||||||
4620 | # $url is a string, an RSS feed to be read | ||||||
4621 | # | ||||||
4622 | sub fetch_rss { | ||||||
4623 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
4624 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
0 | |||||||
4625 | 0 | $self->verbose (2, "fetch_rss: $group $url"); | |||||
4626 | |||||||
4627 | 0 | my $group_uri = URI->new($group,'news'); | |||||
4628 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
4629 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
4630 | 0 | 0 | $self->nntp_group_check($group) or return 0; | ||||
4631 | |||||||
4632 | # an in-memory cookie jar, used only per-RSS feed and then discarded, | ||||||
4633 | # which means only kept for fetching for $self->{'rss_get_links'} from a | ||||||
4634 | # feed | ||||||
4635 | 0 | $self->ua->cookie_jar({}); | |||||
4636 | |||||||
4637 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4638 | 0 | $self->verbose (1, ' ', __x('rss comments: {url}', url => $url)); | |||||
4639 | } else { | ||||||
4640 | 0 | $self->verbose (1, __x('feed: {url}', url => $url)); | |||||
4641 | } | ||||||
4642 | 0 | require HTTP::Request; | |||||
4643 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4644 | 0 | 0 | $self->status_etagmod_req($req,1) || return 0; | ||||
4645 | |||||||
4646 | # $req->uri can be a URI object or a string | ||||||
4647 | 0 | local $self->{'uri'} = URI->new ($req->uri); | |||||
4648 | |||||||
4649 | 0 | my $resp = $self->ua->request($req); | |||||
4650 | 0 | 0 | if ($resp->code == 304) { | ||||
4651 | 0 | $self->status_unchanged ($url); | |||||
4652 | 0 | return 0; | |||||
4653 | } | ||||||
4654 | 0 | 0 | if (! $resp->is_success) { | ||||
4655 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
4656 | url => $url, | ||||||
4657 | status => $resp->status_line); | ||||||
4658 | 0 | return 0; | |||||
4659 | } | ||||||
4660 | 0 | local $self->{'resp'} = $resp; | |||||
4661 | 0 | my $bytes = length($resp->as_string); | |||||
4662 | |||||||
4663 | 0 | $self->verbose (3, "response:", $resp->dump, "\n"); # extra newline | |||||
4664 | 0 | 0 | $resp->decode | ||||
4665 | or die "Oops, cannot decode Content-Encoding: ", | ||||||
4666 | $self->header("Content-Encoding"); | ||||||
4667 | |||||||
4668 | 0 | my $xml = $resp->content; # raw bytes | |||||
4669 | 0 | $xml = $self->enforce_rss_charset_override ($xml); | |||||
4670 | |||||||
4671 | 0 | my ($twig, $err) = $self->twig_parse($xml); | |||||
4672 | 0 | 0 | if (defined $err) { | ||||
4673 | 0 | my $message = __x("XML::Twig parse error on\n\n {url}\n\n", | |||||
4674 | url => $url); | ||||||
4675 | 0 | 0 | if ($resp->request->uri ne $url) { | ||||
4676 | 0 | $message .= __x("which redirected to\n\n {url}\n\n", | |||||
4677 | url => $resp->request->uri); | ||||||
4678 | } | ||||||
4679 | 0 | $message .= $err . "\n\n" . __("Raw XML below.\n") . "\n"; | |||||
4680 | 0 | $self->error_message | |||||
4681 | (__x("Error parsing {url}", url => $url), | ||||||
4682 | $message, $xml); | ||||||
4683 | # after successful error message to news | ||||||
4684 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
4685 | 0 | return 0; | |||||
4686 | } | ||||||
4687 | 0 | 0 | if ($self->{'verbose'} >= 3) { | ||||
4688 | 0 | require Data::Dumper; | |||||
4689 | 0 | $self->verbose (3, | |||||
4690 | Data::Dumper->new([$twig->root],['root']) | ||||||
4691 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
4692 | } | ||||||
4693 | |||||||
4694 | # "item" for RSS/RDF, "entry" for Atom | ||||||
4695 | 0 | my @items = $twig->descendants(qr/^(item|entry)$/); | |||||
4696 | |||||||
4697 | 0 | @items = $self->rss_newest_only_items(@items); | |||||
4698 | |||||||
4699 | 0 | my $new = 0; | |||||
4700 | 0 | foreach my $item (@items) { | |||||
4701 | 0 | $new += $self->fetch_rss_process_one_item ($item); | |||||
4702 | } | ||||||
4703 | |||||||
4704 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
4705 | 0 | my $jar = $self->ua->cookie_jar; | |||||
4706 | 0 | 0 | 0 | if ($jar && (my $str = $jar->as_string ne '')) { | |||
4707 | 0 | $self->verbose (2, "accumulated cookies from this feed:\n", $str); | |||||
4708 | } else { | ||||||
4709 | 0 | $self->verbose (2, 'no cookies from this feed'); | |||||
4710 | } | ||||||
4711 | } | ||||||
4712 | 0 | $self->ua->cookie_jar (undef); | |||||
4713 | |||||||
4714 | 0 | $self->status_etagmod_resp ($url, $resp, $twig); | |||||
4715 | 0 | say __xn('{group}: {count} new article, from {bytes} bytes feed', | |||||
4716 | '{group}: {count} new articles, from {bytes} bytes feed', | ||||||
4717 | $new, | ||||||
4718 | group => $group, | ||||||
4719 | count => $new, | ||||||
4720 | bytes => $bytes); | ||||||
4721 | |||||||
4722 | 0 | return $new; | |||||
4723 | } | ||||||
4724 | |||||||
4725 | 1; | ||||||
4726 | __END__ |