blib/lib/App/RSS2Leafnode.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 83 | 1611 | 5.1 |
branch | 0 | 732 | 0.0 |
condition | 0 | 556 | 0.0 |
subroutine | 29 | 163 | 17.7 |
pod | 4 | 124 | 3.2 |
total | 116 | 3186 | 3.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Kevin Ryde | ||||||
2 | # | ||||||
3 | # This file is part of RSS2Leafnode. | ||||||
4 | # | ||||||
5 | # RSS2Leafnode is free software; you can redistribute it and/or modify it | ||||||
6 | # under the terms of the GNU General Public License as published by the Free | ||||||
7 | # Software Foundation; either version 3, or (at your option) any later | ||||||
8 | # version. | ||||||
9 | # | ||||||
10 | # RSS2Leafnode is distributed in the hope that it will be useful, but | ||||||
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
12 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||||||
13 | # for more details. | ||||||
14 | # | ||||||
15 | # You should have received a copy of the GNU General Public License along | ||||||
16 | # with RSS2Leafnode. If not, see |
||||||
17 | |||||||
18 | |||||||
19 | # maybe: | ||||||
20 | # location links | ||||||
21 | # http://maps.google.com/maps?ll=-35.066667,148.1 | ||||||
22 | # http://maps.google.com/maps?ll=-35.066667,148.1&spn=0.01,0.01&t=m | ||||||
23 | # | ||||||
24 | # | ||||||
25 | # when site has mutliple names for a page, relative or absolute | ||||||
26 | |||||||
27 | |||||||
28 | |||||||
29 | package App::RSS2Leafnode; | ||||||
30 | 2 | 2 | 2097 | use 5.010; | |||
2 | 5 | ||||||
31 | 2 | 2 | 7 | use strict; | |||
2 | 2 | ||||||
2 | 31 | ||||||
32 | 2 | 2 | 14 | use warnings; | |||
2 | 3 | ||||||
2 | 44 | ||||||
33 | 2 | 2 | 6 | use Carp; | |||
2 | 2 | ||||||
2 | 98 | ||||||
34 | 2 | 2 | 487 | use Encode; | |||
2 | 7056 | ||||||
2 | 107 | ||||||
35 | 2 | 2 | 1053 | use Hash::Util::FieldHash; | |||
2 | 1330 | ||||||
2 | 83 | ||||||
36 | 2 | 2 | 9 | use List::Util 'min', 'max'; | |||
2 | 2 | ||||||
2 | 134 | ||||||
37 | 2 | 2 | 916 | use List::MoreUtils; | |||
2 | 13490 | ||||||
2 | 10 | ||||||
38 | 2 | 2 | 1511 | use POSIX (); # ENOENT, etc | |||
2 | 9257 | ||||||
2 | 43 | ||||||
39 | 2 | 2 | 14 | use Scalar::Util; | |||
2 | 2 | ||||||
2 | 65 | ||||||
40 | 2 | 2 | 755 | use Text::Trim 1.02; # version 1.02 for undef support | |||
2 | 815 | ||||||
2 | 90 | ||||||
41 | 2 | 2 | 890 | use URI; | |||
2 | 7583 | ||||||
2 | 52 | ||||||
42 | 2 | 2 | 781 | use HTML::Entities::Interpolate; | |||
2 | 5478 | ||||||
2 | 9 | ||||||
43 | |||||||
44 | 2 | 2 | 1031 | use App::RSS2Leafnode::XML::Twig::Other; | |||
2 | 3 | ||||||
2 | 75 | ||||||
45 | |||||||
46 | # version 1.17 for __p(), and version 1.16 for turn_utf_8_on() | ||||||
47 | 2 | 2 | 447 | use Locale::TextDomain 1.17; | |||
2 | 5602 | ||||||
2 | 9 | ||||||
48 | 2 | 2 | 6509 | use Locale::TextDomain ('App-RSS2Leafnode'); | |||
2 | 2 | ||||||
2 | 6 | ||||||
49 | BEGIN { | ||||||
50 | 2 | 2 | 51 | use Locale::Messages; | |||
2 | 2 | ||||||
2 | 100 | ||||||
51 | 2 | 2 | 9 | Locale::Messages::bind_textdomain_codeset ('App-RSS2Leafnode','UTF-8'); | |||
52 | 2 | 29 | Locale::Messages::bind_textdomain_filter ('App-RSS2Leafnode', | ||||
53 | \&Locale::Messages::turn_utf_8_on); | ||||||
54 | } | ||||||
55 | |||||||
56 | # uncomment this to run the ### lines | ||||||
57 | # use Smart::Comments; | ||||||
58 | |||||||
59 | our $VERSION; | ||||||
60 | BEGIN { | ||||||
61 | 2 | 2 | 1148 | $VERSION = 79; | |||
62 | } | ||||||
63 | |||||||
64 | ## no critic (ProhibitFixedStringMatches) | ||||||
65 | |||||||
66 | |||||||
67 | # Cribs: | ||||||
68 | # | ||||||
69 | # RSS | ||||||
70 | # http://my.netscape.com/publish/help/ | ||||||
71 | # RSS 0.9 spec. | ||||||
72 | # http://my.netscape.com/publish/help/mnn20/quickstart.html | ||||||
73 | # RSS 0.91 spec. | ||||||
74 | # http://purl.org/rss/1.0/ | ||||||
75 | # RSS 1.0 spec. | ||||||
76 | # http://www.rssboard.org/rss-specification | ||||||
77 | # http://www.rssboard.org/files/rss-2.0-sample.xml | ||||||
78 | # RSS 2.0 spec and sample. | ||||||
79 | # | ||||||
80 | # http://www.rssboard.org/rss-profile | ||||||
81 | # "Best practices." | ||||||
82 | # | ||||||
83 | # Dublin Core | ||||||
84 | # RFC 5013 -- summary | ||||||
85 | # http://dublincore.org/documents/dcmi-terms/ -- dc/terms | ||||||
86 | # | ||||||
87 | # Atom | ||||||
88 | # RFC 4287 -- Atom spec | ||||||
89 | # RFC 3339 -- ISO timestamps as used in Atom | ||||||
90 | # RFC 4685 -- "thr" threading extensions | ||||||
91 | # RFC 4946 -- | ||||||
92 | # RFC 5005 -- etc paging and archiving | ||||||
93 | # http://diveintomark.org/archives/2004/05/28/howto-atom-id | ||||||
94 | # Making an |
||||||
95 | # http://www.iana.org/assignments/link-relations/link-relations.xhtml | ||||||
96 | # assigned values | ||||||
97 | # | ||||||
98 | # RSS Modules: | ||||||
99 | # http://www.meatballwiki.org/wiki/ModWiki -- wiki | ||||||
100 | # http://web.resource.org/rss/1.0/modules/slash/ | ||||||
101 | # http://code.google.com/apis/feedburner/feedburner_namespace_reference.html | ||||||
102 | # http://backend.userland.com/creativeCommonsRSSModule | ||||||
103 | # | ||||||
104 | # http://web.resource.org/rss/1.0/modules/content/ | ||||||
105 | # http://www.rssboard.org/rss-profile#namespace-elements-content | ||||||
106 | # http://validator.w3.org/feed/docs/warning/NeedDescriptionBeforeContent.html | ||||||
107 | # |
||||||
108 | # | ||||||
109 | # http://www.apple.com/itunes/podcasts/specs.html | ||||||
110 | # http://www.feedforall.com/itunes.htm | ||||||
111 | # http://www.w3.org/2003/01/geo/wgs84_pos -- |
||||||
112 | # http://www.georss.org/ | ||||||
113 | # http://www.georss.org/Encodings | ||||||
114 | # http://www.georss.org/atom | ||||||
115 | # http://www.georss.org/rdf_rss1 | ||||||
116 | # | ||||||
117 | # http://activitystrea.ms/specs/atom/1.0/ | ||||||
118 | # activity: | ||||||
119 | # http://prismstandard.org/namespaces/basic/2.0/ | ||||||
120 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
121 | # Prism | ||||||
122 | # | ||||||
123 | # URIs | ||||||
124 | # RFC 1738, RFC 2396, RFC 3986 -- URI formats (news/nntp in 1738) | ||||||
125 | # draft-ellermann-news-nntp-uri-11.txt -- news/nntp update | ||||||
126 | # RFC 2732 -- ipv6 "[]" hostnames | ||||||
127 | # RFC 2141 -- urn: | ||||||
128 | # RFC 4122 -- uuid format (as under urn:uuid:) | ||||||
129 | # RFC 4151 -- tag: | ||||||
130 | # RFC 1034, RFC 1123 -- domain names | ||||||
131 | # RFC 2606 -- reserved domain names ".invalid" | ||||||
132 | # | ||||||
133 | # XML | ||||||
134 | # http://www.w3.org/TR/xmlbase/ -- xml:base | ||||||
135 | # RFC 3023 text/xml etc media types | ||||||
136 | # | ||||||
137 | # Mail Messages | ||||||
138 | # RFC 850, RFC 1036 | ||||||
139 | # -- News message format, inc headers and rnews format | ||||||
140 | # RFC 2822, RFC 5322, RFC 5536 | ||||||
141 | # -- Email message format. | ||||||
142 | # RFC 2076, RFC 4021 -- headers summary. | ||||||
143 | # RFC 2557 -- MHTML Content-Location | ||||||
144 | # RFC 1864 -- Content-MD5 header | ||||||
145 | # RFC 2369 -- List-Post header and friends | ||||||
146 | # http://www.ietf.org/proceedings/98dec/I-D/draft-ietf-drums-mail-followup-to-00.txt | ||||||
147 | # Draft "Mail-Followup-To" header. | ||||||
148 | # | ||||||
149 | # RFC 1327 -- X.400 to RFC822 introducing Language header | ||||||
150 | # RFC 3282 -- Content-Language header | ||||||
151 | # RFC 1766, RFC 3066, RFC 4646 -- language tag form | ||||||
152 | # | ||||||
153 | # | ||||||
154 | # NNTP | ||||||
155 | # RFC 977 -- NNTP | ||||||
156 | # RFC 2616 -- HTTP/1.1 Accept-Encoding header | ||||||
157 | # RFC 2980 -- NNTP extensions | ||||||
158 | # | ||||||
159 | # RFC 4642 -- NNTP with SSL | ||||||
160 | # | ||||||
161 | # For XML in Perl there's several ways to do it! | ||||||
162 | # - XML::Parser looks likely for stream/event processing, but its builtin | ||||||
163 | # tree mode is very basic. | ||||||
164 | # - XML::Twig extends XML::Parser to a good tree, though the docs are | ||||||
165 | # slightly light on. It only does a subset of "XPath" but the | ||||||
166 | # functions/regexps are more perl-like for matching and there's various | ||||||
167 | # handy shortcuts for common operations. | ||||||
168 | # - XML::LibXML is the full blown libxml and is rather a lot to learn. | ||||||
169 | # Because it's mainly C it's not easy to find where or how you're going | ||||||
170 | # wrong when your code doesn't work. libxml also seems stricter about | ||||||
171 | # namespace matters than XML::Parser/XML::Twig. | ||||||
172 | # - XML::RSS uses XML::Parser to build its own style tree of RSS, | ||||||
173 | # including unifying differences among RSS/RDF 0.91, 1.0 and 2.0. | ||||||
174 | # Nested elements seem to need specific handling in its code, which can | ||||||
175 | # make it tricky for sub-element oddities. A fair amount of it is about | ||||||
176 | # writing RSS too. | ||||||
177 | # - XML::RSS::LibXML uses libxml for XML::RSS compatible reading and | ||||||
178 | # writing. It seems to do better on unrecognised sub-elements. | ||||||
179 | # - XML::Atom offers the basic Atom elements but doesn't seem to give | ||||||
180 | # access to extra stuff that might be in a feed. | ||||||
181 | # - XML::Feed tries to unify XML::RSS and XML::Atom but again doesn't seem | ||||||
182 | # to go much beyond the basics. It too is geared towards writing as | ||||||
183 | # well as reading. | ||||||
184 | # - XML::TreePP pure perl parser to a hash tree. | ||||||
185 | # | ||||||
186 | # The choice of XML::Twig is based on wanting both RSS and Atom, but | ||||||
187 | # XML::Feed not going far enough. Tree processing is easier than stream, | ||||||
188 | # and an RSS isn't meant to be huge. A tree may help if channel fields | ||||||
189 | # follow items or something equally unnatural, but will probably assume that | ||||||
190 | # doesn't happen and look at the twig partial-tree mode. Between the tree | ||||||
191 | # styles XML::LibXML is harder to get into than Twig. | ||||||
192 | # | ||||||
193 | |||||||
194 | #------------------------------------------------------------------------------ | ||||||
195 | # mostly generic | ||||||
196 | |||||||
197 | # return $str with a newline at the end, if it doesn't already have one | ||||||
198 | sub str_ensure_newline { | ||||||
199 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
200 | 0 | 0 | 0 | if ($str !~ /\n$/) { $str .= "\n" } | |||
0 | 0 | ||||||
201 | 0 | 0 | return $str; | ||||
202 | } | ||||||
203 | |||||||
204 | sub md5_of_utf8 { | ||||||
205 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
206 | 0 | 0 | require Digest::MD5; | ||||
207 | 0 | 0 | return Digest::MD5::md5_base64 (Encode::encode_utf8 ($str)); | ||||
208 | } | ||||||
209 | |||||||
210 | sub is_empty { | ||||||
211 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
212 | 0 | 0 | 0 | return (! defined $str || $str =~ /^\s*$/); | |||
213 | } | ||||||
214 | sub is_non_empty { | ||||||
215 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
216 | 0 | 0 | return ! is_empty($str); | ||||
217 | } | ||||||
218 | sub non_empty { | ||||||
219 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
220 | 0 | 0 | 0 | return (is_non_empty($str) ? $str : ()); | |||
221 | } | ||||||
222 | |||||||
223 | sub join_non_empty { | ||||||
224 | 0 | 0 | 0 | 0 | my $sep = shift; | ||
225 | 0 | 0 | return non_empty (join($sep, map {non_empty($_)} @_)); | ||||
0 | 0 | ||||||
226 | } | ||||||
227 | |||||||
228 | sub collapse_whitespace { | ||||||
229 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
230 | 0 | 0 | 0 | defined $str or return undef; | |||
231 | 0 | 0 | 0 | $str =~ s/(\s+)/($1 eq ' ' ? $1 : ' ')/ge; | |||
0 | 0 | ||||||
232 | 0 | 0 | return Text::Trim::trim($str); | ||||
233 | } | ||||||
234 | |||||||
235 | # return true if $str is entirely ascii chars 0 to 127 | ||||||
236 | sub is_ascii { | ||||||
237 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
238 | 0 | 0 | return ($str !~ /[^[:ascii:]]/); | ||||
239 | } | ||||||
240 | |||||||
241 | # Return the number of lines in $str. | ||||||
242 | # If $str ends with a newline then that counts as the last line, so "xyz\n" | ||||||
243 | # is one line. If $str doesn't end with a newline then the final chars are | ||||||
244 | # a line, so "abc\ndef" is two lines. | ||||||
245 | sub str_count_lines { | ||||||
246 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
247 | 0 | 0 | 0 | return scalar($str =~ tr/\n//) + (length($str) && substr($str,-1) ne "\n"); | |||
248 | } | ||||||
249 | |||||||
250 | sub File_Temp_DEBUG_saver { | ||||||
251 | 0 | 0 | 0 | 0 | my ($self, $newval) = @_; | ||
252 | 0 | 0 | require Scope::Guard; | ||||
253 | 0 | 0 | require File::Temp; | ||||
254 | 0 | 0 | my $oldval = $File::Temp::DEBUG; | ||||
255 | 0 | 0 | 0 | my $ret = Scope::Guard->new (sub { $File::Temp::DEBUG = $oldval }); | |||
0 | 0 | ||||||
256 | 0 | 0 | $File::Temp::DEBUG = $newval; | ||||
257 | 0 | 0 | return $ret; | ||||
258 | } | ||||||
259 | sub MIME_Tools_debugging { | ||||||
260 | 0 | 0 | 0 | 0 | my ($self, $newval) = @_; | ||
261 | 0 | 0 | require Scope::Guard; | ||||
262 | 0 | 0 | require MIME::Tools; | ||||
263 | 0 | 0 | my $oldval = MIME::Tools->debugging; | ||||
264 | 0 | 0 | 0 | my $ret = Scope::Guard->new (sub { MIME::Tools->debugging($oldval) }); | |||
0 | 0 | ||||||
265 | 0 | 0 | MIME::Tools->debugging ($newval); | ||||
266 | 0 | 0 | return $ret; | ||||
267 | } | ||||||
268 | |||||||
269 | sub homedir { | ||||||
270 | # my ($self) = @_; | ||||||
271 | 0 | 0 | 0 | 0 | require File::HomeDir; | ||
272 | # call each time just in case playing tricks with $ENV{HOME} in conf file | ||||||
273 | 0 | 0 | 0 | return File::HomeDir->my_home | |||
274 | // croak 'File::HomeDir says you have no home directory'; | ||||||
275 | } | ||||||
276 | |||||||
277 | #------------------------------------------------------------------------------ | ||||||
278 | # Number::Format for sizes in bytes | ||||||
279 | |||||||
280 | use constant::defer NUMBER_FORMAT => sub { | ||||||
281 | 0 | 0 | require Number::Format; | ||||
282 | 0 | 0 | Number::Format->VERSION(1.5); # for format_bytes() options params | ||||
283 | 0 | 0 | return Number::Format->new | ||||
284 | (-kilo_suffix => __p('number-format-kilobytes','K'), | ||||||
285 | -mega_suffix => __p('number-format-megabytes','M'), | ||||||
286 | -giga_suffix => __p('number-format-gigabytes','G')); | ||||||
287 | 2 | 2 | 852 | }; | |||
2 | 1001 | ||||||
2 | 14 | ||||||
288 | |||||||
289 | sub format_size_in_bytes { | ||||||
290 | 0 | 0 | 0 | 0 | my ($self, $length) = @_; | ||
291 | 0 | 0 | 0 | if ($length >= 2000) { | |||
292 | 0 | 0 | return $self->NUMBER_FORMAT()->format_bytes ($length, precision => 1); | ||||
293 | } else { | ||||||
294 | 0 | 0 | return __x('{size} bytes', size => $length); | ||||
295 | } | ||||||
296 | } | ||||||
297 | |||||||
298 | #------------------------------------------------------------------------------ | ||||||
299 | |||||||
300 | sub new { | ||||||
301 | 2 | 2 | 1 | 1626 | my $class = shift; | ||
302 | 2 | 12 | return bless { | ||||
303 | # config variables | ||||||
304 | verbose => 0, | ||||||
305 | render => 0, | ||||||
306 | render_width => 60, | ||||||
307 | rss_get_links => 0, | ||||||
308 | rss_get_comments => 0, | ||||||
309 | rss_newest_only => 0, | ||||||
310 | get_icon => 0, | ||||||
311 | html_charset_from_content => 0, | ||||||
312 | |||||||
313 | # secret extra | ||||||
314 | msgidextra => '', | ||||||
315 | |||||||
316 | @_, | ||||||
317 | }, $class; | ||||||
318 | } | ||||||
319 | |||||||
320 | sub command_line { | ||||||
321 | 0 | 0 | 1 | 0 | my ($self) = @_; | ||
322 | |||||||
323 | 0 | 0 | my $done_version; | ||||
324 | 0 | 0 | require Getopt::Long; | ||||
325 | 0 | 0 | Getopt::Long::Configure ('no_ignore_case'); | ||||
326 | Getopt::Long::GetOptions | ||||||
327 | ('config=s' => \$self->{'config_filename'}, | ||||||
328 | 'verbose:1' => \$self->{'verbose'}, | ||||||
329 | 'version' => sub { | ||||||
330 | 0 | 0 | 0 | say __x("RSS2Leafnode version {version}", version => $VERSION); | |||
331 | 0 | 0 | $done_version = 1; | ||||
332 | }, | ||||||
333 | 'bareversion' => sub { | ||||||
334 | 0 | 0 | 0 | say $VERSION; | |||
335 | 0 | 0 | $done_version = 1; | ||||
336 | }, | ||||||
337 | 'msgid=s' => \$self->{'msgidextra'}, | ||||||
338 | 'help|?' => sub { | ||||||
339 | 0 | 0 | 0 | say __x("rss2leafnode [--options]"); | |||
340 | 0 | 0 | say __x(" --config=filename configuration file (default ~/.rss2leafnode.conf)"); | ||||
341 | 0 | 0 | say __x(" --help print this help"); | ||||
342 | 0 | 0 | say __x(" --verbose describe what's done"); | ||||
343 | 0 | 0 | say __x(" --verbose=2 show technical details of what's done"); | ||||
344 | 0 | 0 | say __x(" --version print program version number"); | ||||
345 | 0 | 0 | exit 0; | ||||
346 | 0 | 0 | 0 | }) or return 1; | |||
347 | 0 | 0 | 0 | if (! $done_version) { | |||
348 | 0 | 0 | $self->do_config_file; | ||||
349 | 0 | 0 | $self->nntp_close; | ||||
350 | } | ||||||
351 | 0 | 0 | return 0; | ||||
352 | } | ||||||
353 | |||||||
354 | sub verbose { | ||||||
355 | 0 | 0 | 0 | 0 | my $self = shift; | ||
356 | 0 | 0 | my $count = shift; | ||||
357 | 0 | 0 | 0 | if ($self->{'verbose'} >= $count) { | |||
358 | 0 | 0 | say @_; | ||||
359 | } | ||||||
360 | } | ||||||
361 | |||||||
362 | sub config_filename { | ||||||
363 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
364 | 0 | 0 | 0 | return $self->{'config_filename'} // do { | |||
365 | 0 | 0 | require File::Spec; | ||||
366 | 0 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.conf'); | ||||
367 | }; | ||||||
368 | } | ||||||
369 | sub status_filename { | ||||||
370 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
371 | 0 | 0 | 0 | return $self->{'status_filename'} // do { | |||
372 | 0 | 0 | require File::Spec; | ||||
373 | 0 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.status'); | ||||
374 | }; | ||||||
375 | } | ||||||
376 | |||||||
377 | sub do_config_file { | ||||||
378 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
379 | 0 | 0 | my @guards; | ||||
380 | |||||||
381 | 0 | 0 | 0 | open STDERR, '>&STDOUT' or die "Oops, can't join STDERR to STDOUT"; | |||
382 | |||||||
383 | # File::Temp::DEBUG for possible temp files used by HTML::FormatExternal | ||||||
384 | # these debugs turned on only for the duration of running the config file | ||||||
385 | # and the downloading etc in it | ||||||
386 | 0 | 0 | 0 | if ($self->{'verbose'} >= 2) { | |||
387 | 0 | 0 | push @guards, $self->File_Temp_DEBUG_saver(1); | ||||
388 | 0 | 0 | push @guards, $self->MIME_Tools_debugging(1); | ||||
389 | } | ||||||
390 | |||||||
391 | 0 | 0 | my $config_filename = $self->config_filename; | ||||
392 | 0 | 0 | $self->verbose (1, "config: ", $config_filename); | ||||
393 | |||||||
394 | 0 | 0 | require App::RSS2Leafnode::Conf; | ||||
395 | 0 | 0 | local $App::RSS2Leafnode::Conf::r2l = $self; | ||||
396 | 0 | 0 | 0 | if (! defined (do { package App::RSS2Leafnode::Conf; | |||
397 | 0 | 0 | do $config_filename; | ||||
398 | })) { | ||||||
399 | 0 | 0 | 0 | if (! -e $config_filename) { | |||
400 | 0 | 0 | croak "rss2leafnode: config file $config_filename doesn't exist\n"; | ||||
401 | } else { | ||||||
402 | 0 | 0 | croak $@; | ||||
403 | } | ||||||
404 | } | ||||||
405 | } | ||||||
406 | |||||||
407 | #------------------------------------------------------------------------------ | ||||||
408 | # LWP stuff | ||||||
409 | |||||||
410 | sub user_agent { | ||||||
411 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
412 | 0 | 0 | 0 | if (defined $self->{'user_agent'}) { | |||
413 | 0 | 0 | return $self->{'user_agent'}; | ||||
414 | } else { | ||||||
415 | 0 | 0 | return 'RSS2leafnode/' . $self->VERSION . ' '; | ||||
416 | } | ||||||
417 | } | ||||||
418 | |||||||
419 | sub ua { | ||||||
420 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
421 | 0 | 0 | 0 | return ($self->{'ua'} ||= do { | |||
422 | 0 | 0 | require LWP::UserAgent; | ||||
423 | 0 | 0 | LWP::UserAgent->VERSION(5.832); # 5.832 for content_charset() | ||||
424 | |||||||
425 | # one connection kept alive | ||||||
426 | 0 | 0 | my $ua = LWP::UserAgent->new (keep_alive => 1); | ||||
427 | 0 | 0 | Scalar::Util::weaken ($ua->{(__PACKAGE__)} = $self); | ||||
428 | 0 | 0 | $ua->agent ($self->user_agent); | ||||
429 | |||||||
430 | 0 | 0 | Scalar::Util::weaken (my $weak_self = $self); | ||||
431 | 0 | 0 | $ua->add_handler (request_send => \&lwp_request_send__verbose); | ||||
432 | $ua->add_handler (response_done => sub { | ||||||
433 | 0 | 0 | 0 | lwp_response_done__check_md5 ($weak_self, @_); | |||
434 | 0 | 0 | }); | ||||
435 | |||||||
436 | # ask for everything $resp->decode() / $resp->decoded_content() can cope | ||||||
437 | # with, in particular "gzip" and "deflate" compression if Compress::Zlib | ||||||
438 | # etc is available | ||||||
439 | # | ||||||
440 | 0 | 0 | require HTTP::Message; | ||||
441 | 0 | 0 | my $decodable = HTTP::Message::decodable(); | ||||
442 | 0 | 0 | $self->verbose (2, "HTTP decodable: ", $decodable); | ||||
443 | 0 | 0 | $ua->default_header ('Accept-Encoding' => $decodable); | ||||
444 | |||||||
445 | 0 | 0 | $ua | ||||
446 | }); | ||||||
447 | } | ||||||
448 | |||||||
449 | sub lwp_request_send__verbose { | ||||||
450 | 0 | 0 | 0 | 0 | my ($req, $ua, $h) = @_; | ||
451 | 0 | 0 | my $self = $ua->{(__PACKAGE__)}; | ||||
452 | 0 | 0 | $self->verbose (2, "request_send:", $req->dump, "\n"); # extra newline | ||||
453 | 0 | 0 | return; # continue processing | ||||
454 | } | ||||||
455 | |||||||
456 | sub lwp_response_done__check_md5 { | ||||||
457 | 0 | 0 | 0 | 0 | my ($self, $resp, $ua, $h) = @_; | ||
458 | 0 | 0 | 0 | $self || return; | |||
459 | ### lwp_response_done__check_md5() ... | ||||||
460 | 0 | 0 | 0 | my $want = $resp->header('Content-MD5') // do { | |||
461 | 0 | 0 | $self->verbose (2, 'no Content-MD5 header'); | ||||
462 | 0 | 0 | return; | ||||
463 | }; | ||||||
464 | 0 | 0 | $resp->decode; | ||||
465 | 0 | 0 | my $cref = $resp->content_ref; | ||||
466 | 0 | 0 | require Digest::MD5; | ||||
467 | 0 | 0 | my $got = Digest::MD5::md5_hex($$cref); | ||||
468 | 0 | 0 | 0 | if ($got ne $want) { | |||
469 | 0 | 0 | print __x("Warning, MD5 checksum mismatch on download {url}\n", | ||||
470 | url => $resp->request->uri); | ||||||
471 | } else { | ||||||
472 | 0 | 0 | $self->verbose(2, 'Content-MD5 ok'); | ||||
473 | } | ||||||
474 | } | ||||||
475 | |||||||
476 | # $resp is a HTTP::Response object. Modify its headers to apply our | ||||||
477 | # $html_charset_from_content option, which means if it's set then prefer the | ||||||
478 | # document's Content-Type over what the server says. | ||||||
479 | # | ||||||
480 | # The LWP::UserAgent parse_head option appends the document bits to | ||||||
481 | # the message headers. If the server and the document both offer a | ||||||
482 | # Content-Type then there's two, with the document one last, so all we have | ||||||
483 | # to do is change to make the last one the only one. | ||||||
484 | # | ||||||
485 | sub enforce_html_charset_from_content { | ||||||
486 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
487 | 0 | 0 | 0 | 0 | if ($self->{'html_charset_from_content'} | ||
488 | && $resp->headers->content_is_html) { | ||||||
489 | 0 | 0 | my $old = $resp->header('Content-Type'); | ||||
490 | 0 | 0 | $resp->header('Content-Type' => $resp->headers->content_type); | ||||
491 | |||||||
492 | 0 | 0 | $self->verbose (2, 'html_charset_from_content mangled Content-Type from'); | ||||
493 | 0 | 0 | $self->verbose (2, " from ", $old); | ||||
494 | 0 | 0 | $self->verbose (2, " to ", $resp->header('Content-Type')); | ||||
495 | 0 | 0 | $self->verbose (2, " giving charset ", $resp->content_charset); | ||||
496 | } | ||||||
497 | } | ||||||
498 | |||||||
499 | |||||||
500 | #------------------------------------------------------------------------------ | ||||||
501 | my %known; | ||||||
502 | |||||||
503 | # |
||||||
504 | # ENHANCE-ME: is this something to work into the skipdays? or a message expiry? | ||||||
505 | # | ||||||
506 | $known{'/channel/item/dcterms:valid'} = undef; | ||||||
507 | |||||||
508 | # |
||||||
509 | $known{'/channel/item/dcterms:audience'} = undef; | ||||||
510 | |||||||
511 | # |
||||||
512 | # |
||||||
513 | @known{qw(/channel/item/eq:depth | ||||||
514 | /channel/item/eq:seconds)} = (); | ||||||
515 | |||||||
516 | # rdf structure stuff | ||||||
517 | @known{qw(/channel/items | ||||||
518 | /channel/items/rdf:Seq | ||||||
519 | /channel/items/rdf:Seq/rdf:li)} = (); | ||||||
520 | |||||||
521 | @known{('/channel/cloud', | ||||||
522 | '/channel/link', | ||||||
523 | '/channel/docs', | ||||||
524 | '/channel/generator', | ||||||
525 | '/channel/rating', | ||||||
526 | '/channel/id', | ||||||
527 | '/channel/description', | ||||||
528 | '/channel/tagline', | ||||||
529 | '/channel/info', # atom something freeform | ||||||
530 | '/channel/itunes:summary', | ||||||
531 | '/channel/feedburner:info', | ||||||
532 | |||||||
533 | # nothing much in these as yet eg. rssboard | ||||||
534 | '/channel/item/sitemap:priority', | ||||||
535 | '/channel/item/sitemap:changefreq', | ||||||
536 | |||||||
537 | # feedburner junk | ||||||
538 | '/channel/feedburner:feedFlare', | ||||||
539 | |||||||
540 | # images | ||||||
541 | '/channel/itunes:owner', | ||||||
542 | '/channel/itunes:owner/itunes:name', | ||||||
543 | '/channel/itunes:owner/itunes:email', | ||||||
544 | |||||||
545 | '/channel/textInput', | ||||||
546 | '/channel/textInput/description', | ||||||
547 | '/channel/textInput/link', | ||||||
548 | '/channel/textInput/name', | ||||||
549 | '/channel/textInput/title', | ||||||
550 | '/channel/textinput', | ||||||
551 | '/channel/textinput/title', | ||||||
552 | '/channel/textinput/description', | ||||||
553 | '/channel/textinput/name', | ||||||
554 | '/channel/textinput/link', | ||||||
555 | |||||||
556 | '/channel/openSearch:totalResults', | ||||||
557 | '/channel/openSearch:startIndex', | ||||||
558 | '/channel/openSearch:itemsPerPage', | ||||||
559 | |||||||
560 | '/channel/item', | ||||||
561 | '/channel/item/source', | ||||||
562 | |||||||
563 | '/channel/item/twitter:source', | ||||||
564 | |||||||
565 | # something from radio free france | ||||||
566 | # eg. http://radiofrance-podcast.net/podcast09/rss_10193.xml | ||||||
567 | '/channel/item/podcastRF:businessReference', | ||||||
568 | |||||||
569 | # google documents stuff | ||||||
570 | '/channel/item/gd:extendedProperty', | ||||||
571 | |||||||
572 | # |
||||||
573 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
574 | 'channel/item/cb:statistics', | ||||||
575 | |||||||
576 | # |
||||||
577 | # eg. Fed Reserve http://www.federalreserve.gov/feeds/press_taf.xml | ||||||
578 | '/channel/item/cb:news', | ||||||
579 | |||||||
580 | # FIXME: |
||||||
581 | # Fed eg. http://www.federalreserve.gov/feeds/speeches.xml | ||||||
582 | '/channel/item/cb:speech', | ||||||
583 | |||||||
584 | # |
||||||
585 | # |
||||||
586 | # Fed eg. http://www.federalreserve.gov/feeds/ifdp.xml | ||||||
587 | # FIXME: except |
||||||
588 | '/channel/item/cb:paper', | ||||||
589 | '/channel/item/cb:event', | ||||||
590 | |||||||
591 | # |
||||||
592 | '/channel/item/media:hash', | ||||||
593 | |||||||
594 | # not sure what these are, but don't seem very interesting | ||||||
595 | '/channel/item/slate:slate_plus', # |
||||||
596 | '/channel/item/slate:paywall', # |
||||||
597 | '/channel/item/slate:sponsored', # |
||||||
598 | )} = (); | ||||||
599 | |||||||
600 | # weather | ||||||
601 | # '/channel/item/w:current', | ||||||
602 | # '/channel/item/w:forecast', | ||||||
603 | # '/channel/yweather:location', | ||||||
604 | # '/channel/yweather:units', | ||||||
605 | # '/channel/yweather:wind', | ||||||
606 | # '/channel/yweather:atmosphere', | ||||||
607 | # '/channel/yweather:astronomy', | ||||||
608 | # '/channel/item/yweather:condition', | ||||||
609 | # '/channel/item/yweather:forecast', | ||||||
610 | |||||||
611 | # --central-bank | ||||||
612 | # /channel/item/cb:statistics | ||||||
613 | # /channel/item/cb:statistics/cb:country | ||||||
614 | # /channel/item/cb:statistics/cb:institutionAbbrev | ||||||
615 | # /channel/item/cb:statistics/cb:exchangeRate | ||||||
616 | # /channel/item/cb:statistics/cb:exchangeRate/cb:value | ||||||
617 | # /channel/item/cb:statistics/cb:exchangeRate/cb:baseCurrency | ||||||
618 | # /channel/item/cb:statistics/cb:exchangeRate/cb:targetCurrency | ||||||
619 | # /channel/item/cb:statistics/cb:exchangeRate/cb:rateType | ||||||
620 | # /channel/item/cb:statistics/cb:exchangeRate/cb:observationPeriod | ||||||
621 | # /channel/item/cb:speech | ||||||
622 | # /channel/item/cb:speech/cb:simpleTitle | ||||||
623 | # /channel/item/cb:speech/cb:occurrenceDate | ||||||
624 | # /channel/item/cb:speech/cb:person | ||||||
625 | # /channel/item/cb:speech/cb:person/cb:givenName | ||||||
626 | # /channel/item/cb:speech/cb:person/cb:surname | ||||||
627 | # /channel/item/cb:speech/cb:person/cb:personalTitle | ||||||
628 | # /channel/item/cb:speech/cb:person/cb:nameAsWritten | ||||||
629 | # /channel/item/cb:speech/cb:person/cb:role | ||||||
630 | # /channel/item/cb:speech/cb:person/cb:role/cb:jobTitle | ||||||
631 | # /channel/item/cb:speech/cb:person/cb:role/cb:affiliation | ||||||
632 | # /channel/item/cb:speech/cb:venue | ||||||
633 | |||||||
634 | |||||||
635 | #------------------------------------------------------------------------------ | ||||||
636 | # dates | ||||||
637 | |||||||
638 | 2 | 2 | 1908 | use constant RFC822_STRFTIME_FORMAT => '%a, %d %b %Y %H:%M:%S %z'; | |||
2 | 2 | ||||||
2 | 2856 | ||||||
639 | |||||||
640 | # return a string which is current time in RFC 822 format | ||||||
641 | sub rfc822_time_now { | ||||||
642 | 0 | 0 | 0 | 0 | return POSIX::strftime (RFC822_STRFTIME_FORMAT, localtime(time())); | ||
643 | } | ||||||
644 | |||||||
645 | sub isodate_to_rfc822 { | ||||||
646 | 0 | 0 | 0 | 0 | my ($isodate) = @_; | ||
647 | 0 | 0 | 0 | if (! defined $isodate) { return undef; } | |||
0 | 0 | ||||||
648 | 0 | 0 | my $date = $isodate; # the original goes through if unrecognised | ||||
649 | |||||||
650 | 0 | 0 | 0 | 0 | if ($isodate =~ /\dT\d/ || $isodate =~ /^\d{4}-\d{2}-\d{2}$/) { | ||
651 | # eg. "2000-01-01T12:00+00:00" | ||||||
652 | # "2000-01-01T12:00:00Z" | ||||||
653 | # "2000-01-01" | ||||||
654 | 0 | 0 | 0 | my $zonestr = ($isodate =~ s/([+-][0-9][0-9]):([0-9][0-9])$// ? " $1$2" | |||
0 | |||||||
655 | : $isodate =~ s/Z$// ? ' +0000' | ||||||
656 | : ''); | ||||||
657 | 0 | 0 | require Date::Parse; | ||||
658 | 0 | 0 | my $time_t = Date::Parse::str2time($isodate); | ||||
659 | 0 | 0 | 0 | if (defined $time_t) { | |||
660 | 0 | 0 | $date = POSIX::strftime ("%a, %d %b %Y %H:%M:%S$zonestr", | ||||
661 | localtime ($time_t)); | ||||||
662 | } | ||||||
663 | } | ||||||
664 | 0 | 0 | return $date; | ||||
665 | } | ||||||
666 | |||||||
667 | # Return an RFC822 date string, or undef if nothing known. | ||||||
668 | # This gets a sensible sort-by-date in the newsreader. | ||||||
669 | # |
||||||
670 | # redundant. | ||||||
671 | # | ||||||
672 | sub item_to_date { | ||||||
673 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
674 | 0 | 0 | my $date; | ||||
675 | 0 | 0 | foreach my $elt ($item, elt_to_channel($item)) { | ||||
676 | 0 | 0 | 0 | $date = (non_empty ($elt->first_child_trimmed_text('pubDate')) | |||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
677 | // non_empty ($elt->first_child_trimmed_text('dc:date')) | ||||||
678 | // non_empty ($elt->first_child_trimmed_text('jf:creationDate')) | ||||||
679 | # Atom | ||||||
680 | // non_empty ($elt->first_child_trimmed_text('modified')) | ||||||
681 | // non_empty ($elt->first_child_trimmed_text('updated')) | ||||||
682 | // non_empty ($elt->first_child_trimmed_text('issued')) | ||||||
683 | // non_empty ($elt->first_child_trimmed_text('dcterms:issued')) | ||||||
684 | // non_empty ($elt->first_child_trimmed_text('created')) | ||||||
685 | # channel | ||||||
686 | // non_empty ($elt->first_child_trimmed_text('lastBuildDate')) | ||||||
687 | # Atom | ||||||
688 | // non_empty ($elt->first_child_trimmed_text('published')) | ||||||
689 | # from Nature have dc:date anyway | ||||||
690 | // non_empty ($elt->first_child_trimmed_text('prism:publicationDate')) | ||||||
691 | ); | ||||||
692 | 0 | 0 | 0 | last if defined $date; | |||
693 | } | ||||||
694 | 0 | 0 | return isodate_to_rfc822($date); | ||||
695 | } | ||||||
696 | @known{qw(/channel/dc:date | ||||||
697 | /channel/lastBuildDate | ||||||
698 | /channel/pubDate | ||||||
699 | /channel/updated | ||||||
700 | /channel/modified | ||||||
701 | |||||||
702 | /channel/item/dc:date | ||||||
703 | /channel/item/pubDate | ||||||
704 | /channel/item/updated | ||||||
705 | /channel/item/published | ||||||
706 | /channel/item/modified | ||||||
707 | /channel/item/created | ||||||
708 | /channel/item/issued | ||||||
709 | /channel/item/dcterms:issued | ||||||
710 | |||||||
711 | /channel/item/jf:creationDate --java-locale-human-readable | ||||||
712 | /channel/item/jf:modificationDate | ||||||
713 | /channel/item/jf:date --free-form | ||||||
714 | )} = (); | ||||||
715 | |||||||
716 | |||||||
717 | sub item_to_timet { | ||||||
718 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
719 | ### item_to_timet() ... | ||||||
720 | 0 | 0 | 0 | my $str = $self->item_to_date($item) | |||
721 | // return - POSIX::DBL_MAX(); # no date fields | ||||||
722 | |||||||
723 | 0 | 0 | require Date::Parse; | ||||
724 | ### $str | ||||||
725 | # print Date::Parse::str2time($str)," $str\n"; | ||||||
726 | return (Date::Parse::str2time($str) | ||||||
727 | 0 | 0 | 0 | // do { | |||
728 | say __x('Unrecognised date "{date}" from {url}', | ||||||
729 | date => $str, | ||||||
730 | 0 | 0 | url => $self->{'uri'}); | ||||
731 | 0 | 0 | - POSIX::DBL_MAX(); | ||||
732 | }); | ||||||
733 | } | ||||||
734 | |||||||
735 | #----------------------------------------------------------------------------- | ||||||
736 | # Message-ID | ||||||
737 | |||||||
738 | # Return a message ID for something at $uri, optionally uniquified by $str. | ||||||
739 | # $uri is either a URI object or a url string. | ||||||
740 | # Weird chars in $uri or $str are escaped as necessary. | ||||||
741 | # Secret $self->{'msgidextra'} can make different message ids for the same | ||||||
742 | # content when testing. | ||||||
743 | # | ||||||
744 | # The path from $uri is incorporated in the result. fetch_html() needs this | ||||||
745 | # since the ETag identifier is only per-url, not globally unique. Suspect | ||||||
746 | # fetch_rss() needs it for a guid too (a non-permaLink one), as think the | ||||||
747 | # guid is only unique within the particular $uri feed, not globally and not | ||||||
748 | # even across multiple feeds on the same server. | ||||||
749 | # | ||||||
750 | sub url_to_msgid { | ||||||
751 | 0 | 0 | 0 | 0 | my ($self, $url, $str) = @_; | ||
752 | |||||||
753 | 0 | 0 | my $host; | ||||
754 | 0 | 0 | my $pathbit = $url; | ||||
755 | |||||||
756 | 0 | 0 | 0 | if (my $uri = eval { URI->new($url) }) { | |||
0 | 0 | ||||||
757 | 0 | 0 | $uri = $uri->canonical; | ||||
758 | 0 | 0 | 0 | if ($uri->can('host')) { | |||
0 | |||||||
759 | 0 | 0 | $host = $uri->host; | ||||
760 | 0 | 0 | $uri->host(''); | ||||
761 | 0 | 0 | $pathbit = $uri->as_string; | ||||
762 | |||||||
763 | # If the $uri schema has a host part but it's empty or "localhost" | ||||||
764 | # then try expanding that to hostname(). | ||||||
765 | # | ||||||
766 | # $uri schemas without a host part, like "urn:" in an Atom |
||||||
767 | # get hostname(), since want the generated msgid to come out the same | ||||||
768 | # if such a urn: appears from different downloaded locations. | ||||||
769 | # | ||||||
770 | 0 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | ||
771 | 0 | 0 | require Sys::Hostname; | ||||
772 | 0 | 0 | eval { $host = Sys::Hostname::hostname() }; | ||||
0 | 0 | ||||||
773 | } | ||||||
774 | |||||||
775 | } elsif ($uri->can('authority')) { | ||||||
776 | # the "authority" part of a "tag:" schema | ||||||
777 | 0 | 0 | $host = $uri->authority; | ||||
778 | 0 | 0 | $uri->authority(''); | ||||
779 | 0 | 0 | $pathbit = $uri->as_string; | ||||
780 | } | ||||||
781 | } | ||||||
782 | |||||||
783 | # $host can be empty if running from a file:/// | ||||||
784 | # "localhost" is a bit bogus and in particular leafnode won't accept it. | ||||||
785 | # ".invalid" as per RFC 2606 | ||||||
786 | 0 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | ||
787 | 0 | 0 | $host = 'rss2leafnode.invalid'; | ||||
788 | } | ||||||
789 | |||||||
790 | # ipv6 dotted hostname "[1234:5678::0000]" -> "1234.5678.0000..ipv6", | ||||||
791 | # because [ and : are not allowed (RFC 2822 "Atom" atext) | ||||||
792 | # $uri->canonical above lower cases any hex, for consistency | ||||||
793 | 0 | 0 | 0 | if (($host =~ s/^\[|\]$//g) | ($host =~ tr/:/./)) { | |||
794 | 0 | 0 | $host .= '.ipv6'; | ||||
795 | } | ||||||
796 | |||||||
797 | # leafnode 2.0.0.alpha20070602a seems to insist on a "." in the host name | ||||||
798 | 0 | 0 | 0 | unless ($host =~ /\./) { | |||
799 | 0 | 0 | $host .= '.withadot'; | ||||
800 | } | ||||||
801 | |||||||
802 | return ('<' | ||||||
803 | . msgid_chars(join_non_empty('.', | ||||||
804 | 0 | 0 | "rss2leafnode" . $self->{'msgidextra'}, | ||||
805 | $pathbit, | ||||||
806 | $str)) | ||||||
807 | . '@' | ||||||
808 | . msgid_chars($host) | ||||||
809 | . '>'); | ||||||
810 | } | ||||||
811 | # msgid_chars($str) returns $str with invalid Message-ID characters munged. | ||||||
812 | # Per RFC850 must be printing ascii and not < > or whitespace, but for | ||||||
813 | # safety reduce that a bit, in particular excluding ' and ". | ||||||
814 | sub msgid_chars { | ||||||
815 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
816 | 0 | 0 | require URI::Escape; | ||||
817 | 0 | 0 | return URI::Escape::uri_escape_utf8 ($str, "^A-Za-z0-9\\-_.!~*/:"); | ||||
818 | } | ||||||
819 | |||||||
820 | #------------------------------------------------------------------------------ | ||||||
821 | # news posting | ||||||
822 | # | ||||||
823 | # This used to run the "rnews" program, which in leafnode 2 does some direct | ||||||
824 | # writing to the spool. But that requires user "news" perms, and as of the | ||||||
825 | # June 2007 leafnode beta it tends to be a good deal slower because it reads | ||||||
826 | # the whole groupinfo file. It has the advantage of not being picky about | ||||||
827 | # message ID hostnames, and allowing read-only groups to be filled. But | ||||||
828 | # apart from that plain POST seems much easier for being "server neutral". | ||||||
829 | # | ||||||
830 | # IHAVE instead of POST would be a possibility, when available, though POST | ||||||
831 | # is probably more accurate in the sense it's a new article coming into the | ||||||
832 | # news system. | ||||||
833 | # | ||||||
834 | # Net::NNTP looks at $ENV{NNTPSERVER}, $ENV{NEWSHOST} and Net::Config | ||||||
835 | # nntp_hosts list for the news server. Maybe could have that here too, | ||||||
836 | # instead of always defaulting to localhost (in $self->{'nntp_host'}). | ||||||
837 | # Would want to find out the name chosen to show in diagnostics though. | ||||||
838 | |||||||
839 | # return a string "host:port", suitable for the Host arg to Net::NNTP->new | ||||||
840 | sub uri_to_nntp_host { | ||||||
841 | 0 | 0 | 0 | 0 | my ($uri) = @_; | ||
842 | 0 | 0 | 0 | return (non_empty($uri->host) // 'localhost') . ':' . $uri->port; | |||
843 | } | ||||||
844 | |||||||
845 | sub nntp { | ||||||
846 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
847 | # reopen if different 'nntp_host' | ||||||
848 | 0 | 0 | 0 | 0 | if (! $self->{'nntp'} | ||
849 | || $self->{'nntp'}->host ne $self->{'nntp_host'}) { | ||||||
850 | 0 | 0 | my $host = $self->{'nntp_host'}; | ||||
851 | 0 | 0 | $self->verbose (1, __x("nntp: {host}", host => $host)); | ||||
852 | 0 | 0 | require Net::NNTP; | ||||
853 | my $nntp = $self->{'nntp'} | ||||||
854 | 0 | 0 | 0 | = Net::NNTP->new ($host, ($self->{'verbose'} >= 2 | |||
855 | ? (Debug => 1) | ||||||
856 | : ())); | ||||||
857 | 0 | 0 | 0 | if (! $nntp) { | |||
858 | 0 | 0 | croak __x("Cannot connect to NNTP on \"{host}\"\n", host => $host); | ||||
859 | } | ||||||
860 | 0 | 0 | 0 | if (! $nntp->postok) { | |||
861 | 0 | 0 | $self->verbose (1, "Hmm, ", $nntp->host, " doesn't say \"posting ok\" ..."); | ||||
862 | } | ||||||
863 | } | ||||||
864 | 0 | 0 | return $self->{'nntp'}; | ||||
865 | } | ||||||
866 | |||||||
867 | sub nntp_close { | ||||||
868 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
869 | 0 | 0 | 0 | if (my $nntp = delete $self->{'nntp'}) { | |||
870 | 0 | 0 | 0 | if (! $nntp->quit) { | |||
871 | 0 | 0 | say "Error closing nntp: ",$nntp->message; | ||||
872 | } | ||||||
873 | } | ||||||
874 | } | ||||||
875 | |||||||
876 | # check that $group exists in the NNTP, return 1 if so, or 0 if not | ||||||
877 | sub nntp_group_check { | ||||||
878 | 0 | 0 | 0 | 0 | my ($self, $group) = @_; | ||
879 | 0 | 0 | my $nntp = $self->nntp; | ||||
880 | 0 | 0 | 0 | if (! $nntp->group($group)) { | |||
881 | 0 | 0 | print __x("rss2leafnode: no group \"{group}\" on host \"{host}\" | ||||
882 | (See the rss2leafnode man page for notes on creating groups.) | ||||||
883 | ", | ||||||
884 | host => $nntp->host, | ||||||
885 | group => $group); | ||||||
886 | 0 | 0 | return 0; | ||||
887 | } | ||||||
888 | |||||||
889 | 0 | 0 | return 1; | ||||
890 | } | ||||||
891 | |||||||
892 | sub nntp_message_id_exists { | ||||||
893 | 0 | 0 | 0 | 0 | my ($self, $msgid) = @_; | ||
894 | 0 | 0 | my $ret = $self->nntp->nntpstat($msgid); | ||||
895 | 0 | 0 | 0 | if ($self->{'verbose'} >= 2) { | |||
0 | |||||||
896 | 0 | 0 | 0 | $self->verbose (2, "'$msgid' ", ($ret ? 'exists already' : 'new')); | |||
897 | } elsif ($self->{'verbose'} >= 1) { | ||||||
898 | 0 | 0 | 0 | if ($ret) { | |||
899 | 0 | 0 | $self->verbose (1, ' ', __('exists already')); | ||||
900 | } | ||||||
901 | } | ||||||
902 | 0 | 0 | return $ret; | ||||
903 | } | ||||||
904 | |||||||
905 | # post $msg to NNTP, return true if successful | ||||||
906 | sub nntp_post { | ||||||
907 | 0 | 0 | 0 | 0 | my ($self, $msg) = @_; | ||
908 | 0 | 0 | my $nntp = $self->nntp; | ||||
909 | 0 | 0 | 0 | if (! $nntp->post ($msg->as_string)) { | |||
910 | 0 | 0 | say __x('Cannot post: {message}', | ||||
911 | message => scalar($nntp->message)); | ||||||
912 | 0 | 0 | return 0; | ||||
913 | } | ||||||
914 | 0 | 0 | return 1; | ||||
915 | } | ||||||
916 | |||||||
917 | |||||||
918 | #------------------------------------------------------------------------------ | ||||||
919 | # HTML title | ||||||
920 | |||||||
921 | # extra data associated against a HTTP::Response object | ||||||
922 | Hash::Util::FieldHash::fieldhash (my %resp_exiftool_info); | ||||||
923 | |||||||
924 | # return hashref { Title => $str, ... }, or empty {} if no exiftool etc | ||||||
925 | sub resp_exiftool_info { | ||||||
926 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
927 | 0 | 0 | 0 | defined $resp or return {}; | |||
928 | 0 | 0 | 0 | if (! exists $resp_exiftool_info{$resp}) { | |||
929 | 0 | 0 | $resp_exiftool_info{$resp} = _resp_exiftool_info($resp); | ||||
930 | ### exiftool info: $resp_exiftool_info{$resp} | ||||||
931 | } | ||||||
932 | 0 | 0 | return $resp_exiftool_info{$resp}; | ||||
933 | } | ||||||
934 | sub _resp_exiftool_info { | ||||||
935 | 0 | 0 | 0 | my ($resp) = @_; | |||
936 | |||||||
937 | # Want ExifTool 8.22 to have PNG tEXt returned as utf8, but don't bother | ||||||
938 | # to enforce that. | ||||||
939 | # | ||||||
940 | # The returned fields from image formats with a defined charset are | ||||||
941 | # converted to the exiftool default "Charset" of utf8, and from other | ||||||
942 | # image formats the fields are bytes of something unknown. Might slightly | ||||||
943 | # like to know which is the case, and show raw bytes different from "bytes | ||||||
944 | # which ought to be utf8", but for now just Encode::decode_utf8() and let | ||||||
945 | # its Encode::FB_DEFAULT() put substitution chars for non-ascii non-utf8. | ||||||
946 | # | ||||||
947 | 0 | 0 | 0 | eval { require Image::ExifTool; 1 } || return {}; | |||
0 | 0 | ||||||
0 | 0 | ||||||
948 | 0 | 0 | $resp->decode; | ||||
949 | 0 | 0 | my $cref = $resp->content_ref; | ||||
950 | 0 | 0 | return Image::ExifTool::ImageInfo | ||||
951 | ($cref, | ||||||
952 | ['Title','Author','Copyright','ImageSize'], # just these tags | ||||||
953 | {List => 0}); # get list values as comma separated | ||||||
954 | } | ||||||
955 | |||||||
956 | # $resp is a HTTP::Response, return title | ||||||
957 | sub html_title { | ||||||
958 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
959 | |||||||
960 | return (# for images prefer filename+size over URI::Title just filename | ||||||
961 | 0 | 0 | 0 | non_empty (html_title_exiftool_image($resp)) | |||
0 | |||||||
0 | |||||||
962 | |||||||
963 | // non_empty (html_title_urititle($resp)) | ||||||
964 | // non_empty (html_title_exiftool($resp)) | ||||||
965 | // $resp->title); | ||||||
966 | } | ||||||
967 | sub html_title_urititle { | ||||||
968 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
969 | 0 | 0 | 0 | eval { require URI::Title } or return undef; | |||
0 | 0 | ||||||
970 | |||||||
971 | # suppress some dodginess in URI::Title 1.82 | ||||||
972 | local $SIG{'__WARN__'} = sub { | ||||||
973 | 0 | 0 | 0 | my ($msg) = @_; | |||
974 | 0 | 0 | 0 | $msg =~ /Use of uninitialized value/ or warn @_; | |||
975 | 0 | 0 | }; | ||||
976 | 0 | 0 | $resp->decode; | ||||
977 | 0 | 0 | 0 | return URI::Title::title | |||
978 | ({ url => ($resp->request->uri // ''), | ||||||
979 | data => $resp->content}); | ||||||
980 | } | ||||||
981 | sub html_title_exiftool_image { | ||||||
982 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
983 | 0 | 0 | 0 | $resp->content_type =~ m{^image/} or return; | |||
984 | 0 | 0 | 0 | if (defined (my $title = html_title_exiftool($resp))) { | |||
985 | 0 | 0 | return $title; | ||||
986 | } | ||||||
987 | 0 | 0 | 0 | my $info = resp_exiftool_info($resp) // return; | |||
988 | ### html_title_exiftool_image() on: $info | ||||||
989 | 0 | 0 | 0 | defined $info->{'ImageSize'} or return; | |||
990 | 0 | 0 | return $resp->filename.' '.$info->{'ImageSize'}; | ||||
991 | } | ||||||
992 | sub html_title_exiftool { | ||||||
993 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
994 | 0 | 0 | 0 | my $title = resp_exiftool_info($resp)->{'Title'} // return; | |||
995 | 0 | 0 | return Encode::decode_utf8 ($title); | ||||
996 | } | ||||||
997 | |||||||
998 | |||||||
999 | #------------------------------------------------------------------------------ | ||||||
1000 | # mime | ||||||
1001 | |||||||
1002 | # prepended to "X-Mailer" header | ||||||
1003 | 2 | 2 | 11 | use constant mime_mailer_extra => "RSS2Leafnode $VERSION"; | |||
2 | 3 | ||||||
2 | 2543 | ||||||
1004 | |||||||
1005 | # $body is a MIME::Body object, append $str to it | ||||||
1006 | sub mime_body_append { | ||||||
1007 | 0 | 0 | 0 | 0 | my ($body, $str) = @_; | ||
1008 | 0 | 0 | $str = $body->as_string . "\n" . str_ensure_newline ($str); | ||||
1009 | 0 | 0 | 0 | my $IO = $body->open('w') | |||
1010 | or die "rss2leafnode: body I/O open: $!"; | ||||||
1011 | 0 | 0 | $IO->print ($str); | ||||
1012 | 0 | 0 | 0 | $IO->close | |||
1013 | or die "rss2leafnode: body I/O close: $!"; | ||||||
1014 | } | ||||||
1015 | |||||||
1016 | # if $str is not ascii then apply encode_mimewords() | ||||||
1017 | sub mimewords_non_ascii { | ||||||
1018 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
1019 | 0 | 0 | 0 | 0 | if (defined $str && ! is_ascii($str)) { | ||
1020 | 0 | 0 | require MIME::Words; | ||||
1021 | 0 | 0 | $str = MIME::Words::encode_mimewords (Encode::encode_utf8($str), | ||||
1022 | Charset => 'UTF-8'); | ||||||
1023 | } | ||||||
1024 | 0 | 0 | return $str; | ||||
1025 | } | ||||||
1026 | |||||||
1027 | sub mime_build { | ||||||
1028 | 0 | 0 | 0 | 0 | my ($self, $headers, @args) = @_; | ||
1029 | |||||||
1030 | # Headers in utf-8, the same as other text. The docs of | ||||||
1031 | # encode_mimewords() isn't clear, but seems to expect bytes of the | ||||||
1032 | # specified charset. | ||||||
1033 | 0 | 0 | foreach my $key (sort keys %$headers) { | ||||
1034 | $headers->{$key} | ||||||
1035 | 0 | 0 | = mimewords_non_ascii(Text::Trim::trim($headers->{$key})); | ||||
1036 | } | ||||||
1037 | |||||||
1038 | 0 | 0 | %$headers = (%$headers, @args); | ||||
1039 | 0 | 0 | 0 | $headers->{'Top'} //= 0; # default to a part not a toplevel | |||
1040 | 0 | 0 | 0 | $headers->{'Encoding'} //= '-SUGGEST'; | |||
1041 | |||||||
1042 | 0 | 0 | 0 | if ($headers->{'Top'}) { | |||
1043 | 0 | 0 | my $now822 = rfc822_time_now(); | ||||
1044 | 0 | 0 | 0 | $headers->{'Date'} //= $now822; | |||
1045 | 0 | 0 | $headers->{'Date-Received:'} = $now822; | ||||
1046 | } | ||||||
1047 | |||||||
1048 | 0 | 0 | 0 | if (utf8::is_utf8($headers->{'Data'})) { | |||
1049 | 0 | 0 | warn 'Oops, mime_build() data should be bytes'; | ||||
1050 | } | ||||||
1051 | |||||||
1052 | # downgrade utf-8 to us-ascii if possible | ||||||
1053 | 0 | 0 | 0 | 0 | if ($headers->{'Type'} eq 'text/plain' | ||
0 | |||||||
0 | |||||||
1054 | && lc($headers->{'Charset'}||0) eq 'utf-8' | ||||||
1055 | && is_ascii ($headers->{'Data'})) { | ||||||
1056 | 0 | 0 | $headers->{'Charset'} = 'us-ascii'; | ||||
1057 | |||||||
1058 | # not sure mangling text/html body content is a good idea -- would only | ||||||
1059 | # want it on generated html, not downloaded | ||||||
1060 | # | ||||||
1061 | # if ($headers->{'Type'} eq 'text/html') { | ||||||
1062 | # $headers->{'Data'} =~ s{( | ||||||
1063 | # } | ||||||
1064 | } | ||||||
1065 | |||||||
1066 | 0 | 0 | @args = map {$_,$headers->{$_}} sort keys %$headers; | ||||
0 | 0 | ||||||
1067 | 0 | 0 | 0 | if ($self->{'verbose'} >= 4) { | |||
1068 | 0 | 0 | require Data::Dumper; | ||||
1069 | 0 | 0 | $self->verbose (4, Data::Dumper->new([\@args],['mime headers'])->Dump); | ||||
1070 | } | ||||||
1071 | |||||||
1072 | 0 | 0 | require MIME::Entity; | ||||
1073 | 0 | 0 | my $top = MIME::Entity->build (Disposition => 'inline', @args); | ||||
1074 | |||||||
1075 | 0 | 0 | 0 | 0 | if ($headers->{'Top'} && ! defined $headers->{'X-Mailer:'}) { | ||
1076 | 0 | 0 | my $head = $top->head; | ||||
1077 | 0 | 0 | $head->set('X-Mailer', join_non_empty (', ', | ||||
1078 | $self->mime_mailer_extra, | ||||||
1079 | $head->get('X-Mailer'))); | ||||||
1080 | } | ||||||
1081 | |||||||
1082 | 0 | 0 | return $top; | ||||
1083 | } | ||||||
1084 | |||||||
1085 | # $resp is a HTTP::Response | ||||||
1086 | # Return a MIME::Entity which contains the response, and any further @headers. | ||||||
1087 | # If $self->{'render'} is true then render HTML to plain text. | ||||||
1088 | # | ||||||
1089 | sub mime_part_from_response { | ||||||
1090 | 0 | 0 | 0 | 0 | my ($self, $resp, @headers) = @_; | ||
1091 | |||||||
1092 | 0 | 0 | my $content_type = $resp->content_type; | ||||
1093 | 0 | 0 | $self->verbose (2, ' content-type: ',$content_type); | ||||
1094 | 0 | 0 | $resp->decode; | ||||
1095 | 0 | 0 | my $content = $resp->content; # the bytes | ||||
1096 | 0 | 0 | my $charset = $resp->content_charset; # and their charset | ||||
1097 | 0 | 0 | my $url = $resp->request->uri->as_string; | ||||
1098 | 0 | 0 | my $content_md5 = $resp->header('Content-MD5'); | ||||
1099 | |||||||
1100 | 0 | 0 | ($content, $content_type, $charset, my $rendered) | ||||
1101 | = $self->render_maybe ($content, $content_type, $charset, $url); | ||||||
1102 | 0 | 0 | 0 | if ($rendered) { | |||
1103 | 0 | 0 | undef $content_md5; | ||||
1104 | } | ||||||
1105 | |||||||
1106 | 0 | 0 | return $self->mime_build | ||||
1107 | ({ 'Content-Language:' => scalar($resp->header('Content-Language')), | ||||||
1108 | 'Content-Location:' => $url, | ||||||
1109 | 'Content-MD5:' => $content_md5, | ||||||
1110 | @headers, | ||||||
1111 | }, | ||||||
1112 | Type => $content_type, | ||||||
1113 | Charset => $charset, | ||||||
1114 | Data => $content, | ||||||
1115 | Filename => $resp->filename); | ||||||
1116 | } | ||||||
1117 | |||||||
1118 | |||||||
1119 | # set "Lines:" header per RFC 1036 | ||||||
1120 | # MIME::Entity 5.428 doesn't seem to have anything for this itself | ||||||
1121 | # this is after qp or base64, is that right? the actual message lines | ||||||
1122 | sub mime_entity_lines { | ||||||
1123 | 0 | 0 | 0 | 0 | my ($top) = @_; | ||
1124 | 0 | 0 | $top->head->set('Lines', str_count_lines ($top->stringify_body)); | ||||
1125 | } | ||||||
1126 | |||||||
1127 | #------------------------------------------------------------------------------ | ||||||
1128 | # XML::Twig stuff | ||||||
1129 | |||||||
1130 | # Return the text of $elt and treat child elements as improperly escaped | ||||||
1131 | # parts of the text too. | ||||||
1132 | # | ||||||
1133 | # This is good for elements which are supposed to be HTML with etc |
||||||
1134 | # escaped as <p>, but copes with feeds that don't have the necessary | ||||||
1135 | # escapes and thus come out with xml child elements under $elt. | ||||||
1136 | # | ||||||
1137 | # For elements which are supposed to be plain text with no markup and no | ||||||
1138 | # sub-elements this will at least make improper child text visible, though | ||||||
1139 | # it might not look very good. | ||||||
1140 | # | ||||||
1141 | # As of June 2010 http://www.drweil.com/drw/ecs/rss.xml is an example of | ||||||
1142 | # improperly escaped html. | ||||||
1143 | # | ||||||
1144 | # FIXME: Any need to watch out for |
||||||
1145 | # | ||||||
1146 | sub elt_subtext { | ||||||
1147 | 0 | 0 | 0 | 0 | my ($elt) = @_; | ||
1148 | 0 | 0 | 0 | defined $elt or return undef; | |||
1149 | 0 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | |||
0 | 0 | ||||||
1150 | 0 | 0 | return join ('', map {_elt_subtext_with_tags($_)} $elt->children); | ||||
0 | 0 | ||||||
1151 | } | ||||||
1152 | sub _elt_subtext_with_tags { | ||||||
1153 | 0 | 0 | 0 | my ($elt) = @_; | |||
1154 | 0 | 0 | 0 | defined $elt or return undef; | |||
1155 | 0 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | |||
0 | 0 | ||||||
1156 | return ($elt->start_tag | ||||||
1157 | 0 | 0 | . join ('', map {_elt_subtext_with_tags($_)} $elt->children) | ||||
0 | 0 | ||||||
1158 | . $elt->end_tag); | ||||||
1159 | } | ||||||
1160 | |||||||
1161 | # $elt contains xhtml etc sub-elements. Return a plain html string. |
||||||
1162 | # Prefixes like |
||||||
1163 | # This relies on the map_xmlns mapping to give prefix "xhtml:" | ||||||
1164 | # | ||||||
1165 | sub elt_xhtml_to_html { | ||||||
1166 | 0 | 0 | 0 | 0 | my ($elt) = @_; | ||
1167 | |||||||
1168 | # could probably do it destructively, but just in case | ||||||
1169 | 0 | 0 | $elt = $elt->copy; | ||||
1170 | 0 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($elt, 'xhtml'); | ||||
1171 | |||||||
1172 | # lose xmlns:xhtml="http://www.w3.org/1999/xhtml" | ||||||
1173 | 0 | 0 | $elt->strip_att('xmlns:xhtml'); | ||||
1174 | |||||||
1175 | # something fishy turns "href" to "xhtml:href", drop any "xhtml:" | ||||||
1176 | # bare "href" also gets turned into atom:href as the default namespace, | ||||||
1177 | # drop any "atom:" | ||||||
1178 | 0 | 0 | foreach my $child ($elt->descendants) { | ||||
1179 | 0 | 0 | foreach my $attname ($child->att_names) { | ||||
1180 | 0 | 0 | 0 | if ($attname =~ /^(xhtml|atom):(.*)/) { | |||
1181 | 0 | 0 | $child->change_att_name($attname, $2); | ||||
1182 | } | ||||||
1183 | } | ||||||
1184 | } | ||||||
1185 | |||||||
1186 | 0 | 0 | my $old_pretty = $elt->set_pretty_print ('none'); | ||||
1187 | ### $old_pretty | ||||||
1188 | 0 | 0 | my $ret = $elt->xml_string; | ||||
1189 | 0 | 0 | $elt->set_pretty_print ($old_pretty); | ||||
1190 | 0 | 0 | return $ret; | ||||
1191 | |||||||
1192 | } | ||||||
1193 | |||||||
1194 | # elt_content_type() returns 'text', 'html', 'xhtml' or a mime type. | ||||||
1195 | # If no type="" attribute the default is 'text', except for RSS | ||||||
1196 | # |
||||||
1197 | # | ||||||
1198 | # RSS http://www.debian.org/News/weekly/dwn.en.rdf circa Feb 2010 had some | ||||||
1199 | # html in its |
||||||
1200 | # plain text) and that RSS is all plain text outside |
||||||
1201 | # | ||||||
1202 | # |
||||||
1203 | # not the formatting as html vs text. | ||||||
1204 | # | ||||||
1205 | @known{'/channel/item/dc:type'} = undef; | ||||||
1206 | # | ||||||
1207 | sub elt_content_type { | ||||||
1208 | 0 | 0 | 0 | 0 | my ($elt) = @_; | ||
1209 | 0 | 0 | 0 | if (! defined $elt) { return undef; } | |||
0 | 0 | ||||||
1210 | |||||||
1211 | 0 | 0 | 0 | 0 | if (defined (my $type = ($elt->att('atom:type') // $elt->att('type')))) { | ||
1212 | # type="application/xhtml+xml" at http://xmltwig.com/blog/index.atom, | ||||||
1213 | # dunno if it should be just "xhtml", but recognise it anyway | ||||||
1214 | 0 | 0 | 0 | if ($type eq 'application/xhtml+xml') { return 'xhtml'; } | |||
0 | 0 | ||||||
1215 | 0 | 0 | return $type; | ||||
1216 | } | ||||||
1217 | 0 | 0 | 0 | if ($elt->root->tag eq 'feed') { | |||
1218 | 0 | 0 | return 'text'; # Atom |
||||
1219 | } | ||||||
1220 | 0 | 0 | my $tag = $elt->tag; | ||||
1221 | 0 | 0 | 0 | if ($tag =~ /^itunes:/) { | |||
1222 | # itunes spec is for text-only, no html markup | ||||||
1223 | 0 | 0 | return 'text'; | ||||
1224 | } | ||||||
1225 | 0 | 0 | 0 | 0 | if ($tag eq 'description' # RSS |
||
1226 | || $tag eq 'content:encoded') { # same in content:encoded | ||||||
1227 | 0 | 0 | return 'html'; | ||||
1228 | } | ||||||
1229 | # other RSS is text | ||||||
1230 | 0 | 0 | return 'text'; | ||||
1231 | } | ||||||
1232 | |||||||
1233 | # $elt is an XML::Twig::Elt of an RSS or Atom text element. | ||||||
1234 | # Atom has a type="" attribute, RSS is html. Html or xhtml are rendered to | ||||||
1235 | # a single long line of plain text. | ||||||
1236 | # | ||||||
1237 | sub elt_to_rendered_line { | ||||||
1238 | 0 | 0 | 0 | 0 | my ($elt) = @_; | ||
1239 | 0 | 0 | 0 | defined $elt or return; | |||
1240 | |||||||
1241 | 0 | 0 | my $str; | ||||
1242 | 0 | 0 | my $type = elt_content_type ($elt); | ||||
1243 | 0 | 0 | 0 | if ($type eq 'xhtml') { | |||
1244 | 0 | 0 | $str = elt_xhtml_to_html ($elt); | ||||
1245 | 0 | 0 | $type = 'html'; | ||||
1246 | } else { | ||||||
1247 | 0 | 0 | $str = elt_subtext($elt); | ||||
1248 | } | ||||||
1249 | 0 | 0 | 0 | if ($type eq 'html') { | |||
1250 | 0 | 0 | $str = html_to_rendered_line($str); | ||||
1251 | } | ||||||
1252 | # plain 'text' or anything unrecognised collapsed too | ||||||
1253 | 0 | 0 | return non_empty(collapse_whitespace($str)); | ||||
1254 | } | ||||||
1255 | |||||||
1256 | sub html_to_rendered_line { | ||||||
1257 | 0 | 0 | 0 | 0 | my ($html) = @_; | ||
1258 | 0 | 0 | require HTML::FormatText; | ||||
1259 | 0 | 0 | return collapse_whitespace | ||||
1260 | (HTML::FormatText->format_string ($html, | ||||||
1261 | leftmargin => 0, | ||||||
1262 | rightmargin => 999)); | ||||||
1263 | } | ||||||
1264 | |||||||
1265 | |||||||
1266 | #------------------------------------------------------------------------------ | ||||||
1267 | # XML::RSS::Timing | ||||||
1268 | |||||||
1269 | sub twig_to_timingfields { | ||||||
1270 | 0 | 0 | 0 | 0 | my ($self, $twig) = @_; | ||
1271 | 0 | 0 | 0 | return if ! defined $twig; | |||
1272 | 0 | 0 | my $root = $twig->root; | ||||
1273 | 0 | 0 | my %timingfields; | ||||
1274 | |||||||
1275 | 0 | 0 | 0 | if (my $ttl = $root->first_descendant('ttl')) { | |||
1276 | 0 | 0 | $timingfields{'ttl'} = $ttl->trimmed_text; | ||||
1277 | } | ||||||
1278 | 0 | 0 | 0 | if (my $skipHours = $root->first_descendant('skipHours')) { | |||
1279 | 0 | 0 | $timingfields{'skipHours'} = [map {$_->trimmed_text} $skipHours->children('hour')]; | ||||
0 | 0 | ||||||
1280 | } | ||||||
1281 | 0 | 0 | 0 | if (my $skipDays = $root->first_descendant('skipDays')) { | |||
1282 | 0 | 0 | $timingfields{'skipDays'} = [map {$_->trimmed_text} $skipDays->children('day')]; | ||||
0 | 0 | ||||||
1283 | } | ||||||
1284 | |||||||
1285 | # "syn:updatePeriod" etc | ||||||
1286 | 0 | 0 | foreach my $key (qw(updatePeriod updateFrequency updateBase)) { | ||||
1287 | 0 | 0 | 0 | if (my $update = $root->first_descendant("syn:$key")) { | |||
1288 | 0 | 0 | $timingfields{$key} = $update->trimmed_text; | ||||
1289 | } | ||||||
1290 | } | ||||||
1291 | 0 | 0 | 0 | if ($self->{'verbose'} >= 2) { | |||
1292 | 0 | 0 | require Data::Dumper; | ||||
1293 | 0 | 0 | $self->verbose (2, | ||||
1294 | Data::Dumper->new([\%timingfields],['timingfields']) | ||||||
1295 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
1296 | } | ||||||
1297 | 0 | 0 | 0 | if (! %timingfields) { | |||
1298 | 0 | 0 | return; # no info | ||||
1299 | } | ||||||
1300 | |||||||
1301 | # if XML::RSS::Timing doesn't like the values then don't record them | ||||||
1302 | 0 | 0 | 0 | return unless $self->timingfields_to_timing(\%timingfields); | |||
1303 | |||||||
1304 | 0 | 0 | return \%timingfields; | ||||
1305 | } | ||||||
1306 | @known{qw(/channel/skipDays | ||||||
1307 | /channel/skipDays/day | ||||||
1308 | /channel/skipHours | ||||||
1309 | /channel/skipHours/hour | ||||||
1310 | /channel/ttl | ||||||
1311 | /channel/syn:updateBase | ||||||
1312 | /channel/syn:updatePeriod | ||||||
1313 | /channel/syn:updateFrequency)} = (); | ||||||
1314 | |||||||
1315 | # return an XML::RSS::Timing object, or undef | ||||||
1316 | sub timingfields_to_timing { | ||||||
1317 | 0 | 0 | 0 | 0 | my ($self, $timingfields) = @_; | ||
1318 | 0 | 0 | 0 | $timingfields // return undef; | |||
1319 | |||||||
1320 | 0 | 0 | 0 | eval { require XML::RSS::Timing } || return undef; | |||
0 | 0 | ||||||
1321 | 0 | 0 | my $timing = XML::RSS::Timing->new; | ||||
1322 | 0 | 0 | $timing->use_exceptions(0); | ||||
1323 | 0 | 0 | while (my ($key, $value) = each %$timingfields) { | ||||
1324 | 0 | 0 | 0 | if (ref $value) { | |||
1325 | 0 | 0 | $timing->$key (@$value); | ||||
1326 | } else { | ||||||
1327 | 0 | 0 | $timing->$key ($value); | ||||
1328 | } | ||||||
1329 | } | ||||||
1330 | 0 | 0 | 0 | if (my @complaints = $timing->complaints) { | |||
1331 | say __x('XML::RSS::Timing complains about {url}', | ||||||
1332 | 0 | 0 | url => $self->{'uri'}); | ||||
1333 | 0 | 0 | foreach my $complaint (@complaints) { | ||||
1334 | 0 | 0 | say " $complaint"; | ||||
1335 | } | ||||||
1336 | 0 | 0 | return undef; | ||||
1337 | } | ||||||
1338 | 0 | 0 | return $timing; | ||||
1339 | } | ||||||
1340 | |||||||
1341 | |||||||
1342 | #------------------------------------------------------------------------------ | ||||||
1343 | # rss2leafnode.status file | ||||||
1344 | |||||||
1345 | # $self->{'global_status'} is a hashref containing entries URL => STATUS, | ||||||
1346 | # where URL is a string and STATUS is a sub-hashref of information | ||||||
1347 | |||||||
1348 | 2 | 2 | 10 | use constant STATUS_EXPIRE_DAYS => 45; | |||
2 | 2 | ||||||
2 | 4714 | ||||||
1349 | |||||||
1350 | # read $status_filename into $self->{'global_status'} | ||||||
1351 | sub status_read { | ||||||
1352 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
1353 | 0 | 0 | $self->{'global_status'} = {}; | ||||
1354 | 0 | 0 | my $status_filename = $self->status_filename; | ||||
1355 | 0 | 0 | $self->verbose (2, 'read status: ', $status_filename); | ||||
1356 | |||||||
1357 | 0 | 0 | $! = 0; | ||||
1358 | 0 | 0 | my $global_status = do $status_filename; | ||||
1359 | 0 | 0 | 0 | if (! defined $global_status) { | |||
1360 | 0 | 0 | 0 | if ($! == POSIX::ENOENT()) { | |||
1361 | 0 | 0 | $self->verbose (2, "status file doesn't exist"); | ||||
1362 | } else { | ||||||
1363 | 0 | 0 | say "rss2leafnode: error in $status_filename\n$@"; | ||||
1364 | 0 | 0 | say "ignoring that file"; | ||||
1365 | } | ||||||
1366 | 0 | 0 | $global_status = {}; | ||||
1367 | } | ||||||
1368 | 0 | 0 | $self->{'global_status'} = $global_status; | ||||
1369 | } | ||||||
1370 | |||||||
1371 | # delete old entries from $self->{'global_status'} | ||||||
1372 | sub status_prune { | ||||||
1373 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
1374 | 0 | 0 | 0 | my $global_status = $self->{'global_status'} // return; | |||
1375 | 0 | 0 | my $pruned = 0; | ||||
1376 | 0 | 0 | my $old_time = time() - STATUS_EXPIRE_DAYS * 86400; | ||||
1377 | 0 | 0 | foreach my $key (keys %$global_status) { | ||||
1378 | 0 | 0 | 0 | if ($global_status->{$key}->{'status-time'} < $old_time) { | |||
1379 | 0 | 0 | $self->verbose (2, __x("discard old status {url}\n", url => $key)); | ||||
1380 | 0 | 0 | delete $global_status->{$key}; | ||||
1381 | 0 | 0 | $pruned++; | ||||
1382 | } | ||||||
1383 | } | ||||||
1384 | 0 | 0 | 0 | if ($pruned) { | |||
1385 | 0 | 0 | $self->verbose (1, __xn("discard {count} old status entry\n", | ||||
1386 | "discard {count} old status entries\n", | ||||||
1387 | $pruned, | ||||||
1388 | count => $pruned)); | ||||||
1389 | } | ||||||
1390 | } | ||||||
1391 | |||||||
1392 | # save $self->{'global_status'} into the $status_filename | ||||||
1393 | sub status_save { | ||||||
1394 | 0 | 0 | 0 | 0 | my ($self, $status) = @_; | ||
1395 | 0 | 0 | $status->{'status-time'} = time(); | ||||
1396 | 0 | 0 | 0 | if ($status->{'timingfields'}) { | |||
1397 | 0 | 0 | $status->{'timingfields'}->{'lastPolled'} = $status->{'status-time'}; | ||||
1398 | } | ||||||
1399 | |||||||
1400 | 0 | 0 | $self->status_prune; | ||||
1401 | |||||||
1402 | 0 | 0 | require Data::Dumper; | ||||
1403 | 0 | 0 | my $str = Data::Dumper->new([$self->{'global_status'}],['global_status']) | ||||
1404 | ->Indent(1)->Sortkeys(1)->Terse(1)->Useqq(1)->Dump; | ||||||
1405 | 0 | 0 | $str = <<"HERE"; | ||||
1406 | # rss2leafnode status file -- automatically generated -- DO NOT EDIT | ||||||
1407 | # | ||||||
1408 | # (If there seems to be something very wrong then you can delete this file | ||||||
1409 | # and it'll be started afresh on the next run.) | ||||||
1410 | |||||||
1411 | $str | ||||||
1412 | |||||||
1413 | |||||||
1414 | # Local variables: | ||||||
1415 | # mode: perl-mode | ||||||
1416 | # End: | ||||||
1417 | HERE | ||||||
1418 | |||||||
1419 | 0 | 0 | my $status_filename = $self->status_filename; | ||||
1420 | 0 | 0 | my $out; | ||||
1421 | 0 | 0 | 0 | 0 | (open $out, '>', $status_filename | ||
0 | |||||||
1422 | and print $out $str | ||||||
1423 | and close $out) | ||||||
1424 | or croak "rss2leafnode: cannot write to $status_filename: $!\n"; | ||||||
1425 | } | ||||||
1426 | |||||||
1427 | # return a hashref which has status information about $url, or undef if | ||||||
1428 | # nothing recorded about $url | ||||||
1429 | sub status_geturl { | ||||||
1430 | 0 | 0 | 0 | 0 | my ($self, $url) = @_; | ||
1431 | 0 | 0 | 0 | $self->status_read if ! $self->{'global_status'}; | |||
1432 | 0 | 0 | 0 | if (! $self->{'global_status'}->{$url}) { | |||
1433 | 0 | 0 | $self->{'global_status'}->{$url} = { 'status-time' => time() }; | ||||
1434 | } | ||||||
1435 | 0 | 0 | return $self->{'global_status'}->{$url}; | ||||
1436 | } | ||||||
1437 | |||||||
1438 | # $resp is a HTTP::Response object from retrieving $url. | ||||||
1439 | # Optional $twig is an XML::Twig. | ||||||
1440 | # Record against $url any ETag, Last-Modified and ttl from $resp and $twig. | ||||||
1441 | # If $resp is an error return, or is undef, then do nothing. | ||||||
1442 | sub status_etagmod_resp { | ||||||
1443 | 0 | 0 | 0 | 0 | my ($self, $url, $resp, $twig) = @_; | ||
1444 | 0 | 0 | 0 | 0 | if ($resp && $resp->is_success) { | ||
1445 | 0 | 0 | my $status = $self->status_geturl ($url); | ||||
1446 | 0 | 0 | $status->{'Last-Modified'} = $resp->header('Last-Modified'); | ||||
1447 | 0 | 0 | $status->{'ETag'} = $resp->header('ETag'); | ||||
1448 | 0 | 0 | $status->{'timingfields'} = $self->twig_to_timingfields ($twig); | ||||
1449 | |||||||
1450 | 0 | 0 | 0 | 0 | if (! defined $status->{'ETag'} && ! defined $status->{'Last-Modified'}) { | ||
1451 | 0 | 0 | $self->verbose (1, " no ETag or Last-Modified"); | ||||
1452 | } | ||||||
1453 | 0 | 0 | 0 | if (defined (my $comments_count = $self->{'comments_count'})) { | |||
1454 | 0 | 0 | $status->{'comments_count'} = $comments_count; | ||||
1455 | } | ||||||
1456 | |||||||
1457 | 0 | 0 | 0 | if ($twig) { | |||
1458 | # record previously applied newest option | ||||||
1459 | 0 | 0 | $status->{'rss_newest_only'} = $self->{'rss_newest_only'}; | ||||
1460 | |||||||
1461 | # if (rss_newest_cmp($self,$status) > 0) { | ||||||
1462 | # # the newest number increases | ||||||
1463 | # } | ||||||
1464 | } | ||||||
1465 | 0 | 0 | foreach my $key (keys %$status) { | ||||
1466 | 0 | 0 | 0 | if (! defined $status->{$key}) { delete $status->{$key} } | |||
0 | 0 | ||||||
1467 | } | ||||||
1468 | 0 | 0 | $self->status_save($status); | ||||
1469 | } | ||||||
1470 | } | ||||||
1471 | |||||||
1472 | # update recorded status for a $url with unchanged contents | ||||||
1473 | sub status_unchanged { | ||||||
1474 | 0 | 0 | 0 | 0 | my ($self, $url) = @_; | ||
1475 | 0 | 0 | $self->verbose (1, ' ', __('unchanged')); | ||||
1476 | 0 | 0 | $self->status_save ($self->status_geturl ($url)); | ||||
1477 | } | ||||||
1478 | |||||||
1479 | # $req is a HTTP::Request object. | ||||||
1480 | # Add "If-None-Match" and/or "If-Modified-Since" headers to it based on what | ||||||
1481 | # the status file has recorded from when we last fetched the url in $req. | ||||||
1482 | # Return 1 to download, 0 if nothing expected yet by RSS timing fields | ||||||
1483 | # | ||||||
1484 | sub status_etagmod_req { | ||||||
1485 | 0 | 0 | 0 | 0 | my ($self, $req, $for_rss) = @_; | ||
1486 | 0 | 0 | 0 | $self->{'global_status'} or $self->status_read; | |||
1487 | |||||||
1488 | 0 | 0 | my $url = $req->uri->as_string; | ||||
1489 | my $status = $self->{'global_status'}->{$url} | ||||||
1490 | 0 | 0 | 0 | // do { | |||
1491 | 0 | 0 | $self->verbose (2, __x("no status info for {url}\n", url => $url)); | ||||
1492 | 0 | 0 | return 1; # want download | ||||
1493 | }; | ||||||
1494 | |||||||
1495 | 0 | 0 | 0 | if ($for_rss) { | |||
1496 | # if status says the last download was for only a certain number of | ||||||
1497 | # newest, then force a re-download if that option now different | ||||||
1498 | 0 | 0 | 0 | if (! str_equal($self->{'rss_newest_only'}, | |||
1499 | $status->{'rss_newest_only'})) { | ||||||
1500 | 0 | 0 | return 1; # want download | ||||
1501 | } | ||||||
1502 | } | ||||||
1503 | |||||||
1504 | 0 | 0 | 0 | if (my $timing = $self->timingfields_to_timing ($status->{'timingfields'})) { | |||
1505 | 0 | 0 | my $next = $timing->nextUpdate; | ||||
1506 | 0 | 0 | my $now = time(); | ||||
1507 | 0 | 0 | 0 | if ($next > $now) { | |||
1508 | 0 | 0 | $self->verbose (1, ' ', | ||||
1509 | __x('timing: next update {time} (local time)', | ||||||
1510 | time => POSIX::strftime ("%H:%M:%S %a %d %b %Y", | ||||||
1511 | localtime($next)))); | ||||||
1512 | 0 | 0 | 0 | 0 | if (eval 'use Time::Duration::Locale; 1' | ||
1513 | || eval 'use Time::Duration; 1') { | ||||||
1514 | 0 | 0 | $self->verbose (1, ' ', __x('which is {duration} from now', | ||||
1515 | duration => duration($next-$now))); | ||||||
1516 | } | ||||||
1517 | 0 | 0 | return 0; # no update yet | ||||
1518 | } | ||||||
1519 | } | ||||||
1520 | 0 | 0 | 0 | if (defined (my $lastmod = $status->{'Last-Modified'})) { | |||
1521 | 0 | 0 | $req->header('If-Modified-Since' => $lastmod); | ||||
1522 | } | ||||||
1523 | 0 | 0 | 0 | if (defined (my $etag = $status->{'ETag'})) { | |||
1524 | 0 | 0 | $req->header('If-None-Match' => $etag); | ||||
1525 | } | ||||||
1526 | 0 | 0 | return 1; | ||||
1527 | } | ||||||
1528 | |||||||
1529 | # return -1 if x |
||||||
1530 | # sub rss_newest_cmp { | ||||||
1531 | # my ($x, $y) = @_; | ||||||
1532 | # if ($x->{'rss_newest_only'}) { | ||||||
1533 | # if (! $y->{'rss_newest_only'}) { | ||||||
1534 | # return -1; # x finite, y infinite | ||||||
1535 | # } | ||||||
1536 | # # x and y finite | ||||||
1537 | # return ($x->{'rss_newest_only'} <=> $y->{'rss_newest_only'}); | ||||||
1538 | # } else { | ||||||
1539 | # # x infinite, so 1 if y finite, 0 if y infinite too | ||||||
1540 | # return !! $y->{'rss_newest_only'}; | ||||||
1541 | # } | ||||||
1542 | # } | ||||||
1543 | sub str_equal { | ||||||
1544 | 0 | 0 | 0 | 0 | my ($x, $y) = @_; | ||
1545 | 0 | 0 | 0 | return ((defined $x && defined $y && $x eq $y) | |||
1546 | || (! defined $x && ! defined $y)); | ||||||
1547 | } | ||||||
1548 | |||||||
1549 | #------------------------------------------------------------------------------ | ||||||
1550 | # render html | ||||||
1551 | |||||||
1552 | # $content_type is a string like "text/html" or "text/plain". | ||||||
1553 | # $content is data as raw bytes. | ||||||
1554 | # $charset is the character set of those bytes, eg. "utf-8". | ||||||
1555 | # | ||||||
1556 | # If the $render option is set, and $content_type is 'text/html', then | ||||||
1557 | # render $content down to 'text/plain', using either HTML::FormatText or | ||||||
1558 | # Lynx. | ||||||
1559 | # The return is a new triplet ($content, $content_type, $charset). | ||||||
1560 | # | ||||||
1561 | sub render_maybe { | ||||||
1562 | 0 | 0 | 0 | 0 | my ($self, $content, $content_type, $charset, $base_url) = @_; | ||
1563 | 0 | 0 | my $rendered = 0; | ||||
1564 | 0 | 0 | 0 | 0 | if ($self->{'render'} && $content_type eq 'text/html') { | ||
1565 | |||||||
1566 | 0 | 0 | my $class = $self->{'render'}; | ||||
1567 | 0 | 0 | 0 | if ($class !~ /^HTML::/) { $class = "HTML::FormatText::\u$class"; } | |||
0 | 0 | ||||||
1568 | 0 | 0 | $class =~ s/::1$//; # "::1" is $render=1 for plain HTML::FormatText | ||||
1569 | 0 | 0 | require Module::Load; | ||||
1570 | 0 | 0 | Module::Load::load ($class); | ||||
1571 | |||||||
1572 | # decode() can error out on bad charset. | ||||||
1573 | 0 | 0 | 0 | unless (eval { $content = Encode::decode ($charset, $content); 1; }) { | |||
0 | 0 | ||||||
0 | 0 | ||||||
1574 | 0 | 0 | print __x(" oops, cannot decode {charset}: {error}\n", | ||||
1575 | charset => $charset, | ||||||
1576 | error => $@); | ||||||
1577 | } | ||||||
1578 | |||||||
1579 | # HTML::FormatText (as of version 2.04) doesn't do anything about input | ||||||
1580 | # or output charsets but putting wide chars through gives reasonable | ||||||
1581 | # results. Likewise HTML::FormatText::WithLinks (as of its version | ||||||
1582 | # 0.11). The HTML::FormatExternal modules version 23 up have wide char | ||||||
1583 | # input and output. | ||||||
1584 | { | ||||||
1585 | 0 | 0 | local $SIG{'__WARN__'} = \&_warn_suppress_unknown_configure_option; | ||||
0 | 0 | ||||||
1586 | $content = $class->format_string | ||||||
1587 | ($content, | ||||||
1588 | base => $base_url, | ||||||
1589 | doc_overrides_base => 1, # for HTML::FormatText::WithLinks | ||||||
1590 | leftmargin => 0, | ||||||
1591 | 0 | 0 | rightmargin => $self->{'render_width'}); | ||||
1592 | } | ||||||
1593 | # $content is wide chars, go to utf-8 bytes | ||||||
1594 | |||||||
1595 | 0 | 0 | $content = Encode::encode_utf8 ($content); | ||||
1596 | 0 | 0 | $charset = 'UTF-8'; | ||||
1597 | 0 | 0 | $content_type = 'text/plain'; | ||||
1598 | 0 | 0 | $rendered = 1; | ||||
1599 | } | ||||||
1600 | 0 | 0 | return ($content, $content_type, $charset, $rendered); | ||||
1601 | } | ||||||
1602 | # HTML::FormatText emits "Unknown configure option" for an option key it | ||||||
1603 | # doesn't know. Would probably prefer it to quietly ignore. | ||||||
1604 | sub _warn_suppress_unknown_configure_option { | ||||||
1605 | 0 | 0 | 0 | my ($msg) = @_; | |||
1606 | 0 | 0 | 0 | $msg =~ /^Unknown configure option/ | |||
1607 | or warn $msg; | ||||||
1608 | } | ||||||
1609 | |||||||
1610 | # $str is a wide-char string of text | ||||||
1611 | sub text_wrap { | ||||||
1612 | 0 | 0 | 0 | 0 | my ($self, $str, $prefix) = @_; | ||
1613 | 0 | 0 | 0 | if (! defined $prefix) { $prefix = ''; } | |||
0 | 0 | ||||||
1614 | 0 | 0 | require Text::WrapI18N; | ||||
1615 | 0 | 0 | local $Text::WrapI18N::columns = $self->{'render_width'} + 1; | ||||
1616 | 0 | 0 | local $Text::WrapI18N::unexpand = 0; # no tabs in output | ||||
1617 | 0 | 0 | local $Text::WrapI18N::huge = 'overflow'; # don't break long words | ||||
1618 | 0 | 0 | $str =~ tr/\n/ /; | ||||
1619 | 0 | 0 | 0 | my $second_prefix = (length($prefix) ? ' 'x(length($prefix)+2) : ''); | |||
1620 | 0 | 0 | return Text::WrapI18N::wrap($prefix, $second_prefix, $str); | ||||
1621 | } | ||||||
1622 | |||||||
1623 | #------------------------------------------------------------------------------ | ||||||
1624 | # Face icons | ||||||
1625 | |||||||
1626 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1627 | # return a string value for the Face: header, or undef if no icon | ||||||
1628 | sub item_to_face { | ||||||
1629 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
1630 | 0 | 0 | 0 | $self->{'get_icon'} || return; | |||
1631 | 0 | 0 | 0 | my ($uri, $width, $height) = $self->item_image_uwh ($item) | |||
1632 | or return; | ||||||
1633 | 0 | 0 | 0 | $self->face_wh_ok ($width, $height) || return; | |||
1634 | 0 | 0 | return $self->download_face ($uri, $width, $height); | ||||
1635 | } | ||||||
1636 | |||||||
1637 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1638 | # return values ($uri, $width, $height) of the |
||||||
1639 | # | ||||||
1640 | sub item_image_uwh { | ||||||
1641 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
1642 | ### item_image_uwh() ... | ||||||
1643 | |||||||
1644 | 0 | 0 | foreach my $where ($item, | ||||
1645 | elt_to_channel($item)) { | ||||||
1646 | ### image text: $where->first_child_text('image') | ||||||
1647 | |||||||
1648 | # identi.ca | ||||||
1649 | 0 | 0 | 0 | if (my $actor = $where->first_child('activity:actor')) { | |||
1650 | 0 | 0 | my ($url, $width, $height); | ||||
1651 | 0 | 0 | foreach my $link_elt ($actor->children('link')) { | ||||
1652 | 0 | 0 | 0 | 0 | ($link_elt->att('rel')||$link_elt->att('atom:rel')||'') | ||
1653 | eq 'avatar' or next; | ||||||
1654 | 0 | 0 | 0 | $url = $link_elt->att('href') // $link_elt->att('atom:href') // next; | |||
0 | |||||||
1655 | 0 | 0 | my $this_width = $link_elt->att('media:width'); | ||||
1656 | 0 | 0 | 0 | 0 | next if (defined $width | ||
0 | |||||||
1657 | && defined $this_width | ||||||
1658 | && $width < $this_width); # prefer smallest | ||||||
1659 | 0 | 0 | $url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($link_elt, $url); | ||||
1660 | 0 | 0 | 0 | $width = ($this_width || 0); | |||
1661 | 0 | 0 | 0 | $height = ($link_elt->att('media:height') || 0); | |||
1662 | ### $url | ||||||
1663 | ### $width | ||||||
1664 | ### $height | ||||||
1665 | } | ||||||
1666 | 0 | 0 | 0 | if (defined $url) { | |||
1667 | 0 | 0 | return ($url, $width, $height); | ||||
1668 | } | ||||||
1669 | } | ||||||
1670 | |||||||
1671 | # RSS | ||||||
1672 | # |
||||||
1673 | # |
||||||
1674 | # |
||||||
1675 | # |
||||||
1676 | # | ||||||
1677 | 0 | 0 | 0 | if (my $image_elt = $where->first_child('image')) { | |||
1678 | 0 | 0 | my $url_elt; # XML::Twig::Elt where the url came from | ||||
1679 | my $url; # url string | ||||||
1680 | 0 | 0 | 0 | if ($url_elt = $image_elt->first_child('url')) { | |||
1681 | 0 | 0 | $url = $url_elt->trimmed_text; | ||||
1682 | } else { | ||||||
1683 | # Cooper Hewitt museum http://blog.cooperhewitt.org/rss/?limit=10 | ||||||
1684 | # item |
||||||
1685 | # |
||||||
1686 | # ]]> | ||||||
1687 | # | ||||||
1688 | # don't want to encourage dodginess like this, but picking it out | ||||||
1689 | # isn't too hard | ||||||
1690 | 0 | 0 | 0 | if ($image_elt->text =~ / |
|||
1691 | ### image from html: $1 | ||||||
1692 | 0 | 0 | $url_elt = $image_elt; | ||||
1693 | 0 | 0 | $url = $1; | ||||
1694 | } | ||||||
1695 | } | ||||||
1696 | 0 | 0 | 0 | if (is_non_empty ($url)) { | |||
1697 | 0 | 0 | my $width = $image_elt->first_child_text('width'); | ||||
1698 | 0 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | ||
1699 | 0 | 0 | $width = 0; | ||||
1700 | } | ||||||
1701 | 0 | 0 | my $height = $image_elt->first_child_text('height'); | ||||
1702 | 0 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | ||
1703 | 0 | 0 | $height = 0; | ||||
1704 | } | ||||||
1705 | ### item_image_uwh() RSS: $url | ||||||
1706 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($url_elt, $url), | ||||
1707 | $width, $height); | ||||||
1708 | } | ||||||
1709 | } | ||||||
1710 | |||||||
1711 | # Atom channel |
||||||
1712 | # or channel |
||||||
1713 | # | ||||||
1714 | # |
||||||
1715 | # is bigger than the RSS 48x48, so would probably need shrinking. Rate | ||||||
1716 | # it below |
||||||
1717 | # | ||||||
1718 | # |
||||||
1719 | # Is it better to show the channel icon, being the From person? | ||||||
1720 | { | ||||||
1721 | 0 | 0 | my $elt; | ||||
0 | 0 | ||||||
1722 | 0 | 0 | my ($width, $height); | ||||
1723 | my $url = ((($elt = $where->first_child('icon')) | ||||||
1724 | && non_empty ($elt->text)) | ||||||
1725 | || (($elt = $where->first_child('logo')) | ||||||
1726 | && non_empty ($elt->text)) | ||||||
1727 | || (($elt = $where->first_child('itunes:image')) | ||||||
1728 | && non_empty ($elt->att('href'))) | ||||||
1729 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1730 | && is_non_empty ($elt->att('url')) | ||||||
1731 | && do { | ||||||
1732 | $width = $elt->att('width'); | ||||||
1733 | $height = $elt->att('height'); | ||||||
1734 | $elt->att('url') }) | ||||||
1735 | # seen att('atom:url' rather than plain 'url' ... | ||||||
1736 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1737 | && is_non_empty ($elt->att('atom:url')) | ||||||
1738 | 0 | 0 | 0 | && do { | |||
1739 | $width = $elt->att('width'); | ||||||
1740 | $height = $elt->att('height'); | ||||||
1741 | $elt->att('atom:url') })); | ||||||
1742 | ### $url | ||||||
1743 | 0 | 0 | 0 | if ($url) { | |||
1744 | 0 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | ||
1745 | 0 | 0 | $width = 0; | ||||
1746 | } | ||||||
1747 | 0 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | ||
1748 | 0 | 0 | $height = 0; | ||||
1749 | } | ||||||
1750 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | ||||
1751 | $width, | ||||||
1752 | $height); | ||||||
1753 | } | ||||||
1754 | } | ||||||
1755 | |||||||
1756 | # status.net for rss 1.0 | ||||||
1757 | # |
||||||
1758 | 0 | 0 | 0 | if (my $elt = $where->first_child('statusnet:postIcon')) { | |||
1759 | 0 | 0 | 0 | if (is_non_empty (my $url = $elt->att('rdf:resource'))) { | |||
1760 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | ||||
1761 | 0, 0); # unknown size | ||||||
1762 | } | ||||||
1763 | } | ||||||
1764 | |||||||
1765 | # |
||||||
1766 | # eg. from blogger.com | ||||||
1767 | # |
||||||
1768 | { | ||||||
1769 | 0 | 0 | my $elt; | ||||
0 | 0 | ||||||
1770 | 0 | 0 | 0 | 0 | if (($elt = $where->first_child('author')) | ||
0 | |||||||
0 | |||||||
1771 | && ($elt = $elt->first_child('gd:image')) | ||||||
1772 | && (is_non_empty (my $url = $elt->att('src') // $elt->att('atom:src')))) { | ||||||
1773 | ### $url | ||||||
1774 | 0 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||
0 | |||||||
1775 | $elt->att('width') || $elt->att('atom:width') || 0, | ||||||
1776 | $elt->att('height') || $elt->att('atom:height') || 0); | ||||||
1777 | } | ||||||
1778 | } | ||||||
1779 | } | ||||||
1780 | 0 | 0 | return; | ||||
1781 | } | ||||||
1782 | @known{qw(/channel/logo | ||||||
1783 | /channel/icon | ||||||
1784 | /channel/image | ||||||
1785 | /channel/image/url | ||||||
1786 | /channel/image/width | ||||||
1787 | /channel/image/height | ||||||
1788 | /channel/image/title | ||||||
1789 | /channel/image/link | ||||||
1790 | /channel/image/description | ||||||
1791 | /channel/itunes:image | ||||||
1792 | /channel/statusnet:postIcon | ||||||
1793 | |||||||
1794 | /channel/item/image | ||||||
1795 | /channel/item/media:thumbnail | ||||||
1796 | /channel/item/statusnet:postIcon | ||||||
1797 | )} = (); | ||||||
1798 | |||||||
1799 | # $resp is a HTTP::Response | ||||||
1800 | # return a string value for the Face: header, or undef if no icon | ||||||
1801 | sub http_resp_to_face { | ||||||
1802 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
1803 | 0 | 0 | 0 | $self->{'get_icon'} || return; | |||
1804 | |||||||
1805 | 0 | 0 | 0 | my $uri = http_resp_favicon_uri($resp) || return; | |||
1806 | 0 | 0 | $self->verbose (2, ' response favicon URI: ', $uri); | ||||
1807 | 0 | 0 | return $self->download_face ($uri, 0, 0); | ||||
1808 | } | ||||||
1809 | |||||||
1810 | # $resp is a HTTP::Response | ||||||
1811 | # if it's a html with a favicon link return a URI object of that image | ||||||
1812 | # | ||||||
1813 | # http://www.w3.org/2005/10/howto-favicon | ||||||
1814 | # | ||||||
1815 | sub http_resp_favicon_uri { | ||||||
1816 | 0 | 0 | 0 | 0 | my ($resp) = @_; | ||
1817 | 0 | 0 | 0 | $resp->headers->content_is_html || return; | |||
1818 | 0 | 0 | require HTML::Parser; | ||||
1819 | 0 | 0 | my $href; | ||||
1820 | my $p; | ||||||
1821 | $p = HTML::Parser->new (api_version => 3, | ||||||
1822 | start_h => [ sub { | ||||||
1823 | 0 | 0 | 0 | my ($tagname, $attr) = @_; | |||
1824 | 0 | 0 | 0 | 0 | if ($tagname eq 'link' | ||
1825 | && $attr->{'rel'} eq 'icon') { | ||||||
1826 | 0 | 0 | $href = $attr->{'href'}; | ||||
1827 | 0 | 0 | $p->eof; | ||||
1828 | } | ||||||
1829 | 0 | 0 | }, "tagname, attr"]); | ||||
1830 | 0 | 0 | $resp->decode; | ||||
1831 | 0 | 0 | $p->parse ($resp->content); | ||||
1832 | 0 | 0 | 0 | return $href && URI->new_abs ($href, $resp->base); | |||
1833 | } | ||||||
1834 | |||||||
1835 | # return base64 string value for "Face:" header | ||||||
1836 | # $width and $height are from attributes if known, or 0 if not | ||||||
1837 | sub download_face { | ||||||
1838 | 0 | 0 | 0 | 0 | my ($self, $uri, $width, $height) = @_; | ||
1839 | 0 | 0 | my $key = $uri->canonical->as_string; | ||||
1840 | 0 | 0 | 0 | if (! exists $self->{'download_face'}->{$key}) { | |||
1841 | 0 | 0 | $self->{'download_face'}->{$key} | ||||
1842 | = $self->download_face_uncached ($uri, $width, $height); | ||||||
1843 | } | ||||||
1844 | 0 | 0 | return $self->{'download_face'}->{$key}; | ||||
1845 | } | ||||||
1846 | sub download_face_uncached { | ||||||
1847 | 0 | 0 | 0 | 0 | my ($self, $url, $width, $height) = @_; | ||
1848 | |||||||
1849 | 0 | 0 | $self->{'download_face_uncached'} = $url; | ||||
1850 | 0 | 0 | $self->verbose (1, ' image download: ', $url); | ||||
1851 | |||||||
1852 | 0 | 0 | require HTTP::Request; | ||||
1853 | 0 | 0 | my $req = HTTP::Request->new (GET => $url); | ||||
1854 | 0 | 0 | my $resp = $self->ua->request($req); | ||||
1855 | 0 | 0 | 0 | if (! $resp->is_success) { | |||
1856 | 0 | 0 | print __x(" no image: {status}\n", | ||||
1857 | status => $resp->status_line); | ||||||
1858 | 0 | 0 | return; | ||||
1859 | } | ||||||
1860 | |||||||
1861 | 0 | 0 | my $type = $resp->content_type; | ||||
1862 | ### $type | ||||||
1863 | # FIXME: is mime=>$type the right way? could give it a look at the url | ||||||
1864 | # basename or server's suggested filename too, for Read() to use the | ||||||
1865 | # extension. | ||||||
1866 | 0 | 0 | 0 | 0 | if ($type eq 'image/vnd.microsoft.icon' || $type eq 'image/x-icon') { | ||
0 | |||||||
1867 | # mime.xml of imagemagick 6.6.0 only has "image/x-ico", and nothing for | ||||||
1868 | # ico in magic.xml | ||||||
1869 | 0 | 0 | $type = 'ico'; | ||||
1870 | } elsif ($type =~ m{^image/(.*)$}i) { | ||||||
1871 | 0 | 0 | $type = $1; | ||||
1872 | } else { | ||||||
1873 | 0 | 0 | $self->verbose (2, 'ignore non-image icon type: ',$type); | ||||
1874 | 0 | 0 | return; | ||||
1875 | } | ||||||
1876 | |||||||
1877 | 0 | 0 | $resp->decode; | ||||
1878 | 0 | 0 | my $data = $resp->content; | ||||
1879 | 0 | 0 | 0 | 0 | if ($type ne 'png' | ||
0 | |||||||
0 | |||||||
0 | |||||||
1880 | || $width == 0 || $height == 0 | ||||||
1881 | || $width > 48 || $height > 48) { | ||||||
1882 | 0 | 0 | 0 | $data = $self->imagemagick_to_png($type,$data) // return; | |||
1883 | } | ||||||
1884 | 0 | 0 | $self->verbose (2, " image for Face ",length($data)," bytes"); | ||||
1885 | |||||||
1886 | # use a space as a separator since MIME::Entity will collapse out a | ||||||
1887 | # newline and make an enormous long word which then can't be split across | ||||||
1888 | # header lines and will likely exceed the nntp 998 char single-line limit | ||||||
1889 | 0 | 0 | require MIME::Base64; | ||||
1890 | 0 | 0 | $data = MIME::Base64::encode_base64($data, " "); | ||||
1891 | ### $data | ||||||
1892 | |||||||
1893 | 0 | 0 | return $data; | ||||
1894 | } | ||||||
1895 | |||||||
1896 | sub face_wh_ok { | ||||||
1897 | 0 | 0 | 0 | 0 | my ($self, $width, $height) = @_; | ||
1898 | |||||||
1899 | 0 | 0 | 0 | 0 | if ($width > 0 && $width > 2*$height) { | ||
1900 | # some obnoxious banner | ||||||
1901 | 0 | 0 | $self->verbose (1, ' ', | ||||
1902 | __x('image is a banner ({width}x{height}), ignore', | ||||||
1903 | width => $width, height => $height)); | ||||||
1904 | 0 | 0 | return 0; | ||||
1905 | } | ||||||
1906 | 0 | 0 | return 1; | ||||
1907 | } | ||||||
1908 | |||||||
1909 | #------------------------------------------------------------------------------ | ||||||
1910 | # ImageMagick bits | ||||||
1911 | |||||||
1912 | # $type is "gif", "ico" etc, $data is an image in a byte string | ||||||
1913 | # return a byte string of png, or undef if $data unrecognised | ||||||
1914 | sub imagemagick_to_png { | ||||||
1915 | 0 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | ||
1916 | ### $type | ||||||
1917 | 0 | 0 | 0 | my $image = $self->imagemagick_from_data($type,$data) // return; | |||
1918 | |||||||
1919 | 0 | 0 | my $width = $image->Get('width'); | ||||
1920 | 0 | 0 | my $height = $image->Get('height'); | ||||
1921 | ### compress: $image->Get('compression') | ||||||
1922 | 0 | 0 | $self->verbose (2, " image ${width}x${height}"); | ||||
1923 | 0 | 0 | 0 | 0 | if ($width == 0 || $height == 0) { | ||
1924 | 0 | 0 | return; | ||||
1925 | } | ||||||
1926 | 0 | 0 | 0 | 0 | if ($width <= 48 && $height <= 48 && $type eq 'png') { | ||
0 | |||||||
1927 | 0 | 0 | return $data; | ||||
1928 | } | ||||||
1929 | |||||||
1930 | # having downloaded the image is it better to keep a banner but shrink it, | ||||||
1931 | # or discard as no good? | ||||||
1932 | # | ||||||
1933 | # $self->face_wh_ok ($width, $height) || return; | ||||||
1934 | |||||||
1935 | 0 | 0 | 0 | 0 | if ($width > 48 || $height > 48) { | ||
1936 | 0 | 0 | my $factor; | ||||
1937 | 0 | 0 | 0 | 0 | if ($width <= 2*48 && $height <= 2*48) { | ||
1938 | 0 | 0 | $factor = 0.5; | ||||
1939 | } else { | ||||||
1940 | 0 | 0 | $factor = min (48 / $width, 48 / $height); | ||||
1941 | } | ||||||
1942 | 0 | 0 | $width = POSIX::ceil ($width * $factor); | ||||
1943 | 0 | 0 | $height = POSIX::ceil ($height * $factor); | ||||
1944 | 0 | 0 | $self->verbose (2, " image shrink by $factor to ${width}x${height}"); | ||||
1945 | # cf LiquidResize() or plain Resize() | ||||||
1946 | 0 | 0 | $image->AdaptiveResize (width => $width, height => $height); | ||||
1947 | } | ||||||
1948 | |||||||
1949 | 0 | 0 | my $ret = $image->Set (magick => 'PNG8'); | ||||
1950 | ### ret: "$ret" | ||||||
1951 | ### ret: $ret+0 | ||||||
1952 | 0 | 0 | 0 | if ($ret != 0) { | |||
1953 | 0 | 0 | print "oops, imagemagick doesn't like PNG8: $ret\n"; | ||||
1954 | 0 | 0 | return; | ||||
1955 | } | ||||||
1956 | ### compress: $image->Get('compression') | ||||||
1957 | |||||||
1958 | # $image->Write ('/tmp/x.png'); | ||||||
1959 | 0 | 0 | ($data) = $image->ImageToBlob (); | ||||
1960 | 0 | 0 | return $data; | ||||
1961 | } | ||||||
1962 | |||||||
1963 | |||||||
1964 | # $type is "png", "ico" etc, $data is an image in a byte string | ||||||
1965 | # return a Image::Magick object, or undef if Perl-Magick not available | ||||||
1966 | sub imagemagick_from_data { | ||||||
1967 | 0 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | ||
1968 | ### imagemagick_from_data(): $type | ||||||
1969 | 0 | 0 | 0 | eval { require Image::Magick } or return; | |||
0 | 0 | ||||||
1970 | |||||||
1971 | 0 | 0 | my $image = Image::Magick->new (magick=>$type); | ||||
1972 | # $image->Set(debug=>'All'); | ||||||
1973 | 0 | 0 | my $ret = $image->BlobToImage ($data); | ||||
1974 | ### ret: "$ret" | ||||||
1975 | ### ret: $ret+0 | ||||||
1976 | 0 | 0 | 0 | if ($ret == 1) { | |||
1977 | 0 | 0 | return $image; | ||||
1978 | } | ||||||
1979 | |||||||
1980 | # try again without the $type forced, in case bad Content-Type from http | ||||||
1981 | 0 | 0 | $image = Image::Magick->new; | ||||
1982 | # $image->Set(debug=>'All'); | ||||||
1983 | 0 | 0 | $ret = $image->BlobToImage ($data); | ||||
1984 | ### ret: "$ret" | ||||||
1985 | ### ret: $ret+0 | ||||||
1986 | 0 | 0 | 0 | if ($ret == 1) { | |||
1987 | 0 | 0 | return $image; | ||||
1988 | } | ||||||
1989 | |||||||
1990 | print __x(" imagemagick doesn't like image data ({length} bytes) from {url}: {error}\n", | ||||||
1991 | length => length($data), | ||||||
1992 | 0 | 0 | url => $self->{'download_face_uncached'}, | ||||
1993 | error => $ret); | ||||||
1994 | 0 | 0 | return undef; | ||||
1995 | } | ||||||
1996 | |||||||
1997 | |||||||
1998 | #------------------------------------------------------------------------------ | ||||||
1999 | # XML::Liberal | ||||||
2000 | |||||||
2001 | use constant::defer have_xml_liberal => sub { | ||||||
2002 | 0 | 0 | my ($self) = @_; | ||||
2003 | 0 | 0 | 0 | if (eval { require XML::Liberal; 1 }) { | |||
0 | 0 | ||||||
0 | 0 | ||||||
2004 | 0 | 0 | return 1; | ||||
2005 | } | ||||||
2006 | 0 | 0 | $self->verbose (3, __x('XML::Liberal not available: {error}', error => $@)); | ||||
2007 | 0 | 0 | return 0; | ||||
2008 | 2 | 2 | 10 | }; | |||
2 | 1 | ||||||
2 | 11 | ||||||
2009 | |||||||
2010 | # try to correct $xmlstr | ||||||
2011 | # if successful return a new xml string, otherwise return undef | ||||||
2012 | sub xml_liberal_correction { | ||||||
2013 | 0 | 0 | 0 | 0 | my ($self, $xmlstr) = @_; | ||
2014 | 0 | 0 | 0 | $self->have_xml_liberal or return; | |||
2015 | |||||||
2016 | ### try XML-Liberal ... | ||||||
2017 | 0 | 0 | my $liberal = XML::Liberal->new('LibXML'); | ||||
2018 | 0 | 0 | 0 | if (my $doc = eval { $liberal->parse_string($xmlstr) }) { | |||
0 | 0 | ||||||
2019 | 0 | 0 | return $doc->toString; | ||||
2020 | } else { | ||||||
2021 | 0 | 0 | $self->verbose (2, __x('XML::Liberal parse error: {error}', error => $@)); | ||||
2022 | 0 | 0 | return undef; | ||||
2023 | } | ||||||
2024 | } | ||||||
2025 | |||||||
2026 | |||||||
2027 | #------------------------------------------------------------------------------ | ||||||
2028 | # error as news message | ||||||
2029 | |||||||
2030 | sub error_message { | ||||||
2031 | 0 | 0 | 0 | 0 | my ($self, $subject, $message, $attach_bytes) = @_; | ||
2032 | |||||||
2033 | 0 | 0 | require Encode; | ||||
2034 | 0 | 0 | my $charset = 'utf-8'; | ||||
2035 | 0 | 0 | $message = str_ensure_newline ($message); | ||||
2036 | 0 | 0 | $message = Encode::encode ($charset, $message, Encode::FB_DEFAULT()); | ||||
2037 | |||||||
2038 | 0 | 0 | my $date = rfc822_time_now(); | ||||
2039 | 0 | 0 | require Digest::MD5; | ||||
2040 | 0 | 0 | my $msgid = $self->url_to_msgid | ||||
2041 | ('http://localhost', | ||||||
2042 | Digest::MD5::md5_base64 ($date.$subject.$message)); | ||||||
2043 | |||||||
2044 | my $top = $self->mime_build | ||||||
2045 | ({ | ||||||
2046 | 'Path:' => 'localhost', | ||||||
2047 | 0 | 0 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||
2048 | From => __('RSS2Leafnode').' |
||||||
2049 | Subject => $subject, | ||||||
2050 | Date => $date, | ||||||
2051 | 'Message-ID' => $msgid, | ||||||
2052 | }, | ||||||
2053 | Top => 1, | ||||||
2054 | Type => 'text/plain', | ||||||
2055 | Charset => $charset, | ||||||
2056 | Data => $message); | ||||||
2057 | |||||||
2058 | 0 | 0 | 0 | if (defined $attach_bytes) { | |||
2059 | 0 | 0 | $top->make_multipart; | ||||
2060 | 0 | 0 | my $part = $self->mime_build | ||||
2061 | ({}, | ||||||
2062 | Charset => 'none', | ||||||
2063 | Type => 'application/octet-stream', | ||||||
2064 | Data => $attach_bytes); | ||||||
2065 | 0 | 0 | $top->add_part ($part); | ||||
2066 | } | ||||||
2067 | |||||||
2068 | 0 | 0 | mime_entity_lines($top); | ||||
2069 | 0 | 0 | 0 | $self->nntp_post($top) || return; | |||
2070 | 0 | 0 | say __x('{group} 1 new article', group => $self->{'nntp_group'}); | ||||
2071 | } | ||||||
2072 | |||||||
2073 | |||||||
2074 | #------------------------------------------------------------------------------ | ||||||
2075 | # fetch HTML | ||||||
2076 | |||||||
2077 | sub http_resp_to_from { | ||||||
2078 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
2079 | ### http_resp_to_from() | ||||||
2080 | 0 | 0 | 0 | return $self->http_resp_exiftool_author($resp) | |||
2081 | // 'nobody@'.$self->uri_to_host; | ||||||
2082 | } | ||||||
2083 | sub http_resp_exiftool_author { | ||||||
2084 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
2085 | # PNG Author field, or HTML author | ||||||
2086 | 0 | 0 | 0 | my $author = resp_exiftool_info($resp)->{'Author'} // return; | |||
2087 | 0 | 0 | return $self->email_format_maybe (Encode::decode_utf8($author), '', undef); | ||||
2088 | } | ||||||
2089 | |||||||
2090 | sub http_resp_to_copyright { | ||||||
2091 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
2092 | ### http_http_resp_to_copyright() ... | ||||||
2093 | |||||||
2094 | 0 | 0 | my @copyrights = non_empty($resp->header('X-Meta-Copyright')); | ||||
2095 | 0 | 0 | 0 | unless ($resp->content_type =~ m{^text/}) { | |||
2096 | # PNG Copyright field, perhaps other formats | ||||||
2097 | 0 | 0 | push @copyrights, non_empty(resp_exiftool_info($resp)->{'Copyright'}); | ||||
2098 | } | ||||||
2099 | 0 | 0 | return \@copyrights; | ||||
2100 | } | ||||||
2101 | |||||||
2102 | # return a "Keywords:" string, or undef if nothing | ||||||
2103 | sub http_resp_to_keywords { | ||||||
2104 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
2105 | ### http_resp_to_keywords() ... | ||||||
2106 | |||||||
2107 | 0 | 0 | my @keywords = $resp->header('X-Meta-Keywords'); | ||||
2108 | |||||||
2109 | 0 | 0 | 0 | if ($resp->headers->content_is_html) { | |||
2110 | 0 | 0 | $resp->decode; | ||||
2111 | 0 | 0 | require HTML::Parser; | ||||
2112 | my $p = HTML::Parser->new | ||||||
2113 | (api_version => 3, | ||||||
2114 | report_tags => ['meta'], | ||||||
2115 | start_h => [ sub { | ||||||
2116 | 0 | 0 | 0 | my ($tagname, $attr) = @_; | |||
2117 | # facebook thing | ||||||
2118 | 0 | 0 | 0 | 0 | if ($tagname eq 'meta' | ||
0 | |||||||
2119 | && lc($attr->{'property'}||'') eq 'og:type') { | ||||||
2120 | 0 | 0 | push @keywords, $attr->{'content'}; | ||||
2121 | } | ||||||
2122 | |||||||
2123 | 0 | 0 | }, "tagname, attr" ]); | ||||
2124 | 0 | 0 | $p->parse ($resp->decoded_content); | ||||
2125 | } | ||||||
2126 | ### @keywords | ||||||
2127 | |||||||
2128 | return join_non_empty | ||||||
2129 | 0 | 0 | (', ', List::MoreUtils::uniq(map {collapse_whitespace($_)} | ||||
0 | 0 | ||||||
2130 | @keywords)); | ||||||
2131 | } | ||||||
2132 | |||||||
2133 | sub fetch_html { | ||||||
2134 | 0 | 0 | 1 | 0 | my ($self, $group, $url, %options) = @_; | ||
2135 | ### fetch_html() ... | ||||||
2136 | |||||||
2137 | 0 | 0 | local @{$self}{keys %options} = values %options; # hash slice | ||||
0 | 0 | ||||||
2138 | 0 | 0 | $self->verbose (1, __x('page: {url}', url => $url)); | ||||
2139 | |||||||
2140 | 0 | 0 | my $group_uri = URI->new($group,'news'); | ||||
2141 | 0 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | ||||
2142 | 0 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | ||||
2143 | 0 | 0 | 0 | $self->nntp_group_check($group) or return; | |||
2144 | |||||||
2145 | 0 | 0 | require HTTP::Request; | ||||
2146 | 0 | 0 | my $req = HTTP::Request->new (GET => $url); | ||||
2147 | 0 | 0 | $self->status_etagmod_req ($req); | ||||
2148 | 0 | 0 | my $resp = $self->ua->request($req); | ||||
2149 | 0 | 0 | 0 | if ($resp->code == 304) { | |||
2150 | 0 | 0 | $self->status_unchanged ($url); | ||||
2151 | 0 | 0 | return; | ||||
2152 | } | ||||||
2153 | 0 | 0 | 0 | if (! $resp->is_success) { | |||
2154 | 0 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | ||||
2155 | url => $url, | ||||||
2156 | status => $resp->status_line); | ||||||
2157 | 0 | 0 | return; | ||||
2158 | } | ||||||
2159 | 0 | 0 | $self->verbose (2, $resp->headers->as_string); | ||||
2160 | 0 | 0 | $self->enforce_html_charset_from_content ($resp); | ||||
2161 | |||||||
2162 | # message id is either the etag if present, or an md5 of the content if not | ||||||
2163 | my $msgid = $self->url_to_msgid | ||||||
2164 | ($url, | ||||||
2165 | 0 | 0 | 0 | $resp->header('ETag') // do { | |||
2166 | 0 | 0 | require Digest::MD5; | ||||
2167 | 0 | 0 | $resp->decode; | ||||
2168 | 0 | 0 | my $content = $resp->content; | ||||
2169 | 0 | 0 | Digest::MD5::md5_base64($content) | ||||
2170 | }); | ||||||
2171 | 0 | 0 | 0 | return 0 if $self->nntp_message_id_exists ($msgid); | |||
2172 | |||||||
2173 | 0 | 0 | 0 | my $subject = (html_title($resp) | |||
0 | |||||||
2174 | // $resp->filename | ||||||
2175 | # show original url in subject, not anywhere redirected | ||||||
2176 | // __x('RSS2Leafnode {url}', url => $url)); | ||||||
2177 | |||||||
2178 | 0 | 0 | my $from = $self->http_resp_to_from($resp); | ||||
2179 | 0 | 0 | my $date = $resp->header('Last-Modified'); | ||||
2180 | 0 | 0 | my $face = $self->http_resp_to_face($resp); | ||||
2181 | 0 | 0 | my $copyright = $self->http_resp_to_copyright($resp); | ||||
2182 | 0 | 0 | my $keywords = $self->http_resp_to_keywords($resp); | ||||
2183 | |||||||
2184 | 0 | 0 | my $part = $self->http_resp_extract_main($resp); | ||||
2185 | |||||||
2186 | 0 | 0 | my $top = $self->mime_part_from_response | ||||
2187 | ($resp, | ||||||
2188 | Top => 1, | ||||||
2189 | 'Path:' => scalar($self->uri_to_host), | ||||||
2190 | 'Newsgroups:' => $group, | ||||||
2191 | From => $from, | ||||||
2192 | Subject => $subject, | ||||||
2193 | Date => $date, | ||||||
2194 | 'Message-ID' => $msgid, | ||||||
2195 | Keywords => $keywords, | ||||||
2196 | 'Face:' => $face, | ||||||
2197 | 'X-Copyright:' => $copyright); | ||||||
2198 | 0 | 0 | 0 | if ($part) { | |||
2199 | ### attach full part ... | ||||||
2200 | 0 | 0 | $top->make_multipart; | ||||
2201 | 0 | 0 | $top->add_part ($part); | ||||
2202 | } | ||||||
2203 | |||||||
2204 | 0 | 0 | mime_entity_lines($top); | ||||
2205 | 0 | 0 | 0 | $self->nntp_post($top) || return; | |||
2206 | 0 | 0 | $self->status_etagmod_resp ($url, $resp); | ||||
2207 | 0 | 0 | say __x("{group} 1 new article", group => $group); | ||||
2208 | } | ||||||
2209 | |||||||
2210 | # $resp is a HTTP::Response | ||||||
2211 | # If the $self->{'html_extract_main'} option is true and $resp is html then | ||||||
2212 | # resplace the $resp content with HTML::ExtractMain extracted part. | ||||||
2213 | # | ||||||
2214 | sub http_resp_extract_main { | ||||||
2215 | 0 | 0 | 0 | 0 | my ($self, $resp) = @_; | ||
2216 | |||||||
2217 | 0 | 0 | 0 | $self->{'html_extract_main'} or return; | |||
2218 | 0 | 0 | 0 | $resp->headers->content_is_html() or return; | |||
2219 | |||||||
2220 | my $full_part | ||||||
2221 | = (defined $self->{'html_extract_main'} | ||||||
2222 | 0 | 0 | 0 | && $self->{'html_extract_main'} eq 'attach_full' | |||
2223 | && $self->mime_part_from_response($resp, | ||||||
2224 | Disposition => "attachment")); | ||||||
2225 | |||||||
2226 | 0 | 0 | require HTML::ExtractMain; | ||||
2227 | 0 | 0 | HTML::ExtractMain->VERSION(0.63); # for output_type=>'html' | ||||
2228 | 0 | 0 | $resp->decode; # expand any compression | ||||
2229 | 0 | 0 | my $content = $resp->decoded_content; # as wide-chars | ||||
2230 | |||||||
2231 | # Output type 'html' differs from the default xhtml by a few entities, in | ||||||
2232 | # particular it avoids ' which is an xml-ism not in the html standards. | ||||||
2233 | # Various browsers support &apos anyway, but not for example by w3m. | ||||||
2234 | 0 | 0 | $content = HTML::ExtractMain::extract_main_html($content, | ||||
2235 | output_type => 'html'); | ||||||
2236 | 0 | 0 | 0 | if (! defined $content) { | |||
2237 | 0 | 0 | $self->verbose(1, __(" HTML::ExtractMain no main part found, posting whole")); | ||||
2238 | 0 | 0 | return; | ||||
2239 | } | ||||||
2240 | ### main extracted: $content | ||||||
2241 | 0 | 0 | $resp->remove_header('Content-MD5'); # since changed content | ||||
2242 | 0 | 0 | my $charset = $resp->content_charset; | ||||
2243 | 0 | 0 | $content = Encode::encode ($charset, $content); | ||||
2244 | 0 | 0 | $resp->content($content); | ||||
2245 | |||||||
2246 | 0 | 0 | return $full_part; | ||||
2247 | } | ||||||
2248 | |||||||
2249 | #------------------------------------------------------------------------------ | ||||||
2250 | # RSS hacks | ||||||
2251 | |||||||
2252 | # This is a hack for Yahoo Finance feed uniqification. | ||||||
2253 | # $item is a feed hashref. If it has 'link' field with a yahoo.com | ||||||
2254 | # redirection like | ||||||
2255 | # | ||||||
2256 | # http://au.rd.yahoo.com/finance/news/rss/financenews/*http://au.biz.yahoo.com/071003/30/1fdvx.html | ||||||
2257 | # | ||||||
2258 | # then return the last target url part. Otherwise return false. | ||||||
2259 | # | ||||||
2260 | # This allows the item to be identified by its final target link, so as to | ||||||
2261 | # avoid duplication when the item appears in multiple yahoo feeds with a | ||||||
2262 | # different leading part. (There's no guid in yahoo feeds, as of Oct 2007.) | ||||||
2263 | # | ||||||
2264 | sub item_yahoo_permalink { | ||||||
2265 | 0 | 0 | 0 | 0 | my ($item) = @_; | ||
2266 | 0 | 0 | 0 | my $url = $item->first_child_text('link') | |||
2267 | // return undef; | ||||||
2268 | 0 | 0 | 0 | $url =~ m{^http://[^/]*yahoo\.com/.*\*(http://.*yahoo\.com.*)$} | |||
2269 | or return undef; | ||||||
2270 | 0 | 0 | return $1; | ||||
2271 | } | ||||||
2272 | |||||||
2273 | # This is a special case for Google Groups RSS feeds. | ||||||
2274 | # The arguments are link elements [$name,$uri]. If there's a google groups | ||||||
2275 | # like "http://groups.google.com/group/cfcdev/msg/445d4ccfdabf086b" then | ||||||
2276 | # return a mailing list address like "cfcdev@googlegroups.com". If not in | ||||||
2277 | # that form then return undef. | ||||||
2278 | # | ||||||
2279 | sub googlegroups_link_email { | ||||||
2280 | ## no critic (RequireInterpolationOfMetachars) | ||||||
2281 | 0 | 0 | 0 | 0 | foreach my $l (@_) { | ||
2282 | 0 | 0 | 0 | 0 | if ($l->{'uri'} | ||
2283 | && $l->{'uri'}->canonical =~ m{^http://groups\.google\.com/group/([^/]+)/}) { | ||||||
2284 | 0 | 0 | return ($1 . '@googlegroups.com'); | ||||
2285 | } | ||||||
2286 | } | ||||||
2287 | 0 | 0 | return undef; | ||||
2288 | } | ||||||
2289 | |||||||
2290 | # This is a nasty hack for http://www.aireview.com.au/rss.php | ||||||
2291 | # $url is a link url string just fetched, $resp is a HTTP::Response. The | ||||||
2292 | # return is a possibly new HTTP::Response object. | ||||||
2293 | # | ||||||
2294 | # The first fetch of an item link from aireview gives back content like | ||||||
2295 | # | ||||||
2296 | # | ||||||
2297 | # | ||||||
2298 | # plus some cookies in the headers. The URL "zz=1" in that line seems very | ||||||
2299 | # dodgy, it ends up going to the home page with mozilla. In any case a | ||||||
2300 | # fresh fetch of the link url with the cookies provided is enough to get the | ||||||
2301 | # actual content. | ||||||
2302 | # | ||||||
2303 | # The LWP::UserAgent::FramesReady module on cpan has a similar match of a | ||||||
2304 | # Refresh, for use with frames. It works by turning the response into a | ||||||
2305 | # "302 Moved temporarily" for LWP to follow. urlcheck.pl at | ||||||
2306 | # http://www.cpan.org/authors/id/P/PH/PHILMI/urlcheck-1.00.pl likewise | ||||||
2307 | # follows. But alas both obey the URL given in the , which is no good | ||||||
2308 | # here. | ||||||
2309 | # | ||||||
2310 | sub aireview_follow { | ||||||
2311 | 0 | 0 | 0 | 0 | my ($self, $url, $resp) = @_; | ||
2312 | |||||||
2313 | 0 | 0 | 0 | if ($resp->is_success) { | |||
2314 | 0 | 0 | $resp->decode; | ||||
2315 | 0 | 0 | my $content = $resp->content; | ||||
2316 | 0 | 0 | 0 | if ($content =~ /]*Refresh[^>]*checkForCookies/i) { | |||
2317 | 0 | 0 | $self->verbose (1, ' following aireview META Refresh with cookies'); | ||||
2318 | 0 | 0 | require HTTP::Request; | ||||
2319 | 0 | 0 | my $req = HTTP::Request->new (GET => $url); | ||||
2320 | 0 | 0 | $resp = $self->ua->request($req); | ||||
2321 | } | ||||||
2322 | } | ||||||
2323 | 0 | 0 | return $resp; | ||||
2324 | } | ||||||
2325 | |||||||
2326 | |||||||
2327 | #------------------------------------------------------------------------------ | ||||||
2328 | # RSS links | ||||||
2329 | |||||||
2330 | |||||||
2331 | # WordPress (http://wordpress.org/) pre 2.5 had a bug | ||||||
2332 | # http://core.trac.wordpress.org/ticket/6579 where it gave | ||||||
2333 | # type="appication/atom+xml" missing the "l" in "application/". | ||||||
2334 | # Don't want to workaround every bad generator, but this one is GPL | ||||||
2335 | # free and the past versions still found for instance in the | ||||||
2336 | # language log http://languagelog.ldc.upenn.edu/nll/ in Feb 2011 | ||||||
2337 | # | ||||||
2338 | sub mime_type_is_rss { | ||||||
2339 | 0 | 0 | 0 | 0 | my ($self, $type) = @_; | ||
2340 | 0 | 0 | return ($type =~ m{^appl?ication/atom\+xml$}); | ||||
2341 | } | ||||||
2342 | sub atom_link_is_rss { | ||||||
2343 | 0 | 0 | 0 | 0 | my ($self, $elt) = @_; | ||
2344 | 0 | 0 | 0 | my $type = $elt->att('atom:type') // $elt->att('type') // return 0; | |||
0 | |||||||
2345 | 0 | 0 | return $self->mime_type_is_rss($type); | ||||
2346 | } | ||||||
2347 | |||||||
2348 | # $str is a string like "doi:10.1000/182" or "10.1000/182". | ||||||
2349 | # Return a url string like "http://doi.org/10.1000/182". | ||||||
2350 | sub doi_to_uri { | ||||||
2351 | 2 | 2 | 0 | 1015 | my ($str) = @_; | ||
2352 | 2 | 7 | $str =~ s/^doi://; | ||||
2353 | |||||||
2354 | # DOI numbers can potentially include URI reserved characters, let URI.pm | ||||||
2355 | # percent encode them when necessary. | ||||||
2356 | 2 | 6 | my $uri = URI->new('http://doi.org/'); | ||||
2357 | 2 | 6024 | $uri->path($str); | ||||
2358 | 2 | 158 | return $uri; | ||||
2359 | } | ||||||
2360 | |||||||
2361 | # return list of hashrefs, each being | ||||||
2362 | # { name => $str, | ||||||
2363 | # uri => $uri_object, | ||||||
2364 | # download => $boolean, | ||||||
2365 | # priority => $number, | ||||||
2366 | # } | ||||||
2367 | # | ||||||
2368 | # Links are listed from highest to lowest priority. The current priority | ||||||
2369 | # levels are | ||||||
2370 | # 0 plain links | ||||||
2371 | # -10 comment RSS | ||||||
2372 | # -20 author home page | ||||||
2373 | # -100 geo location text-only | ||||||
2374 | # -101 statusnet geo location | ||||||
2375 | # -200 |
||||||
2376 | # | ||||||
2377 | sub item_to_links { | ||||||
2378 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
2379 | |||||||
2380 | # |
||||||
2381 | # something has been expanded into the item, or should it be shown? | ||||||
2382 | |||||||
2383 | # FIXME: |
||||||
2384 | # |
||||||
2385 | # Can have a |
||||||
2386 | # | ||||||
2387 | # ENHANCE-ME: |
||||||
2388 | # | ||||||
2389 | # FIXME: |
||||||
2390 | # formats etc. Have seen this from archive.org just duplicating | ||||||
2391 | # |
||||||
2392 | # | ||||||
2393 | # |
||||||
2394 | # case. | ||||||
2395 | # | ||||||
2396 | 0 | 0 | my @elts = $item->children (qr/^(link | ||||
2397 | |enclosure | ||||||
2398 | |content | ||||||
2399 | |wiki:diff | ||||||
2400 | |wiki:history | ||||||
2401 | |comments | ||||||
2402 | |wfw:comment | ||||||
2403 | |wfw:commentRss | ||||||
2404 | |foaf:maker | ||||||
2405 | |sioc:has_creator | ||||||
2406 | |sioc:has_discussion | ||||||
2407 | |sioc:links_to | ||||||
2408 | |sioc:reply_of | ||||||
2409 | |statusnet:origin | ||||||
2410 | |dc:source | ||||||
2411 | |prism:url | ||||||
2412 | )$/ix); | ||||||
2413 | ### link elts: "@elts" | ||||||
2414 | |||||||
2415 | 0 | 0 | my @links; | ||||
2416 | 0 | 0 | foreach my $elt (@elts) { | ||||
2417 | 0 | 0 | 0 | if ($self->{'verbose'} >= 2) { | |||
2418 | 0 | 0 | require Text::Wrap; | ||||
2419 | 0 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long URLs etc | ||||
2420 | 0 | 0 | $self->verbose (2, "link\n", Text::Trim::trim($elt->sprint)); | ||||
2421 | } | ||||||
2422 | |||||||
2423 | 0 | 0 | my $tag = lc($elt->tag); | ||||
2424 | ### $tag | ||||||
2425 | 0 | 0 | 0 | 0 | if ($tag eq 'content' && atom_content_flavour($elt) ne 'link') { | ||
2426 | 0 | 0 | next; | ||||
2427 | } | ||||||
2428 | 0 | 0 | my $l = { download => 1 }; | ||||
2429 | |||||||
2430 | 0 | 0 | foreach my $name ('hreflang', 'title', 'type') { | ||||
2431 | 0 | 0 | 0 | $l->{$name} = ($elt->att("atom:$name") // $elt->att($name)); | |||
2432 | } | ||||||
2433 | |||||||
2434 | 0 | 0 | 0 | my $rel = non_empty($elt->att('atom:rel') // $elt->att('rel')); | |||
2435 | 0 | 0 | 0 | if (defined $rel) { | |||
2436 | # Atom rel="..." | ||||||
2437 | # Maybe: if ($rel eq 'next') ... # not sure about "next" link | ||||||
2438 | |||||||
2439 | 0 | 0 | 0 | 0 | if ($rel eq 'self' # the feed itself (in the channel normally) | ||
0 | |||||||
0 | |||||||
2440 | || $rel eq 'edit' # to edit the item, maybe | ||||||
2441 | || $rel eq 'service.edit' # to edit the item | ||||||
2442 | || $rel eq 'license' # probably only in the channel part normally | ||||||
2443 | ) { | ||||||
2444 | 0 | 0 | $self->verbose (1, ' ', __x('skip link "{type}"', type => $rel)); | ||||
2445 | 0 | 0 | next; | ||||
2446 | } | ||||||
2447 | 0 | 0 | 0 | if ($rel eq 'alternate') { | |||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2448 | # "alternate" is supposed to be the content as the entry, but in a | ||||||
2449 | # web page or something. Not sure that's always quite true, so show | ||||||
2450 | # it as a plain link. If no |
||||||
2451 | # supposed to be mandatory. | ||||||
2452 | |||||||
2453 | } elsif ($rel eq 'enclosure') { | ||||||
2454 | 0 | 0 | $l->{'name'} = __('Encl'); | ||||
2455 | |||||||
2456 | } elsif ($rel eq 'ostatus:conversation') { | ||||||
2457 | 0 | 0 | $l->{'name'} = __('Conversation'); | ||||
2458 | 0 | 0 | $l->{'download'} = 0; | ||||
2459 | |||||||
2460 | } elsif ($rel eq 'ostatus:attention') { | ||||||
2461 | 0 | 0 | $l->{'name'} = __('Attention'); | ||||
2462 | 0 | 0 | $l->{'download'} = 0; | ||||
2463 | |||||||
2464 | } elsif ($rel eq 'related') { | ||||||
2465 | 0 | 0 | $l->{'name'} = __('Related'); | ||||
2466 | |||||||
2467 | } elsif ($rel eq 'replies') { | ||||||
2468 | # Atom | ||||||
2469 | 0 | 0 | my $count = $self->item_elt_comments_count($item,$elt); | ||||
2470 | 0 | 0 | 0 | if ($self->atom_link_is_rss($elt)) { | |||
2471 | 0 | 0 | 0 | $l->{'name'} = (defined $count | |||
2472 | ? __x('RSS Replies({count})', count => $count) | ||||||
2473 | : __('RSS Replies')); | ||||||
2474 | 0 | 0 | $l->{'priority'} = -10; | ||||
2475 | } else { | ||||||
2476 | 0 | 0 | 0 | $l->{'name'} = (defined $count | |||
2477 | ? __x('Replies({count})', count => $count) | ||||||
2478 | : __('Replies')); | ||||||
2479 | } | ||||||
2480 | 0 | 0 | $l->{'download'} = 0; | ||||
2481 | |||||||
2482 | } elsif ($rel eq 'service.post') { | ||||||
2483 | 0 | 0 | $l->{'name'} = __('Comments'); | ||||
2484 | 0 | 0 | $l->{'download'} = 0; | ||||
2485 | |||||||
2486 | } elsif ($rel eq 'via') { | ||||||
2487 | 0 | 0 | $l->{'name'} = __('Via'); | ||||
2488 | 0 | 0 | $l->{'download'} = 0; | ||||
2489 | |||||||
2490 | } else { | ||||||
2491 | 0 | 0 | $l->{'name'} = __x('{linkrel}', linkrel => $rel); | ||||
2492 | } | ||||||
2493 | |||||||
2494 | } else { # ! defined $rel | ||||||
2495 | # tags without rel="" attribute | ||||||
2496 | # | ||||||
2497 | 0 | 0 | 0 | if ($tag eq 'enclosure') { | |||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2498 | 0 | 0 | $l->{'name'} = __('Encl'); | ||||
2499 | |||||||
2500 | } elsif ($tag eq 'dc:source') { | ||||||
2501 | # might be free form text or might be url or other formal system | ||||||
2502 | 0 | 0 | $l->{'name'} = __('Source'); | ||||
2503 | 0 | 0 | $l->{'download'} = 0; | ||||
2504 | 0 | 0 | $l->{'maybe_text'} = 1; | ||||
2505 | 0 | 0 | $l->{'priority'} = -10; | ||||
2506 | |||||||
2507 | } elsif ($tag eq 'wiki:diff') { | ||||||
2508 | 0 | 0 | $l->{'name'} = __('Diff'); | ||||
2509 | |||||||
2510 | } elsif ($tag eq 'wiki:history') { | ||||||
2511 | 0 | 0 | $l->{'name'} = __('History'); | ||||
2512 | 0 | 0 | $l->{'download'} = 0; | ||||
2513 | |||||||
2514 | } elsif ($tag =~ /foaf:maker|sioc:has_creator/) { | ||||||
2515 | 0 | 0 | $l->{'name'} = __('Author'); | ||||
2516 | 0 | 0 | $l->{'download'} = 0; | ||||
2517 | 0 | 0 | $l->{'priority'} = -20; # low | ||||
2518 | |||||||
2519 | } elsif ($tag eq 'statusnet:origin') { | ||||||
2520 | 0 | 0 | $l->{'name'} = __('Geo location'); | ||||
2521 | 0 | 0 | $l->{'download'} = 0; | ||||
2522 | 0 | 0 | $l->{'priority'} = -101; # just after Geo location | ||||
2523 | |||||||
2524 | } elsif ($tag eq 'sioc:has_discussion') { | ||||||
2525 | 0 | 0 | $l->{'name'} = __('Discussion'); | ||||
2526 | 0 | 0 | $l->{'download'} = 0; | ||||
2527 | |||||||
2528 | } elsif ($tag eq 'wfw:commentrss') { | ||||||
2529 | 0 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | |||
2530 | 0 | 0 | $l->{'name'} = __x('RSS Comments({count})', count => $count); | ||||
2531 | } else { | ||||||
2532 | 0 | 0 | $l->{'name'} = __('RSS Comments'); | ||||
2533 | } | ||||||
2534 | 0 | 0 | $l->{'download'} = 0; | ||||
2535 | 0 | 0 | $l->{'priority'} = -10; | ||||
2536 | |||||||
2537 | } elsif ($tag =~ /comment/) { # |
||||||
2538 | 0 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | |||
2539 | 0 | 0 | $l->{'name'} = __x('Comments({count})', count => $count); | ||||
2540 | } else { | ||||||
2541 | 0 | 0 | $l->{'name'} = __('Comments'); | ||||
2542 | } | ||||||
2543 | 0 | 0 | $l->{'download'} = 0; | ||||
2544 | } | ||||||
2545 | } | ||||||
2546 | |||||||
2547 | # Atom | ||||||
2548 | # RSS http:.. | ||||||
2549 | # RSS |
||||||
2550 | 0 | 0 | 0 | $l->{'uri'} //= (non_empty ($elt->att('atom:href')) # Atom | |||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2551 | // non_empty ($elt->att('href')) # Atom | ||||||
2552 | // non_empty ($elt->att('atom:src')) # Atom |
||||||
2553 | // non_empty ($elt->att('src')) # Atom |
||||||
2554 | // non_empty ($elt->att('url')) # RSS |
||||||
2555 | # |
||||||
2556 | // non_empty ($elt->att('rdf:resource'))); | ||||||
2557 | 0 | 0 | 0 | if (defined $l->{'ur'}) { | |||
2558 | 0 | 0 | $l->{'maybe_text'} = 0; # above are definite urls | ||||
2559 | } | ||||||
2560 | 0 | 0 | 0 | $l->{'uri'} //= (non_empty ($elt->trimmed_text) # RSS | |||
0 | |||||||
2561 | // next); # no contents | ||||||
2562 | 0 | 0 | 0 | 0 | if ($l->{'maybe_text'} && $l->{'uri'} !~ m{^[a-z]+://}) { | ||
2563 | 0 | 0 | $l->{'name'} .= ': ' . delete $l->{'uri'}; | ||||
2564 | } else { | ||||||
2565 | 0 | 0 | $l->{'uri'} = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $l->{'uri'}); | ||||
2566 | } | ||||||
2567 | |||||||
2568 | 0 | 0 | 0 | $l->{'name'} //= __('Link'); | |||
2569 | |||||||
2570 | 0 | 0 | my @paren; | ||||
2571 | # show length if biggish, often provided on enclosures but not plain | ||||||
2572 | # links | ||||||
2573 | 0 | 0 | 0 | 0 | if (defined (my $length = ($elt->att('atom:length') | ||
2574 | // $elt->att('length')))) { | ||||||
2575 | 0 | 0 | push @paren, $self->format_size_in_bytes($length); | ||||
2576 | } | ||||||
2577 | # |
||||||
2578 | # seconds, otherwise MM:SS or HH:MM:SS. | ||||||
2579 | 0 | 0 | 0 | 0 | if ($tag eq 'enclosure' | ||
2580 | && defined (my $duration = non_empty ($item->first_child_text('itunes:duration')))) { | ||||||
2581 | 0 | 0 | 0 | if ($duration !~ /:/) { | |||
2582 | 0 | 0 | $duration = __px('s-for-seconds', '{duration}s', | ||||
2583 | duration => $duration); | ||||||
2584 | } | ||||||
2585 | 0 | 0 | push @paren, collapse_whitespace($duration); | ||||
2586 | } | ||||||
2587 | 0 | 0 | 0 | if (@paren) { | |||
2588 | 0 | 0 | $l->{'name'} .= '('.join(', ',@paren). ')'; | ||||
2589 | } | ||||||
2590 | |||||||
2591 | ### push link: $l | ||||||
2592 | 0 | 0 | push @links, $l; | ||||
2593 | } | ||||||
2594 | |||||||
2595 | 0 | 0 | 0 | 0 | if (! $item->first_child('prism:url') | ||
2596 | && (my $elt = $item->first_child('prism:doi'))) { | ||||||
2597 | # Eg. http://www.nature.com/nature/current_issue/rss | ||||||
2598 | # which also has dc:identifier as the same DOI | ||||||
2599 | 0 | 0 | push @links, { name => __('DOI'), | ||||
2600 | uri => doi_to_uri($elt->trimmed_text), | ||||||
2601 | }; | ||||||
2602 | } | ||||||
2603 | |||||||
2604 | # eg. RSS |
||||||
2605 | # Atom |
||||||
2606 | # |
||||||
2607 | # |
||||||
2608 | # | ||||||
2609 | # |
||||||
2610 | # | ||||||
2611 | # | ||||||
2612 | # | ||||||
2613 | 0 | 0 | foreach my $elt ($item->children('source')) { | ||||
2614 | 0 | 0 | 0 | my $str = non_empty (elt_to_rendered_line($elt->first_child('title'))) | |||
2615 | // non_empty ($elt->trimmed_text); | ||||||
2616 | 0 | 0 | 0 | if (defined $str) { | |||
2617 | ### source: $str | ||||||
2618 | 0 | 0 | push @links, { name => __('Source') . ": $str", | ||||
2619 | download => 0, | ||||||
2620 | priority => -200, | ||||||
2621 | }; | ||||||
2622 | } | ||||||
2623 | 0 | 0 | foreach my $subelt | ||||
2624 | ($elt, | ||||||
2625 | 0 | 0 | grep {$self->atom_link_is_rss($_)} $elt->children('link') | ||||
2626 | ) { | ||||||
2627 | 0 | 0 | 0 | 0 | if (defined $subelt | ||
0 | |||||||
0 | |||||||
2628 | && defined (my $url = non_empty ($subelt->att('url')) | ||||||
2629 | // non_empty ($subelt->att('href')) | ||||||
2630 | // non_empty ($subelt->att('atom:href')))) { | ||||||
2631 | 0 | 0 | push @links, { name => __('Source RSS'), | ||||
2632 | uri => App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri($subelt,$url), | ||||||
2633 | download => 0, | ||||||
2634 | priority => -200, | ||||||
2635 | }; | ||||||
2636 | } | ||||||
2637 | } | ||||||
2638 | } | ||||||
2639 | |||||||
2640 | # Merge together duplicate urls, so as not to download two copies as | ||||||
2641 | # attachments, and so as to make it clear when there's only one | ||||||
2642 | # destination for two things. | ||||||
2643 | # | ||||||
2644 | # Have seen same url under and |
||||||
2645 | # http://sourceforge.net/export/rss2_keepsake.php?group_id=203650 | ||||||
2646 | # or same url under and |
||||||
2647 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2648 | { | ||||||
2649 | 0 | 0 | my %seen; | ||||
0 | 0 | ||||||
2650 | @links = grep { | ||||||
2651 | 0 | 0 | my $l = $_; | ||||
0 | 0 | ||||||
2652 | 0 | 0 | my $want = 1; | ||||
2653 | 0 | 0 | 0 | if (my $uri = $l->{'uri'}) { | |||
2654 | 0 | 0 | my $canonical = $uri->canonical; | ||||
2655 | 0 | 0 | $canonical->fragment(undef); # ignore #foo anchor for uniqueness | ||||
2656 | 0 | 0 | 0 | if (my $prev_l = $seen{$canonical}) { | |||
2657 | 0 | 0 | $want = 0; | ||||
2658 | 0 | 0 | 0 | $prev_l->{'download'} ||= $l->{'download'}; | |||
2659 | $l->{'priority'} = max ($l->{'priority'}||0, | ||||||
2660 | 0 | 0 | 0 | $prev_l->{'priority'}||0); | |||
0 | |||||||
2661 | |||||||
2662 | # prefer no anchor if have both with and without | ||||||
2663 | 0 | 0 | 0 | if (is_empty($l->{'uri'}->fragment)) { | |||
2664 | 0 | 0 | $prev_l->{'uri'} = $l->{'uri'}; | ||||
2665 | } | ||||||
2666 | |||||||
2667 | 0 | 0 | 0 | if ($prev_l->{'name'} eq __('Link')) { | |||
0 | |||||||
0 | |||||||
2668 | # name "Link" doesn't say much, prefer the other over "Link" | ||||||
2669 | 0 | 0 | $prev_l->{'name'} = $l->{'name'}; | ||||
2670 | } elsif ($l->{'name'} eq __('Link')) { | ||||||
2671 | # don't append "Link" to the previous | ||||||
2672 | } elsif ($l->{'name'} eq $prev_l->{'name'}) { | ||||||
2673 | # don't double the same name | ||||||
2674 | } else { | ||||||
2675 | 0 | 0 | $prev_l->{'name'} .= ", $l->{'name'}"; | ||||
2676 | } | ||||||
2677 | } | ||||||
2678 | 0 | 0 | $seen{$canonical} = $l; | ||||
2679 | } | ||||||
2680 | $want | ||||||
2681 | 0 | 0 | } @links; | ||||
2682 | } | ||||||
2683 | 0 | 0 | foreach my $l (@links) { | ||||
2684 | 0 | 0 | 0 | if ($l->{'uri'}) { | |||
2685 | 0 | 0 | $l->{'name'} .= ':'; | ||||
2686 | } | ||||||
2687 | } | ||||||
2688 | |||||||
2689 | 0 | 0 | 0 | if (my $elt = $item->first_child('hlxcd:helex-company-data')) { | |||
2690 | # Eg. http://www.helex.gr/rss-feeds | ||||||
2691 | # http://www.helex.gr/web/guest/rss-feeds/-/asset_publisher/companiesrss/custom-rss | ||||||
2692 | 0 | 0 | my $str = join_non_empty | ||||
2693 | (' ', | ||||||
2694 | $elt->first_child_text('hlxcd:company-ticker-symbol'), | ||||||
2695 | $elt->first_child_text('hlxcd:company-name')); | ||||||
2696 | 0 | 0 | 0 | if (is_non_empty($str)) { | |||
2697 | 0 | 0 | push @links, { name => __('Company:').' '.$str, | ||||
2698 | download => 0, | ||||||
2699 | priority => -100, | ||||||
2700 | }; | ||||||
2701 | } | ||||||
2702 | } | ||||||
2703 | |||||||
2704 | 0 | 0 | 0 | if (defined (my $str = $self->item_to_lat_long_alt_str ($item))) { | |||
2705 | 0 | 0 | push @links, { name => $str, | ||||
2706 | download => 0, | ||||||
2707 | priority => -100, # lat/long low priority | ||||||
2708 | }; | ||||||
2709 | } | ||||||
2710 | |||||||
2711 | # re:rank as for example from stackexchange.com | ||||||
2712 | # What does label="" usually show? Are parens like this good? | ||||||
2713 | 0 | 0 | foreach my $elt ($item->children('re:rank')) { | ||||
2714 | 0 | 0 | my $label = $elt->att('label'); | ||||
2715 | 0 | 0 | my $value = elt_to_rendered_line($elt); | ||||
2716 | 0 | 0 | 0 | push @links, { name => (defined $label | |||
2717 | ? __x('Rank: {value} ({label})', value => $value, label => $label) | ||||||
2718 | : __x('Rank: {value}', value => $value)), | ||||||
2719 | download => 0, | ||||||
2720 | priority => -200, # low priority | ||||||
2721 | }; | ||||||
2722 | } | ||||||
2723 | |||||||
2724 | # eg. |
||||||
2725 | # is there any value in the role="" part? | ||||||
2726 | 0 | 0 | foreach my $elt ($item->children('media:credit')) { | ||||
2727 | 0 | 0 | push @links, { name => __x('Credit: {who}', | ||||
2728 | who => scalar(elt_to_rendered_line($elt))), | ||||||
2729 | download => 0, | ||||||
2730 | priority => -200, # very low priority | ||||||
2731 | }; | ||||||
2732 | } | ||||||
2733 | |||||||
2734 | # |
||||||
2735 | # | ||||||
2736 | # Allow for empty |
||||||
2737 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2738 | # | ||||||
2739 | 0 | 0 | foreach my $elt ($item->children('itunes:explicit')) { | ||||
2740 | 0 | 0 | 0 | my $line = elt_to_rendered_line($elt) | |||
2741 | // next; # skip empty |
||||||
2742 | 0 | 0 | push @links, { name => __x('Explicit: {value}', value => $line), | ||||
2743 | download => 0, | ||||||
2744 | priority => -200, # very low priority | ||||||
2745 | }; | ||||||
2746 | } | ||||||
2747 | |||||||
2748 | # |
||||||
2749 | # a fun kind of commentary thing | ||||||
2750 | 0 | 0 | foreach my $elt ($item->children('slash:department')) { | ||||
2751 | 0 | 0 | push @links, { name => __x('Department: {department}', | ||||
2752 | department => scalar(elt_to_rendered_line($elt))), | ||||||
2753 | download => 0, | ||||||
2754 | priority => -200, # very low priority | ||||||
2755 | }; | ||||||
2756 | } | ||||||
2757 | |||||||
2758 | 0 | 0 | return @links; | ||||
2759 | } | ||||||
2760 | @known{qw( | ||||||
2761 | /channel/item/pheedo:origLink | ||||||
2762 | /channel/item/feedburner:origLink | ||||||
2763 | |||||||
2764 | /channel/item/link | ||||||
2765 | /channel/item/enclosure | ||||||
2766 | /channel/item/dc:source | ||||||
2767 | /channel/item/dc:identifier | ||||||
2768 | /channel/item/comments | ||||||
2769 | /channel/item/wfw:comment | ||||||
2770 | /channel/item/wfw:commentRss | ||||||
2771 | /channel/item/slash:comments | ||||||
2772 | /channel/item/slash:hit_parade | ||||||
2773 | /channel/item/slash:department | ||||||
2774 | /channel/item/thr:total | ||||||
2775 | /channel/item/content --atom | ||||||
2776 | /channel/item/wiki:diff | ||||||
2777 | /channel/item/itunes:duration | ||||||
2778 | /channel/item/re:rank | ||||||
2779 | |||||||
2780 | /channel/wiki:interwiki | ||||||
2781 | /channel/wiki:interwiki/rdf:Description | ||||||
2782 | /channel/wiki:interwiki/rdf:Description/rdf:value | ||||||
2783 | /channel/item/wiki:version | ||||||
2784 | /channel/item/wiki:status | ||||||
2785 | /channel/item/wiki:history | ||||||
2786 | /channel/item/foaf:maker | ||||||
2787 | /channel/item/sioc:has_creator | ||||||
2788 | /channel/item/sioc:has_discussion | ||||||
2789 | /channel/item/sioc:links_to | ||||||
2790 | /channel/item/sioc:reply_of | ||||||
2791 | /channel/item/media:credit | ||||||
2792 | /channel/item/itunes:explicit | ||||||
2793 | /channel/item/itunes:block | ||||||
2794 | /channel/item/hlxcd:helex-company-data | ||||||
2795 | |||||||
2796 | --believed-to-be-duplicate-of-description | ||||||
2797 | /channel/item/media:content | ||||||
2798 | /channel/item/media:text | ||||||
2799 | |||||||
2800 | /channel/item/prism:doi | ||||||
2801 | /channel/item/prism:publicationName | ||||||
2802 | /channel/item/prism:publicationDate | ||||||
2803 | /channel/item/prism:url | ||||||
2804 | /channel/item/prism:volume | ||||||
2805 | /channel/item/prism:number | ||||||
2806 | /channel/item/prism:section | ||||||
2807 | /channel/item/prism:startingPage | ||||||
2808 | /channel/item/prism:endingPage | ||||||
2809 | )} = (); | ||||||
2810 | |||||||
2811 | # sub any_link_replies_nonfeed { | ||||||
2812 | # foreach my $elt (@_) { | ||||||
2813 | # # ### any_link_replies_nonfeed(): ref $elt, "$elt", $elt->tag | ||||||
2814 | # # my $rel = ($elt->att('atom:rel') // $elt->att('rel') // ''); | ||||||
2815 | # # my $type = ($elt->att('atom:type') // $elt->att('type') // ''); | ||||||
2816 | # # ### $rel | ||||||
2817 | # # ### $type | ||||||
2818 | # if ($elt->tag eq 'link' | ||||||
2819 | # && (($elt->att('atom:rel') // $elt->att('rel') // '') | ||||||
2820 | # eq 'replies') | ||||||
2821 | # && (($elt->atom_link_is_rss($elt))) { | ||||||
2822 | # return 1; | ||||||
2823 | # } | ||||||
2824 | # } | ||||||
2825 | # return 0; | ||||||
2826 | # } | ||||||
2827 | |||||||
2828 | # Return a string which is the latitude, longitude and possibly altitude | ||||||
2829 | # from the item. If no location in the item then return undef. | ||||||
2830 | # | ||||||
2831 | sub item_to_lat_long_alt_str { | ||||||
2832 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
2833 | 0 | 0 | 0 | my ($lat, $long, $alt) = $self->item_to_lat_long_alt_values ($item) | |||
2834 | or return; | ||||||
2835 | ### $lat | ||||||
2836 | ### $long | ||||||
2837 | ### $alt | ||||||
2838 | |||||||
2839 | 0 | 0 | 0 | if (Scalar::Util::looks_like_number($lat)) { | |||
2840 | 0 | 0 | 0 | $lat = ($lat >= 0 | |||
2841 | # TRANSLATORS: the latin1/unicode degree symbol can be used here | ||||||
2842 | # instead of " deg", if it will be recognised in translation, | ||||||
2843 | # etc. | ||||||
2844 | ? __x('{latitude} deg N', latitude => $lat) | ||||||
2845 | : __x('{latitude} deg S', latitude => -$lat)); | ||||||
2846 | } | ||||||
2847 | 0 | 0 | 0 | if (Scalar::Util::looks_like_number($long)) { | |||
2848 | 0 | 0 | 0 | $long = ($long >= 0 | |||
2849 | ? __x('{longitude} deg E', longitude => $long) | ||||||
2850 | : __x('{longitude} deg W', longitude => -$long)); | ||||||
2851 | } | ||||||
2852 | |||||||
2853 | 0 | 0 | 0 | if (is_non_empty ($alt)) { | |||
2854 | 0 | 0 | return __x('Geo location: {latitude}, {longitude}, alt {altitude}m', | ||||
2855 | latitude => $lat, | ||||||
2856 | longitude => $long, | ||||||
2857 | altitude => $alt); | ||||||
2858 | } else { | ||||||
2859 | 0 | 0 | return __x('Geo location: {latitude}, {longitude}', | ||||
2860 | latitude => $lat, | ||||||
2861 | longitude => $long); | ||||||
2862 | } | ||||||
2863 | } | ||||||
2864 | |||||||
2865 | # Return a list of values which are the latitude, longitude and possibly | ||||||
2866 | # altitude extracted from $item. | ||||||
2867 | # | ||||||
2868 | # ($latitude, $longitude, $altitude) | ||||||
2869 | # ($latitude, $longitude) | ||||||
2870 | # () | ||||||
2871 | # | ||||||
2872 | # If no location then return an empty list. Some of the values returned | ||||||
2873 | # might be empty strings if say there's a |
||||||
2874 | # | ||||||
2875 | # Latitude is degrees North, or negative for South. Longitude is degrees | ||||||
2876 | # East, or negative for West. Both possibly with decimal places. | ||||||
2877 | # | ||||||
2878 | sub item_to_lat_long_alt_values { | ||||||
2879 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
2880 | |||||||
2881 | # per-item |
||||||
2882 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
2883 | # |
||||||
2884 | # |
||||||
2885 | # |
||||||
2886 | # | ||||||
2887 | # or under geo:Point, maybe, eg. http://www.gdacs.org/xml/RSSTC.xml | ||||||
2888 | # |
||||||
2889 | # |
||||||
2890 | # |
||||||
2891 | # |
||||||
2892 | # | ||||||
2893 | 0 | 0 | foreach my $elt ($item, $item->children(qr/^geo:point$/i)) { | ||||
2894 | 0 | 0 | my $lat = $elt->first_child_trimmed_text('geo:lat'); | ||||
2895 | 0 | 0 | 0 | if (is_non_empty ($lat)) { | |||
2896 | 0 | 0 | return ($lat, | ||||
2897 | $elt->first_child_trimmed_text('geo:long'), | ||||||
2898 | non_empty ($elt->first_child_trimmed_text('geo:alt'))); | ||||||
2899 | } | ||||||
2900 | } | ||||||
2901 | |||||||
2902 | # |
||||||
2903 | # |
||||||
2904 | # space separator per http://www.georss.org/Encodings | ||||||
2905 | { | ||||||
2906 | 0 | 0 | my $str = $item->first_child_trimmed_text ('georss:point'); | ||||
0 | 0 | ||||||
2907 | 0 | 0 | 0 | if (is_non_empty ($str)) { | |||
2908 | 0 | 0 | return split(/\s+/, $str, 2); # no altitude | ||||
2909 | } | ||||||
2910 | } | ||||||
2911 | |||||||
2912 | # |
||||||
2913 | # | ||||||
2914 | # rdf:resource="http://sws.geonames.org/2638077/"> | ||||||
2915 | # | ||||||
2916 | 0 | 0 | 0 | if (my $elt = $item->first_child ('statusnet:origin')) { | |||
2917 | 0 | 0 | 0 | if (defined (my $lat = $elt->att('geo:lat'))) { | |||
2918 | 0 | 0 | my $long = $elt->att('geo:long'); | ||||
2919 | 0 | 0 | return ($lat, $long); | ||||
2920 | } | ||||||
2921 | } | ||||||
2922 | |||||||
2923 | 0 | 0 | return; # not found | ||||
2924 | } | ||||||
2925 | @known{qw(/channel/item/geo:lat | ||||||
2926 | /channel/item/geo:long | ||||||
2927 | /channel/item/geo:alt | ||||||
2928 | /channel/item/geo:Point | ||||||
2929 | /channel/item/geo:Point/geo:lat | ||||||
2930 | /channel/item/geo:Point/geo:long | ||||||
2931 | /channel/item/georss:point | ||||||
2932 | /channel/item/statusnet:origin | ||||||
2933 | )} = (); | ||||||
2934 | |||||||
2935 | |||||||
2936 | sub links_to_html { | ||||||
2937 | 0 | 0 | 0 | 0 | 0 | @_ or return ''; | |
2938 | |||||||
2939 | # |
||||||
2940 | # up by a line-wrap, which can make it hard to cut and paste. can |
||||||
2941 | # prevent a line wrap, but it ends up treated as starting a paragraph, | ||||||
2942 | # separate from the 'name' part. | ||||||
2943 | # | ||||||
2944 | 0 | 0 | my $str = ''; | ||||
2945 | 0 | 0 | my $sep = "\n\n \n"; |
||||
2946 | 0 | 0 | foreach my $l (@_) { | ||||
2947 | 0 | 0 | $str .= "$sep |
||||
2948 | 0 | 0 | $sep = " \n"; |
||||
2949 | |||||||
2950 | 0 | 0 | 0 | if (defined (my $uri = $l->{'uri'})) { | |||
2951 | 0 | 0 | $str .= " | ||||
2952 | 0 | 0 | 0 | if (defined (my $hreflang = $l->{'hreflang'})) { | |||
2953 | 0 | 0 | $str .= " hreflang=\"$Entitize{$hreflang}\""; | ||||
2954 | } | ||||||
2955 | 0 | 0 | 0 | if (defined (my $type = $l->{'type'})) { | |||
2956 | 0 | 0 | $str .= " type=\"$Entitize{$type}\""; | ||||
2957 | } | ||||||
2958 | 0 | 0 | $uri = $Entitize{$uri}; | ||||
2959 | 0 | 0 | $str .= " href=\"$uri\">$uri"; | ||||
2960 | } | ||||||
2961 | 0 | 0 | $str .= "\n"; | ||||
2962 | } | ||||||
2963 | 0 | 0 | return "$str\n"; | ||||
2964 | } | ||||||
2965 | |||||||
2966 | sub links_to_text { | ||||||
2967 | 0 | 0 | 0 | 0 | return join ('', map { join_non_empty (' ', | ||
2968 | $_->{'name'}, | ||||||
2969 | 0 | 0 | $_->{'uri'}) . "\n" } @_); | ||||
2970 | } | ||||||
2971 | |||||||
2972 | |||||||
2973 | #------------------------------------------------------------------------------ | ||||||
2974 | # "From:" and email addresses | ||||||
2975 | |||||||
2976 | 2 | 2 | 6399 | use constant DUMMY_EMAIL_ADDRESS => 'nobody@rss2leafnode.dummy'; | |||
2 | 2 | ||||||
2 | 9390 | ||||||
2977 | |||||||
2978 | { | ||||||
2979 | my %tag_to_link_name | ||||||
2980 | = (author => __('Author:'), | ||||||
2981 | creator => __('Creator:'), | ||||||
2982 | contributor => __('Contributor:'), | ||||||
2983 | managingEditor => __('Managing Editor:'), | ||||||
2984 | webMaster => __('Webmaster:'), | ||||||
2985 | publisher => __('Publisher:'), | ||||||
2986 | owner => __('Owner:'), | ||||||
2987 | username => __('User:'), | ||||||
2988 | ); | ||||||
2989 | |||||||
2990 | # Return ($from, $linkhash,$linkhash,...). | ||||||
2991 | # $from is a string like "foo@example.com". | ||||||
2992 | # Multiple authors are for example "foo@example.com, quux@example.com" as | ||||||
2993 | # per RFC5322 email, though currently no Sender: is picked out from among | ||||||
2994 | # them. | ||||||
2995 | # | ||||||
2996 | # Eg. |
||||||
2997 | # each author of an article http://www.nature.com/nature/current_issue/rss | ||||||
2998 | # | ||||||
2999 | sub item_to_from { | ||||||
3000 | 0 | 0 | 0 | 0 | my ($self, $item) = @_; | ||
3001 | ### item_to_from() ... | ||||||
3002 | 0 | 0 | my $channel = elt_to_channel($item); | ||||
3003 | |||||||
3004 | # |
||||||
3005 | # looser. The RSS recommendation is |
||||||
3006 | # and |
||||||
3007 | # | ||||||
3008 | # |
||||||
3009 | # attribute. | ||||||
3010 | # | ||||||
3011 | # |
||||||
3012 | # | ||||||
3013 | # |
||||||
3014 | # to show just the primary author or authors. | ||||||
3015 | # | ||||||
3016 | 0 | 0 | my @from; | ||||
3017 | my @links; | ||||||
3018 | 0 | 0 | foreach my $try ([$item, 'author'], | ||||
3019 | [$item, 'jf:author'], | ||||||
3020 | [$item, 'slate:author'], | ||||||
3021 | [$item, 'dc:creator'], | ||||||
3022 | [$item, 'dc:contributor'], | ||||||
3023 | [$item, 'wiki:username'], | ||||||
3024 | [$item, 'itunes:author'], | ||||||
3025 | |||||||
3026 | [$channel, 'author'], | ||||||
3027 | [$channel, 'dc:creator'], | ||||||
3028 | [$channel, 'itunes:author'], | ||||||
3029 | [$channel, 'managingEditor'], | ||||||
3030 | [$channel, 'webMaster'], | ||||||
3031 | |||||||
3032 | [$item, 'dc:publisher'], | ||||||
3033 | [$channel, 'dc:publisher'], | ||||||
3034 | [$channel, 'itunes:owner'], | ||||||
3035 | ) { | ||||||
3036 | 0 | 0 | my ($where, $tag) = @$try; | ||||
3037 | ### $tag | ||||||
3038 | |||||||
3039 | 0 | 0 | 0 | if (my @elts = $item->children($tag)) { | |||
3040 | 0 | 0 | foreach my $elt (@elts) { | ||||
3041 | ### elt for From: $elt->sprint | ||||||
3042 | 0 | 0 | push @from, $self->elt_to_email($elt); | ||||
3043 | |||||||
3044 | # author's home page etc as a link | ||||||
3045 | 0 | 0 | 0 | 0 | if (my $uri = | ||
0 | |||||||
3046 | (# Atom | ||||||
3047 | # |
||||||
3048 | # |
||||||
3049 | # |
||||||
3050 | # | ||||||
3051 | # | ||||||
3052 | non_empty ($elt->first_child_text('uri')) | ||||||
3053 | |||||||
3054 | # slate.com | ||||||
3055 | # |
||||||
3056 | // non_empty ($elt->att('url')) | ||||||
3057 | |||||||
3058 | # ModWiki dc:contributor example | ||||||
3059 | # |
||||||
3060 | # |
||||||
3061 | # | ||||||
3062 | # The text shows rss:link= and the example just link=. | ||||||
3063 | # | ||||||
3064 | // non_empty (do { | ||||||
3065 | 0 | 0 | 0 | 0 | my $child; ($child = $elt->first_child('rdf:Description')) | ||
0 | 0 | ||||||
3066 | && ($child->att('link') // $child->att('rss:link')) | ||||||
3067 | }))) { | ||||||
3068 | 0 | 0 | my $tag = $elt->tag; | ||||
3069 | 0 | 0 | $tag =~ s/.*?://; | ||||
3070 | push @links, { uri => URI->new($uri), | ||||||
3071 | 0 | 0 | 0 | name => ($tag_to_link_name{$tag} // "\u$tag:"), | |||
3072 | download => 0, | ||||||
3073 | priority => -20 }; | ||||||
3074 | } | ||||||
3075 | } | ||||||
3076 | } | ||||||
3077 | 0 | 0 | 0 | last if @from; | |||
3078 | } | ||||||
3079 | 0 | 0 | 0 | if (! @from) { | |||
3080 | # Atom |
||||||
3081 | # Hope the channel title is different from the item title. | ||||||
3082 | 0 | 0 | @from = ($self->email_format (elt_to_rendered_line | ||||
3083 | ($channel->first_child('title')))); | ||||||
3084 | } | ||||||
3085 | 0 | 0 | 0 | if (! @from) { | |||
3086 | 0 | 0 | @from = ('nobody@'.$self->uri_to_host); | ||||
3087 | } | ||||||
3088 | |||||||
3089 | ### @from | ||||||
3090 | 0 | 0 | return (join(', ',@from), | ||||
3091 | @links); | ||||||
3092 | } | ||||||
3093 | @known{qw(/channel/author | ||||||
3094 | /channel/author/name --atom | ||||||
3095 | /channel/author/uri --atom | ||||||
3096 | /channel/author/url --atom-typo-maybe | ||||||
3097 | /channel/author/email --atom | ||||||
3098 | /channel/managingEditor | ||||||
3099 | /channel/webMaster | ||||||
3100 | /channel/dc:publisher | ||||||
3101 | /channel/dc:creator | ||||||
3102 | /channel/itunes:author | ||||||
3103 | |||||||
3104 | /channel/item/author | ||||||
3105 | /channel/item/author/name --atom | ||||||
3106 | /channel/item/author/uri --atom | ||||||
3107 | /channel/item/author/url --atom-typo-maybe | ||||||
3108 | /channel/item/author/email --atom | ||||||
3109 | /channel/item/author/gd:extendedProperty --good-dinner | ||||||
3110 | /channel/item/dc:creator | ||||||
3111 | /channel/item/dc:publisher | ||||||
3112 | /channel/item/wiki:username | ||||||
3113 | /channel/item/itunes:author | ||||||
3114 | /channel/item/dc:contributor | ||||||
3115 | /channel/item/dc:contributor/rdf:Description | ||||||
3116 | /channel/item/dc:contributor/rdf:Description/rdf:value | ||||||
3117 | /channel/item/jf:author | ||||||
3118 | /channel/item/slate:author | ||||||
3119 | |||||||
3120 | /channel/item/contributor --atom | ||||||
3121 | /channel/item/contributor/name | ||||||
3122 | /channel/item/contributor/uri | ||||||
3123 | /channel/item/contributor/url --atom-typo-maybe | ||||||
3124 | /channel/item/contributor/email | ||||||
3125 | |||||||
3126 | /channel/item/activity:actor | ||||||
3127 | /channel/item/activity:verb --usually-post-or-something | ||||||
3128 | /channel/item/activity:object-type --is-this-anything | ||||||
3129 | )} = (); | ||||||
3130 | } | ||||||
3131 | |||||||
3132 | # $elt is an XML::Twig::Elt | ||||||
3133 | # Return an email address, either just the text part of $elt or Atom | ||||||
3134 | # sub-elements |
||||||
3135 | # If $elt is empty then return an empty list. | ||||||
3136 | # | ||||||
3137 | sub elt_to_email { | ||||||
3138 | 0 | 0 | 0 | 0 | my ($self, $elt) = @_; | ||
3139 | ### elt_to_email(): "$elt" | ||||||
3140 | 0 | 0 | 0 | return unless defined $elt; | |||
3141 | |||||||
3142 | # |
||||||
3143 | # |
||||||
3144 | 0 | 0 | my $email = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?email$/)); | ||||
3145 | |||||||
3146 | # |
||||||
3147 | # |
||||||
3148 | 0 | 0 | 0 | my $display = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?name$/)) | |||
3149 | // ''; | ||||||
3150 | |||||||
3151 | ### $display | ||||||
3152 | |||||||
3153 | |||||||
3154 | my $maybe = join | ||||||
3155 | (' ', | ||||||
3156 | non_empty ($elt->text_only), | ||||||
3157 | 0 | 0 | non_empty (do { | ||||
3158 | # |
||||||
3159 | 0 | 0 | 0 | my $rdfdesc; ($rdfdesc = $elt->first_child('rdf:Description')) | |||
0 | 0 | ||||||
3160 | && $rdfdesc->first_child_text('rdf:value') | ||||||
3161 | })); | ||||||
3162 | |||||||
3163 | # If item has a |
||||||
3164 | # is the same and use an email from there. | ||||||
3165 | # Eg. Skeptoid.xml podcast circa Jan 2017. | ||||||
3166 | 0 | 0 | 0 | 0 | if (defined $display && ! defined $email) { | ||
3167 | 0 | 0 | my $channel = elt_to_channel($elt); | ||||
3168 | 0 | 0 | my ($owner, $name); | ||||
3169 | 0 | 0 | 0 | 0 | if (($owner = $channel->first_child(qr/(itunes:)?owner/)) | ||
0 | |||||||
0 | |||||||
3170 | && defined($name = elt_to_rendered_line ($owner->first_child(qr/^(itunes:)?name$/))) | ||||||
3171 | && ($name eq $display || $name eq $maybe)) { | ||||||
3172 | 0 | 0 | $email = elt_to_rendered_line ($owner->first_child(qr/^(itunes:)?email$/)); | ||||
3173 | } | ||||||
3174 | ### channel: "$channel" | ||||||
3175 | ### owner: "$owner" | ||||||
3176 | ### $name | ||||||
3177 | ### $display | ||||||
3178 | } | ||||||
3179 | |||||||
3180 | 0 | 0 | return $self->email_format_maybe ($maybe, $display, $email); | ||||
3181 | } | ||||||
3182 | |||||||
3183 | # $mailbox_re is a mailbox with domain, like "foo@example.com" | ||||||
3184 | # Allows no dots like "foo@localhost". | ||||||
3185 | # Allows dashes like "www-something@example.com". | ||||||
3186 | # | ||||||
3187 | # $mailbox_with_comment_re allows an optional paren comment part like | ||||||
3188 | # "foo@example.com (Foo)" | ||||||
3189 | # | ||||||
3190 | # cf Email::Address $addr_spec, but its version 1.890 loosened to allow a | ||||||
3191 | # domain-less bare "foo", which is no good | ||||||
3192 | # | ||||||
3193 | my $words_with_dots_re = qr/[[:word:]-]+(\.[[:word:]-]+)*/; | ||||||
3194 | my $mailbox_re = qr/$words_with_dots_re\@$words_with_dots_re/o; | ||||||
3195 | my $mailbox_with_comment_re = qr/$mailbox_re(\s*\([^\)]*\))?/os; | ||||||
3196 | |||||||
3197 | # $maybe is some free-form author part possibly including a foo@example.com | ||||||
3198 | # $display is a display part for the email like "Foo", possibly empty "" | ||||||
3199 | # $email is a mailbox "foo@example.com", or undef | ||||||
3200 | # return an rfc822 "Foo |
||||||
3201 | # | ||||||
3202 | sub email_format_maybe { | ||||||
3203 | 0 | 0 | 0 | 0 | my ($self, $maybe, $display, $email) = @_; | ||
3204 | ### email_format_maybe() start | ||||||
3205 | ### $maybe | ||||||
3206 | ### $display | ||||||
3207 | |||||||
3208 | |||||||
3209 | # look also at $display the same in case Atom no |
||||||
3210 | # which is a mailbox and can be corrected, | ||||||
3211 | # eg. http://www.weather.gov/alerts-beta/hi.php?x=0 | ||||||
3212 | # | ||||||
3213 | # Or $maybe full email like | ||||||
3214 | 0 | 0 | 0 | if (is_empty($email)) { | |||
3215 | 0 | 0 | foreach ($maybe, $display) { | ||||
3216 | |||||||
3217 | 0 | 0 | 0 | 0 | if (/^\s*(mailto:)?($mailbox_with_comment_re)\s*$/o) { | ||
0 | |||||||
3218 | ### maybe or display is a mailbox | ||||||
3219 | # "foo@example.com" | ||||||
3220 | # "mailto:foo@example.com" | ||||||
3221 | # "foo@example.com (Foo)" | ||||||
3222 | 0 | 0 | $email = $2; | ||||
3223 | 0 | 0 | undef $_; | ||||
3224 | 0 | 0 | last; | ||||
3225 | |||||||
3226 | } elsif (/(.*)\((mailto:)?($mailbox_re)\)\s*$/o | ||||||
3227 | || /(.*)<(mailto:)?($mailbox_re)>\s*$/o) { | ||||||
3228 | ### maybe or display part is display plus mailbox | ||||||
3229 | # "Foo (mailto:foo@example.com)" | ||||||
3230 | # "Foo (foo@example.com)" | ||||||
3231 | # "Foo |
||||||
3232 | # | ||||||
3233 | 0 | 0 | $_ = $1; | ||||
3234 | 0 | 0 | $email = $3; | ||||
3235 | 0 | 0 | last; | ||||
3236 | } | ||||||
3237 | } | ||||||
3238 | } | ||||||
3239 | |||||||
3240 | 0 | 0 | 0 | $display .= ' '.($maybe//''); | |||
3241 | 0 | 0 | my $ret; | ||||
3242 | 0 | 0 | 0 | 0 | if (is_empty($email) && $display =~ /^$mailbox_re$/o) { | ||
3243 | # display or maybe is a "foo@example.com" or "foo@example.com (Foo)", | ||||||
3244 | # return it as-is, in particular leave it in "(Foo)" style comment | ||||||
3245 | 0 | 0 | $ret = $display; | ||||
3246 | } else { | ||||||
3247 | 0 | 0 | $ret = $self->email_format ($display, $email); | ||||
3248 | } | ||||||
3249 | |||||||
3250 | # Collapse whitespace against possible tabs and newlines in an |
||||||
3251 | # from googlegroups for instance. MIME::Entity seems to collapse | ||||||
3252 | # newlines, but not tabs. | ||||||
3253 | 0 | 0 | return non_empty (collapse_whitespace ($ret)); | ||||
3254 | } | ||||||
3255 | |||||||
3256 | # $display is a display part for the email "Foo", possibly empty "" | ||||||
3257 | # $email is a mailbox "foo@example.com", or undef or empty "" | ||||||
3258 | # return an rfc822 "Foo |
||||||
3259 | # | ||||||
3260 | sub email_format { | ||||||
3261 | 0 | 0 | 0 | 0 | my ($self, $display, $email) = @_; | ||
3262 | ### $display | ||||||
3263 | 0 | 0 | $display = Text::Trim::trim($display); | ||||
3264 | 0 | 0 | $email = Text::Trim::trim($email); | ||||
3265 | 0 | 0 | 0 | if (is_empty($display)) { | |||
3266 | 0 | 0 | 0 | if (is_empty($email)) { | |||
3267 | 0 | 0 | return; | ||||
3268 | } else { | ||||||
3269 | 0 | 0 | return $email; | ||||
3270 | } | ||||||
3271 | } | ||||||
3272 | 0 | 0 | 0 | if (is_empty($email)) { | |||
3273 | # think can't have empty <> or omitted, otherwise the quoted part is | ||||||
3274 | # still parsed as an address, certainly it's not rfc822 compliant to | ||||||
3275 | # omit | ||||||
3276 | 0 | 0 | $email = 'nobody@'.$self->uri_to_host; | ||||
3277 | } else { | ||||||
3278 | 0 | 0 | $email = $email; | ||||
3279 | } | ||||||
3280 | 0 | 0 | return email_phrase_quote_maybe($display) . " <$email>"; | ||||
3281 | } | ||||||
3282 | |||||||
3283 | # return $str with quotes like "Foo Bar" if it needs them to go in an email | ||||||
3284 | # display part | ||||||
3285 | sub email_phrase_quote_maybe { | ||||||
3286 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
3287 | 0 | 0 | 0 | return if ! defined $str; | |||
3288 | |||||||
3289 | # RFC2822 "atext" characters, with "-" last | ||||||
3290 | 0 | 0 | 0 | if ($str =~ m<[^[:alnum:][:space:]!#\$%&'*+/=?^_`{|}~-]>) { | |||
3291 | # strange chars, need to quote | ||||||
3292 | 0 | 0 | return email_phrase_quote($str); | ||||
3293 | } else { | ||||||
3294 | # alphanumeric and whitespace, no quotes | ||||||
3295 | 0 | 0 | return $str; | ||||
3296 | } | ||||||
3297 | } | ||||||
3298 | sub email_phrase_quote { | ||||||
3299 | 0 | 0 | 0 | 0 | my ($str) = @_; | ||
3300 | 0 | 0 | 0 | return if ! defined $str; | |||
3301 | 0 | 0 | $str =~ s/^"(.*)"$/$1/; # strip existing quotes | ||||
3302 | 0 | 0 | $str =~ s/(["\\])/\\$1/g; # escape internal quotes and backslashes | ||||
3303 | 0 | 0 | return "\"$str\""; | ||||
3304 | } | ||||||
3305 | |||||||
3306 | |||||||
3307 | #------------------------------------------------------------------------------ | ||||||
3308 | # rss_newest_only | ||||||
3309 | |||||||
3310 | { | ||||||
3311 | my %multiplier = (minute => 60, | ||||||
3312 | hour => 3600, | ||||||
3313 | day => 86400, | ||||||
3314 | week => 86400 * 7, | ||||||
3315 | month => 365.25 * 86400 / 12, | ||||||
3316 | year => 365.25 * 86400, | ||||||
3317 | ); | ||||||
3318 | # return a target time_t, or undef | ||||||
3319 | sub rss_newest_only_timet { | ||||||
3320 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
3321 | |||||||
3322 | 0 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | |||
3323 | 0 | 0 | 0 | if ($str =~ /^\s*(\d+)\s*(minute|hour|day|week|month|year)s?\s*$/) { | |||
3324 | 0 | 0 | return time() - $1*$multiplier{$2}; | ||||
3325 | } | ||||||
3326 | } | ||||||
3327 | 0 | 0 | return undef; | ||||
3328 | } | ||||||
3329 | } | ||||||
3330 | |||||||
3331 | # return a number, or undef | ||||||
3332 | sub rss_newest_only_count { | ||||||
3333 | 0 | 0 | 0 | 0 | my ($self) = @_; | ||
3334 | 0 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | |||
3335 | 0 | 0 | 0 | if (Scalar::Util::looks_like_number($str)) { | |||
3336 | ### rss_newest_only number: $str | ||||||
3337 | 0 | 0 | return $str; | ||||
3338 | } | ||||||
3339 | } | ||||||
3340 | 0 | 0 | return undef; | ||||
3341 | } | ||||||
3342 | |||||||
3343 | # return @items restricted or filtered by rss_newest_only | ||||||
3344 | sub rss_newest_only_items { | ||||||
3345 | 0 | 0 | 0 | 0 | my ($self, @items) = @_; | ||
3346 | |||||||
3347 | 0 | 0 | 0 | if (defined (my $count = $self->rss_newest_only_count)) { | |||
3348 | 0 | 0 | 0 | if ($count == 0) { | |||
3349 | # rss_newest_only=>0 means don't apply a newest | ||||||
3350 | 0 | 0 | return @items; | ||||
3351 | } | ||||||
3352 | 0 | 0 | my $before = scalar(@items); | ||||
3353 | 0 | 0 | require Sort::Key::Top; | ||||
3354 | 0 | 0 | 0 | @items = Sort::Key::Top::rnkeytop (sub { $self->item_to_timet($_) }, | |||
3355 | 0 | 0 | $count, @items); | ||||
3356 | |||||||
3357 | 0 | 0 | my $after = scalar(@items); | ||||
3358 | 0 | 0 | 0 | if ($before != $after) { | |||
3359 | 0 | 0 | $self->verbose (1, " rss_newest_only reduce by count from $before items to $after items"); | ||||
3360 | } | ||||||
3361 | 0 | 0 | return @items; | ||||
3362 | } | ||||||
3363 | |||||||
3364 | 0 | 0 | 0 | if (defined (my $target_timet = $self->rss_newest_only_timet)) { | |||
3365 | 0 | 0 | my $before = scalar(@items); | ||||
3366 | 0 | 0 | @items = grep { my $got_timet = $self->item_to_timet($_); | ||||
0 | 0 | ||||||
3367 | 0 | 0 | 0 | ! defined $got_timet || $got_timet >= $target_timet } | |||
3368 | @items; | ||||||
3369 | 0 | 0 | my $after = scalar(@items); | ||||
3370 | 0 | 0 | 0 | if ($before != $after) { | |||
3371 | 0 | 0 | $self->verbose (1, " rss_newest_only reduce by age from $before to $after items"); | ||||
3372 | } | ||||||
3373 | 0 | 0 | return @items; | ||||
3374 | } | ||||||
3375 | |||||||
3376 | 0 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | |||
3377 | 0 | 0 | die "rss2leafnode: unrecognised rss_newest_only: ",$str; | ||||
3378 | } | ||||||
3379 | 0 | 0 | return @items; | ||||
3380 | } | ||||||
3381 | |||||||
3382 | |||||||
3383 | #------------------------------------------------------------------------------ | ||||||
3384 | # fetch RSS | ||||||
3385 | |||||||
3386 | my $map_xmlns | ||||||
3387 | = { | ||||||
3388 | 'http://purl.org/rss/1.0/' => 'rss', | ||||||
3389 | 'http://www.w3.org/2005/Atom' => 'atom', | ||||||
3390 | 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf', | ||||||
3391 | 'http://purl.org/rss/1.0/modules/content/' => 'content', | ||||||
3392 | 'http://purl.org/rss/1.0/modules/slash/' => 'slash', | ||||||
3393 | 'http://purl.org/rss/1.0/modules/syndication/' => 'syn', | ||||||
3394 | 'http://purl.org/syndication/thread/1.0' => 'thr', | ||||||
3395 | 'http://wellformedweb.org/CommentAPI/' => 'wfw', | ||||||
3396 | 'http://www.w3.org/1999/xhtml' => 'xhtml', | ||||||
3397 | 'http://www.itunes.com/dtds/podcast-1.0.dtd' => 'itunes', | ||||||
3398 | 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner', | ||||||
3399 | 'http://www.helex.gr/helex-schemas/xsd/CompanyDataAtomAttributes-v1.xsd' | ||||||
3400 | => 'hlxcd', | ||||||
3401 | |||||||
3402 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
3403 | 'http://prismstandard.org/namespaces/basic/2.0/' => 'prism', | ||||||
3404 | |||||||
3405 | # http://www.rssboard.org/media-rss | ||||||
3406 | 'http://search.yahoo.com/mrss' => 'media', | ||||||
3407 | |||||||
3408 | 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo', | ||||||
3409 | 'http://www.georss.org/georss' => 'georss', | ||||||
3410 | 'http://www.pheedo.com/namespace/pheedo' => 'pheedo', | ||||||
3411 | 'http://api.twitter.com' => 'twitter', | ||||||
3412 | 'http://xmlns.com/foaf/0.1/' => 'foaf', | ||||||
3413 | 'http://status.net/ont/' => 'statusnet', | ||||||
3414 | 'http://rdfs.org/sioc/ns#' => 'sioc', | ||||||
3415 | 'http://www.slate.com' => 'slate', | ||||||
3416 | 'http://activitystrea.ms/spec/1.0/' => 'activity', | ||||||
3417 | 'http://ostatus.org/schema/1.0' => 'ostatus', | ||||||
3418 | |||||||
3419 | # http://tools.ietf.org/html/draft-snell-atompub-feed-index-10 | ||||||
3420 | 'http://purl.org/atompub/rank/1.0' => 're', | ||||||
3421 | |||||||
3422 | # per http://docs.jivesoftware.com/latest/documentation/rss.html#output | ||||||
3423 | 'http://www.jivesoftware.com/xmlns/jiveforums/rss' => 'jf', | ||||||
3424 | |||||||
3425 | # these two are different, but treat the same for now | ||||||
3426 | 'http://backend.userland.com/creativeCommonsRssModule'=>'creativeCommons', | ||||||
3427 | 'http://creativecommons.org/ns#' =>'creativeCommons', | ||||||
3428 | |||||||
3429 | # Common Alerts Protocol | ||||||
3430 | 'urn:oasis:names:tc:emergency:cap:1.1' => 'cap', | ||||||
3431 | |||||||
3432 | # central bank exchange rates format, | ||||||
3433 | # spec http://www.cbwiki.net/wiki/index.php/RSS-CBMain | ||||||
3434 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
3435 | 'http://www.cbwiki.net/wiki/index.php/Specification_1.1' => 'cb', | ||||||
3436 | |||||||
3437 | # earthquakes | ||||||
3438 | # eg. http://earthquake.usgs.gov/earthquakes/shakemap/rss.xml | ||||||
3439 | 'http://earthquake.usgs.gov/rss/1.0/' => 'eq', | ||||||
3440 | |||||||
3441 | 'http://purl.org/dc/elements/1.1/' => 'dc', | ||||||
3442 | 'http://purl.org/dc/terms/' => 'dcterms', | ||||||
3443 | |||||||
3444 | # purl.org might be supposed to be the home for wiki:, but it's a 404 | ||||||
3445 | # and usemod.com suggests its page instead | ||||||
3446 | # Spec at http://www.meatballwiki.org/wiki/ModWiki | ||||||
3447 | 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki', | ||||||
3448 | 'http://www.usemod.com/cgi-bin/mb.pl?ModWiki' => 'wiki', | ||||||
3449 | |||||||
3450 | # not sure if this is supposed to be necessary, but without it | ||||||
3451 | # "xml:lang" attributes are turned into "lang" | ||||||
3452 | 'http://www.w3.org/XML/1998/namespace' => 'xml', | ||||||
3453 | }; | ||||||
3454 | |||||||
3455 | sub twig_parse { | ||||||
3456 | 1 | 1 | 0 | 15 | my ($self, $xml) = @_; | ||
3457 | ### twig_parse() ... | ||||||
3458 | |||||||
3459 | # default "discard_spaces" chucks leading and trailing space on content, | ||||||
3460 | # which is usually a good thing | ||||||
3461 | # | ||||||
3462 | 1 | 1777 | require XML::Twig; | ||||
3463 | 0 | XML::Twig->VERSION('3.34'); # for att_exists() | |||||
3464 | 0 | my $twig = XML::Twig->new (map_xmlns => $map_xmlns, | |||||
3465 | pretty_print => 'wrapped'); | ||||||
3466 | 0 | $twig->safe_parse ($xml); | |||||
3467 | 0 | my $err = $@; | |||||
3468 | ### $err | ||||||
3469 | |||||||
3470 | # Try to fix bad non-ascii chars by putting it through Encode::from_to(). | ||||||
3471 | # Encode::FB_DEFAULT substitutes U+FFFD when going to unicode, or question | ||||||
3472 | # mark "?" going to non-unicode. Mozilla does some sort of similar | ||||||
3473 | # liberal byte interpretation so as to at least display something from a | ||||||
3474 | # dodgy feed. | ||||||
3475 | # | ||||||
3476 | 0 | 0 | 0 | if ($err && $err =~ /not well-formed \(invalid token\) at (line \d+, column \d+, byte (\d+))/) { | |||
3477 | 0 | my $where = $1; | |||||
3478 | 0 | my $byte = ord(substr($xml,$2,1)); | |||||
3479 | 0 | 0 | if ($byte >= 128) { | ||||
3480 | 0 | 0 | my $charset = $twig->encoding // 'utf-8'; | ||||
3481 | 0 | $self->verbose (1, sprintf ("parse error, attempt re-code $charset for byte 0x%02X\n", $byte)); | |||||
3482 | 0 | require Encode; | |||||
3483 | 0 | my $recoded_xml = $xml; | |||||
3484 | 0 | Encode::from_to($recoded_xml, $charset, $charset, Encode::FB_DEFAULT()); | |||||
3485 | |||||||
3486 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3487 | 0 | 0 | if ($twig->safe_parse ($recoded_xml)) { | ||||
3488 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3489 | "Recoded bad bytes to charset $charset"); | ||||||
3490 | print __x("Feed {url}\n recoded {charset} to parse, expect substitutions for bad non-ascii\n ({where})\n", | ||||||
3491 | 0 | url => $self->{'uri'}, | |||||
3492 | charset => $charset, | ||||||
3493 | where => $where); | ||||||
3494 | 0 | undef $err; | |||||
3495 | } | ||||||
3496 | } | ||||||
3497 | } | ||||||
3498 | |||||||
3499 | # Or attempt to put it through XML::Liberal, if available. | ||||||
3500 | # | ||||||
3501 | 0 | 0 | if ($err) { | ||||
3502 | 0 | my $liberal_xml = $self->xml_liberal_correction($xml); | |||||
3503 | 0 | 0 | if (defined $liberal_xml) { | ||||
3504 | ### reparse xml liberal fixup with twig ... | ||||||
3505 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3506 | 0 | 0 | if ($twig->safe_parse ($liberal_xml)) { | ||||
3507 | ### now ok ... | ||||||
3508 | 0 | $err = Text::Trim::trim($err); | |||||
3509 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3510 | "XML::Liberal fixed: {error}", | ||||||
3511 | error => $err); | ||||||
3512 | print __x("Feed {url}\n parse error: {error}\n continuing with repairs by XML::Liberal\n", | ||||||
3513 | 0 | url => $self->{'uri'}, | |||||
3514 | error => $err); | ||||||
3515 | 0 | undef $err; | |||||
3516 | } | ||||||
3517 | } | ||||||
3518 | ### now err: $err | ||||||
3519 | } | ||||||
3520 | |||||||
3521 | 0 | 0 | if ($err) { | ||||
3522 | # XML::Parser seems to stick some spurious leading whitespace on the error | ||||||
3523 | 0 | $err = Text::Trim::trim($err); | |||||
3524 | |||||||
3525 | $self->verbose (1, __x("Parse error on URL {url}\n{error}", | ||||||
3526 | 0 | url => $self->{'uri'}, | |||||
3527 | error => $err)); | ||||||
3528 | 0 | return (undef, $err); | |||||
3529 | } | ||||||
3530 | |||||||
3531 | # Strip any explicit "rss:" or "atom:" namespace down to bare part. | ||||||
3532 | # Should be unambiguous and is easier than giving tag names both with and | ||||||
3533 | # without the namespace. Undocumented set_ns_as_default() might do this | ||||||
3534 | # ... or might not. | ||||||
3535 | # | ||||||
3536 | 0 | my $root = $twig->root; | |||||
3537 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'atom'); | |||||
3538 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'rss'); | |||||
3539 | |||||||
3540 | # somehow map_xmlns mangles default attributes like "decimals=...", prefer | ||||||
3541 | # to see them without rss: or atom: -- maybe | ||||||
3542 | # foreach my $child ($root->descendants_or_self) { | ||||||
3543 | # foreach my $attname ($child->att_names) { | ||||||
3544 | # if ($attname =~ /^(atom|rss):(.*)/) { | ||||||
3545 | # $child->change_att_name($attname, $2); | ||||||
3546 | # } | ||||||
3547 | # } | ||||||
3548 | # } | ||||||
3549 | |||||||
3550 | ### add xml base | ||||||
3551 | 0 | 0 | 0 | if (defined $self->{'uri'} && ! $root->att_exists('xml:base')) { | |||
3552 | 0 | $root->set_att ('xml:base', $self->{'uri'}); | |||||
3553 | } | ||||||
3554 | |||||||
3555 | ### success | ||||||
3556 | 0 | return ($twig, undef); | |||||
3557 | } | ||||||
3558 | |||||||
3559 | sub elt_to_channel { | ||||||
3560 | 0 | 0 | 0 | my ($elt) = @_; | |||
3561 | # parent for RSS or Atom, but sibling "channel" for RDF | ||||||
3562 | 0 | while ($elt->parent) { | |||||
3563 | 0 | $elt = $elt->parent; | |||||
3564 | } | ||||||
3565 | 0 | 0 | return ($elt->first_child('channel') | ||||
3566 | // $elt); | ||||||
3567 | } | ||||||
3568 | |||||||
3569 | # return a Message-ID string for this $item coming from $self->{'uri'} | ||||||
3570 | # | ||||||
3571 | sub item_to_msgid { | ||||||
3572 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3573 | |||||||
3574 | 0 | 0 | if (is_non_empty (my $id = $item->first_child_text('id'))) { | ||||
3575 | # Atom |
||||||
3576 | 0 | return $self->url_to_msgid ($id, $item->first_child_text('updated')); | |||||
3577 | } | ||||||
3578 | |||||||
3579 | 0 | my $guid; | |||||
3580 | 0 | my $isPermaLink = 0; | |||||
3581 | 0 | 0 | if (my $elt = $item->first_child('guid')) { | ||||
3582 | # ignore empty |
||||||
3583 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
3584 | 0 | 0 | if (is_non_empty (my $str = collapse_whitespace ($elt->text))) { | ||||
3585 | 0 | $guid = $str; | |||||
3586 | 0 | 0 | $isPermaLink = (lc($elt->att('isPermaLink') // 'true') eq 'true'); | ||||
3587 | } | ||||||
3588 | } | ||||||
3589 | |||||||
3590 | 0 | 0 | if ($isPermaLink) { # |
||||
3591 | 0 | return $self->url_to_msgid ($guid); | |||||
3592 | } | ||||||
3593 | 0 | 0 | if (my $link = item_yahoo_permalink ($item)) { | ||||
3594 | 0 | return $self->url_to_msgid ($link); | |||||
3595 | } | ||||||
3596 | 0 | 0 | if (defined $guid) { # |
||||
3597 | 0 | return $self->url_to_msgid ($self->{'uri'}, $guid); | |||||
3598 | } | ||||||
3599 | |||||||
3600 | # nothing in the item, use the feed url and MD5 of some fields which | ||||||
3601 | # will hopefully distinguish it from other items at this url | ||||||
3602 | 0 | $self->verbose (2, ' msgid from MD5'); | |||||
3603 | return $self->url_to_msgid | ||||||
3604 | ($self->{'uri'}, | ||||||
3605 | md5_of_utf8 (join_non_empty ('', | ||||||
3606 | 0 | map {$item->first_child_text($_)} | |||||
0 | |||||||
3607 | qw(title | ||||||
3608 | author | ||||||
3609 | dc:creator | ||||||
3610 | description | ||||||
3611 | content | ||||||
3612 | link | ||||||
3613 | pubDate | ||||||
3614 | published | ||||||
3615 | updated | ||||||
3616 | )))); | ||||||
3617 | } | ||||||
3618 | # FIXME: is |
||||||
3619 | # |
||||||
3620 | @known{qw(/channel/item/guid | ||||||
3621 | /channel/item/id | ||||||
3622 | /channel/item/wordzilla:id | ||||||
3623 | /channel/item/slate:id | ||||||
3624 | )} = (); | ||||||
3625 | |||||||
3626 | # Return an "In-Reply-To:" value for $item, being a space-separated list of | ||||||
3627 | # Message-ID strings including angles <>, or undef if nothing. The message | ||||||
3628 | # ids match up to an Atom |
||||||
3629 | # | ||||||
3630 | # RFC 4685 has |
||||||
3631 | # reply to multiple originals. | ||||||
3632 | # | ||||||
3633 | # Eg. comment feeds under | ||||||
3634 | # http://wickedgooddinner.blogspot.com/feeds/posts/default | ||||||
3635 | # | ||||||
3636 | sub item_to_in_reply_to { | ||||||
3637 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3638 | |||||||
3639 | 0 | my @ids; | |||||
3640 | 0 | foreach my $elt ($item->children('thr:in-reply-to')) { | |||||
3641 | 0 | 0 | my $ref = ($elt->att('thr:ref') | ||||
0 | |||||||
0 | |||||||
3642 | // $elt->att('ref') | ||||||
3643 | // $elt->att('atom:ref') # comes out atom: under map_xmlns ... | ||||||
3644 | // next); | ||||||
3645 | # probably shouldn't be relative actually ... | ||||||
3646 | 0 | $ref = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $ref); | |||||
3647 | 0 | push @ids, $self->url_to_msgid ($ref); | |||||
3648 | } | ||||||
3649 | 0 | 0 | if (@ids) { | ||||
3650 | 0 | return join (' ', @ids); | |||||
3651 | } else { | ||||||
3652 | 0 | return undef; | |||||
3653 | } | ||||||
3654 | } | ||||||
3655 | @known{qw(/channel/item/thr:in-reply-to | ||||||
3656 | )} = (); | ||||||
3657 | |||||||
3658 | # Return a string of comma separated keywords per RFC1036 and RFC2822. | ||||||
3659 | # | ||||||
3660 | # RSS |
||||||
3661 | # that in as a bit of a fallback, being better than nothing for | ||||||
3662 | # classification. | ||||||
3663 | # | ||||||
3664 | # Atom |
||||||
3665 | # attribute being the displayable part. Have seen only the "term" attribute | ||||||
3666 | # though. | ||||||
3667 | # | ||||||
3668 | # |
||||||
3669 | # it in for more classification for now. Can have child |
||||||
3670 | # elements as sub-categories, but don't worry about them, haven't seen any | ||||||
3671 | # real ones, only the sample at | ||||||
3672 | # http://www.apple.com/itunes/podcasts/specs.html#example | ||||||
3673 | # | ||||||
3674 | # |
||||||
3675 | # |
||||||
3676 | # particularly informative. | ||||||
3677 | # | ||||||
3678 | # |
||||||
3679 | # should be in the keywords if it's also in the body text, but at least | ||||||
3680 | # offers a bit of classification in the headers. | ||||||
3681 | # | ||||||
3682 | # |
||||||
3683 | # want a bit of decoding. Not much used, but for instance | ||||||
3684 | # http://www.gdacs.org/xml/RSSTC.xml | ||||||
3685 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
3686 | # | ||||||
3687 | # How much value is there in the channel keywords? | ||||||
3688 | # | ||||||
3689 | { | ||||||
3690 | my $re = qr/^(category | ||||||
3691 | |itunes:category | ||||||
3692 | |cap:category | ||||||
3693 | |itunes:keywords | ||||||
3694 | |media:keywords | ||||||
3695 | |dc:subject | ||||||
3696 | |slash:section | ||||||
3697 | |slate:topic | ||||||
3698 | )$/x; | ||||||
3699 | sub item_to_keywords { | ||||||
3700 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3701 | 0 | my $channel = elt_to_channel($item); | |||||
3702 | |||||||
3703 | return join_non_empty | ||||||
3704 | (', ', | ||||||
3705 | List::MoreUtils::uniq | ||||||
3706 | 0 | (map { collapse_whitespace($_) } | |||||
3707 | 0 | map { split /,/ } | |||||
3708 | 0 | 0 | map { ($_->att('text') # itunes:category | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
3709 | // $_->att('itunes:text') # itunes:category | ||||||
3710 | // $_->att('atom:label') # atom |
||||||
3711 | // $_->att('label') # atom |
||||||
3712 | // $_->att('atom:term') # atom |
||||||
3713 | // $_->att('term') # atom |
||||||
3714 | // $_->text) } # other | ||||||
3715 | ($item->children($re), | ||||||
3716 | $channel->children($re), | ||||||
3717 | # |
||||||
3718 | 0 | map {$_->children('cb:keyword')} $item->children, | |||||
0 | |||||||
3719 | ))); | ||||||
3720 | } | ||||||
3721 | # maybe could show |
||||||
3722 | # too, for now just omit | ||||||
3723 | @known{qw(/channel/category | ||||||
3724 | /channel/itunes:category | ||||||
3725 | /channel/itunes:category/itunes:category | ||||||
3726 | |||||||
3727 | /channel/item/category | ||||||
3728 | /channel/item/itunes:keywords | ||||||
3729 | /channel/item/media:keywords | ||||||
3730 | /channel/item/slash:section | ||||||
3731 | /channel/item/slate:topic | ||||||
3732 | /channel/item/slate:section | ||||||
3733 | )} = (); | ||||||
3734 | } | ||||||
3735 | |||||||
3736 | { | ||||||
3737 | # Feturn a string for the "Importance:" header of RFC 1911, RFC 2156 | ||||||
3738 | # voice and X.400 messaging. Possible values 'high', 'normal', 'low'. | ||||||
3739 | # 'normal' is the header default, return undef in that case in the | ||||||
3740 | # interests of not junking up headers with defaults | ||||||
3741 | # | ||||||
3742 | my %cap_severity_high = (extreme => 1, | ||||||
3743 | severe => 1); | ||||||
3744 | my %cap_severity_normal = (moderate => 1); | ||||||
3745 | my %cap_severity_low = (minor => 1); | ||||||
3746 | |||||||
3747 | sub item_to_importance { | ||||||
3748 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3749 | |||||||
3750 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3751 | // ''); | ||||||
3752 | 0 | 0 | my $wiki_importance = ($item->first_child_trimmed_text('wiki:importance') | ||||
3753 | // ''); | ||||||
3754 | 0 | 0 | if ($cap_severity) { | ||||
3755 | 0 | $self->verbose (2, " CAP severity: ",$cap_severity); | |||||
3756 | 0 | $self->verbose (2, " Wiki importance: ",$wiki_importance); | |||||
3757 | } | ||||||
3758 | |||||||
3759 | 0 | 0 | if ($cap_severity_high{$cap_severity}) { | ||||
3760 | 0 | return 'high'; | |||||
3761 | } | ||||||
3762 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3763 | 0 | return undef; # default "normal" | |||||
3764 | } | ||||||
3765 | 0 | 0 | 0 | if ($cap_severity_low{$cap_severity} | |||
3766 | || $wiki_importance eq 'minor') { | ||||||
3767 | 0 | return 'low'; | |||||
3768 | } | ||||||
3769 | 0 | return undef; # unknown | |||||
3770 | } | ||||||
3771 | @known{qw(/channel/item/wiki:importance | ||||||
3772 | )} = (); | ||||||
3773 | } | ||||||
3774 | { | ||||||
3775 | # Return a string for the "Priority:" header of RFC 1327, RFC 2156. | ||||||
3776 | # Possible values 'urgent', 'normal', 'non-urgent'. | ||||||
3777 | # 'normal' is the header default, return undef in that case in the | ||||||
3778 | # interests of not junking up headers with defaults | ||||||
3779 | # | ||||||
3780 | # |
||||||
3781 | # for when response action should be taken. Is the |
||||||
3782 | # better indicator of transmission priority? | ||||||
3783 | # | ||||||
3784 | my %cap_severity_urgent = (extreme => 1, | ||||||
3785 | severe => 1); | ||||||
3786 | my %cap_severity_normal = (moderate => 1); | ||||||
3787 | |||||||
3788 | sub item_to_priority { | ||||||
3789 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3790 | |||||||
3791 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3792 | // ''); | ||||||
3793 | |||||||
3794 | 0 | 0 | if ($cap_severity_urgent{$cap_severity}) { | ||||
3795 | 0 | return 'urgent'; | |||||
3796 | } | ||||||
3797 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3798 | 0 | return undef; # default "normal" | |||||
3799 | } | ||||||
3800 | 0 | if (0) { # nothing for this yet | |||||
3801 | return 'non-urgent'; | ||||||
3802 | } | ||||||
3803 | 0 | return undef; # unknown | |||||
3804 | } | ||||||
3805 | } | ||||||
3806 | |||||||
3807 | # return a string for the slightly unofficial "Precedence:" header | ||||||
3808 | # might be able to identify lists gatewayed to RSS and give "list" for them | ||||||
3809 | # maybe "bulk" would suit low priority stuff | ||||||
3810 | # for now nothing | ||||||
3811 | # | ||||||
3812 | # sub item_to_precedence { | ||||||
3813 | # my ($self, $item) = @_; | ||||||
3814 | # return undef; # nothing | ||||||
3815 | # } | ||||||
3816 | |||||||
3817 | # return the host part of $self->{'uri'}, or "localhost" if none | ||||||
3818 | sub uri_to_host { | ||||||
3819 | 0 | 0 | 0 | my ($self) = @_; | |||
3820 | 0 | my $uri = $self->{'uri'}; | |||||
3821 | ### uri_to_host(): $uri | ||||||
3822 | 0 | 0 | return (non_empty ($uri && $uri->can('host') && $uri->host) | ||||
0 | |||||||
3823 | // 'localhost'); | ||||||
3824 | } | ||||||
3825 | |||||||
3826 | sub item_to_subject { | ||||||
3827 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3828 | |||||||
3829 | # Atom |
||||||
3830 | return | ||||||
3831 | 0 | 0 | (elt_to_rendered_line ($item->first_child('title')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
3832 | |||||||
3833 | # |
||||||
3834 | # present without a plain |
||||||
3835 | # | ||||||
3836 | // elt_to_rendered_line ($item->first_child('dc:title')) | ||||||
3837 | |||||||
3838 | # eg. https://archive.org/services/collection-rss.php has |
||||||
3839 | # in addition to plain |
||||||
3840 | # |
||||||
3841 | # | ||||||
3842 | // elt_to_rendered_line ($item->first_child('media:title')) | ||||||
3843 | |||||||
3844 | # |
||||||
3845 | # better than nothing. Not sure have ever actually seen |
||||||
3846 | # without |
||||||
3847 | # | ||||||
3848 | // elt_to_rendered_line ($item->first_child('dc:subject')) | ||||||
3849 | |||||||
3850 | // __('no subject')); | ||||||
3851 | } | ||||||
3852 | @known{qw(/channel/title | ||||||
3853 | /channel/dc:subject | ||||||
3854 | /channel/subtitle | ||||||
3855 | /channel/itunes:subtitle | ||||||
3856 | |||||||
3857 | /channel/item/dc:subject | ||||||
3858 | /channel/item/title | ||||||
3859 | /channel/item/media:title | ||||||
3860 | /channel/item/dc:title | ||||||
3861 | /channel/item/itunes:title | ||||||
3862 | /channel/item/itunes:subtitle --not-using-this-as-yet | ||||||
3863 | /channel/item/slate:menuline --copy-of-subject-it-seems | ||||||
3864 | /channel/item/slate:rubric --blog-title | ||||||
3865 | /channel/item/slate:blog --blog-title | ||||||
3866 | /channel/item/slate:legacy_url --same-as-link-it-seems | ||||||
3867 | )} = (); | ||||||
3868 | |||||||
3869 | |||||||
3870 | # return language code string for Content-Language, or undef | ||||||
3871 | # return is per RFC 1766, RFC 3066, RFC 4646 | ||||||
3872 | # | ||||||
3873 | # xml:lang is defined to be per RFC 4646, no mangling needed | ||||||
3874 | # RSS |
||||||
3875 | # |
||||||
3876 | # cf. I18N::LangTags if mangling might be needed one day | ||||||
3877 | # | ||||||
3878 | sub item_to_language { | ||||||
3879 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3880 | 0 | my $lang; | |||||
3881 | |||||||
3882 | 0 | 0 | if (my $elt = $item->first_child('content')) { | ||||
3883 | 0 | $lang = non_empty ($elt->att('xml:lang')); | |||||
3884 | } | ||||||
3885 | # Either |
||||||
3886 | # xml:lang="" tag, in the item itself or in channel, and maybe xml:lang in | ||||||
3887 | # toplevel |
||||||
3888 | # xml:lang, not a |
||||||
3889 | 0 | for ( ; $item; $item = $item->parent) { | |||||
3890 | 0 | 0 | $lang //= (non_empty ($item->first_child_trimmed_text | ||||
0 | |||||||
0 | |||||||
3891 | (qr/^((dc:)?language|twitter:lang)$/)) | ||||||
3892 | // non_empty ($item->att('xml:lang')) | ||||||
3893 | // next); | ||||||
3894 | } | ||||||
3895 | 0 | 0 | return ($lang // $self->{'resp'}->content_language); | ||||
3896 | } | ||||||
3897 | @known{qw(/channel/language | ||||||
3898 | /channel/dc:language | ||||||
3899 | /channel/twitter:lang | ||||||
3900 | /channel/item/language | ||||||
3901 | /channel/item/dc:language | ||||||
3902 | /channel/item/twitter:lang | ||||||
3903 | )} = (); | ||||||
3904 | |||||||
3905 | # return arrayref of copyright strings | ||||||
3906 | # Keep all of multiple rights/license/etc in the interests of preserving all | ||||||
3907 | # statements. | ||||||
3908 | sub item_to_copyright { | ||||||
3909 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3910 | 0 | my $channel = elt_to_channel($item); | |||||
3911 | |||||||
3912 | # |
||||||
3913 | # suppress the latter in the presence of the former (dcterms: collapsed to | ||||||
3914 | # dc: by the map_xmlns). | ||||||
3915 | # | ||||||
3916 | # Atom |
||||||
3917 | # always plain text | ||||||
3918 | # | ||||||
3919 | 0 | my $re = qr/^(rights # Atom | |||||
3920 | |copyright # RSS, don't think entity-encoded html allowed there | ||||||
3921 | |dcterms:license | ||||||
3922 | |dc:rights | ||||||
3923 | |creativeCommons:licen[cs]e | ||||||
3924 | )$/x; | ||||||
3925 | # Atom sub-elem |
||||||
3926 | 0 | my @parents = ($item, $channel, $item->children('source')); | |||||
3927 | |||||||
3928 | 0 | my @strings; | |||||
3929 | 0 | foreach my $elt (map {$_->children($re)} @parents) { | |||||
0 | |||||||
3930 | 0 | push @strings, | |||||
3931 | join_non_empty(' ', | ||||||
3932 | elt_to_rendered_line($elt), | ||||||
3933 | # eg. |
||||||
3934 | $elt->att('rdf:resource')); | ||||||
3935 | } | ||||||
3936 | |||||||
3937 | # | ||||||
3938 | 0 | foreach my $link (map {$_->children('link')} @parents) { | |||||
0 | |||||||
3939 | ### link for copyright: $link->sprint | ||||||
3940 | 0 | 0 | 0 | if (($link->att('atom:rel')//$link->att('rel')//'') eq 'license') { | |||
0 | |||||||
3941 | 0 | 0 | push @strings, $link->att('atom:href')//$link->att('href'); | ||||
3942 | } | ||||||
3943 | } | ||||||
3944 | ### @strings | ||||||
3945 | 0 | return [ List::MoreUtils::uniq(grep {defined} @strings) ]; | |||||
0 | |||||||
3946 | } | ||||||
3947 | @known{qw(/channel/copyright | ||||||
3948 | /channel/rights | ||||||
3949 | /channel/dc:rights | ||||||
3950 | /channel/dc:license | ||||||
3951 | /channel/creativeCommons:licence | ||||||
3952 | /channel/creativeCommons:license | ||||||
3953 | /channel/item/dc:rights | ||||||
3954 | /channel/item/dc:license | ||||||
3955 | /channel/item/creativeCommons:licence | ||||||
3956 | /channel/item/creativeCommons:license | ||||||
3957 | )} = (); | ||||||
3958 | # /channel/item/media:credit --nothing-much-in-this-one | ||||||
3959 | |||||||
3960 | |||||||
3961 | # return string or undef | ||||||
3962 | sub item_to_generator { | ||||||
3963 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3964 | 0 | my $channel = elt_to_channel($item); | |||||
3965 | 0 | my @strings; | |||||
3966 | |||||||
3967 | # both RSS and Atom use |
||||||
3968 | # Atom can include version="" and uri="" | ||||||
3969 | 0 | 0 | if (my $generator = $channel->first_child('generator')) { | ||||
3970 | 0 | push @strings, join_non_empty (' ', | |||||
3971 | $generator->text, | ||||||
3972 | $generator->att('atom:version'), | ||||||
3973 | $generator->att('version'), | ||||||
3974 | $generator->att('atom:uri'), | ||||||
3975 | $generator->att('uri')); | ||||||
3976 | } | ||||||
3977 | |||||||
3978 | # FIXME: is this bit right? | ||||||
3979 | # | ||||||
3980 | # source="<a href="http://nongnu.org/identica-mode/" rel="nofollow">Emacs Identica-mode</a>" | ||||||
3981 | # source_link="http://nongnu.org/identica-mode/"> | ||||||
3982 | # | ||||||
3983 | 0 | 0 | if (my $notice = $item->first_child('statusnet:notice_info')) { | ||||
3984 | 0 | 0 | if (defined (my $html = $notice->att('atom:source'))) { | ||||
3985 | 0 | push @strings, join_non_empty (' ', | |||||
3986 | html_to_rendered_line($html), | ||||||
3987 | $notice->att('atom:source_link')); | ||||||
3988 | } | ||||||
3989 | } | ||||||
3990 | |||||||
3991 | 0 | return collapse_whitespace (join_non_empty (', ', @strings)); | |||||
3992 | } | ||||||
3993 | @known{qw(/channel/item/statusnet:notice_info | ||||||
3994 | )} = (); | ||||||
3995 | |||||||
3996 | # return URL string or undef/empty | ||||||
3997 | sub item_to_feedburner { | ||||||
3998 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3999 | 0 | my $channel = elt_to_channel($item); | |||||
4000 | 0 | 0 | my $elt = $channel->first_child('feedburner:info') || return; | ||||
4001 | 0 | 0 | my $uri = $elt->att('uri') // return; | ||||
4002 | 0 | return URI->new_abs ($uri, 'http://feeds.feedburner.com/')->as_string; | |||||
4003 | } | ||||||
4004 | |||||||
4005 | # $elt is an Atom |
||||||
4006 | sub atom_content_flavour { | ||||||
4007 | 0 | 0 | 0 | my ($elt) = @_; | |||
4008 | 0 | 0 | if (! defined $elt) { return ''; } | ||||
0 | |||||||
4009 | 0 | 0 | my $type = ($elt->att('atom:type') // $elt->att('type')); | ||||
4010 | 0 | 0 | 0 | if ($elt->att('atom:src') || $elt->att('src')) { | |||
4011 | # |
||||||
4012 | 0 | return 'link'; | |||||
4013 | } | ||||||
4014 | 0 | 0 | 0 | if (! defined $type | |||
0 | |||||||
0 | |||||||
0 | |||||||
4015 | || $type eq 'html' | ||||||
4016 | || $type eq 'xhtml' | ||||||
4017 | || $type eq 'application/xhtml+xml' | ||||||
4018 | || $type =~ m{^text/}) { | ||||||
4019 | 0 | return 'body'; | |||||
4020 | } | ||||||
4021 | 0 | return 'attach'; | |||||
4022 | } | ||||||
4023 | |||||||
4024 | sub html_wrap_fragment { | ||||||
4025 | 0 | 0 | 0 | my ($item, $fragment, $language) = @_; | |||
4026 | 0 | 0 | my $charset = (is_ascii($fragment) ? 'us-ascii' : 'utf-8'); | ||||
4027 | 0 | my $base_uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base($item); | |||||
4028 | 0 | 0 | my $base_header = (defined $base_uri | ||||
4029 | ? " |
||||||
4030 | : ''); | ||||||
4031 | 0 | 0 | if (is_non_empty ($language)) { | ||||
4032 | 0 | $language = " lang=\"$Entitize{$language}\""; | |||||
4033 | } else { | ||||||
4034 | 0 | $language = ''; | |||||
4035 | } | ||||||
4036 | 0 | return (<<"HERE", $charset); | |||||
4037 | |||||||
4038 | |||||||
4039 | |||||||
4040 | |||||||
4041 | $base_header | ||||||
4042 | |||||||
4043 | $fragment | ||||||
4044 | |||||||
4045 | HERE | ||||||
4046 | } | ||||||
4047 | |||||||
4048 | # $self->{'rss_charset_override'}, if set, means the bytes are actually in | ||||||
4049 | # that charset. Enforce this by replacing the " | ||||||
4050 | # bytes. Do a decode() and re-encode() to cope with non-ascii like say | ||||||
4051 | # utf-16. | ||||||
4052 | # | ||||||
4053 | # XML::RSS::LibXML has an "encoding" option on its new(), but that's for | ||||||
4054 | # feed creation or something, a parse() still follows the tag. | ||||||
4055 | # | ||||||
4056 | sub enforce_rss_charset_override { | ||||||
4057 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
4058 | 0 | 0 | if (my $charset = $self->{'rss_charset_override'}) { | ||||
4059 | 0 | $xml = Encode::decode ($charset, $xml); | |||||
4060 | 0 | 0 | if ($xml =~ s/(<\?xml[^>]*encoding="?)([^">]+)/$1$charset/i) { | ||||
0 | |||||||
4061 | 0 | $self->verbose (2, "replace encoding=$2 tag with encoding=$charset"); | |||||
4062 | } elsif ($xml =~ s/(<\?xml[^?>]*)/$1 encoding="$charset"/i) { | ||||||
4063 | 0 | $self->verbose (2, "insert encoding=\"$charset\""); | |||||
4064 | } else { | ||||||
4065 | 0 | my $str = "\n"; | |||||
4066 | 0 | $self->verbose (2, "insert $str"); | |||||
4067 | 0 | $xml = $str . $xml; | |||||
4068 | } | ||||||
4069 | 0 | $self->verbose (3, "xml now:\n$xml\n"); | |||||
4070 | 0 | $xml = Encode::encode ($charset, $xml); | |||||
4071 | } | ||||||
4072 | 0 | return $xml; | |||||
4073 | } | ||||||
4074 | |||||||
4075 | # slightly experimental extract of "cap" fields as from | ||||||
4076 | # http://www.nws.noaa.gov/alerts-beta/ | ||||||
4077 | # http://www.weather.gov/alerts-beta/ca.php?x=0 | ||||||
4078 | sub item_common_alert_protocol { | ||||||
4079 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
4080 | 0 | my @fields; | |||||
4081 | 0 | foreach my $elt ($item->children(qr/^cap:/)) { | |||||
4082 | 0 | (my $field = $elt->name) =~ s/^cap://; | |||||
4083 | 0 | 0 | 0 | if ($field eq 'geocode' || $field eq 'parameter') { | |||
4084 | # dunno how to show these yet ... | ||||||
4085 | 0 | next; | |||||
4086 | } | ||||||
4087 | 0 | $known{'/channel/item/'.$elt->name} = undef; | |||||
4088 | |||||||
4089 | 0 | my $value = elt_to_rendered_line ($elt); | |||||
4090 | 0 | $value = Text::Trim::trim ($value); | |||||
4091 | 0 | 0 | if (is_non_empty ($value)) { | ||||
4092 | 0 | push @fields, [ "\u$field: ", $value ]; | |||||
4093 | } | ||||||
4094 | } | ||||||
4095 | 0 | 0 | if (! @fields) { | ||||
4096 | 0 | return ''; | |||||
4097 | } | ||||||
4098 | # FIXME: This $width padding doesn't come out in html, only in text. The | ||||||
4099 | # NOAA is Atom plain text, so that one is ok at least. | ||||||
4100 | 0 | my $width = max(map {length $_->[0]} @fields); | |||||
0 | |||||||
4101 | 0 | @fields = map { my $field = $_->[0]; | |||||
0 | |||||||
4102 | 0 | my $value = $_->[1]; | |||||
4103 | 0 | $field = sprintf ('%-*s', $width, $field); | |||||
4104 | 0 | $self->text_wrap ($value, $field) | |||||
4105 | } @fields; | ||||||
4106 | 0 | 0 | if ($want_html) { | ||||
4107 | return " \n" |
||||||
4108 | 0 | . join(" \n", map {$Entitize{$_}} @fields) |
|||||
0 | |||||||
4109 | . "\n\n"; | ||||||
4110 | } else { | ||||||
4111 | 0 | return "\n" | |||||
4112 | . join("\n", @fields) | ||||||
4113 | . "\n"; | ||||||
4114 | } | ||||||
4115 | } | ||||||
4116 | |||||||
4117 | sub item_unknowns { | ||||||
4118 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
4119 | ### item_unknowns() ... | ||||||
4120 | |||||||
4121 | 0 | my $xml = ''; | |||||
4122 | 0 | 0 | foreach my $elt (map {$_->tag eq 'media:group' # descend into media:group | ||||
0 | |||||||
4123 | ? $_->children : $_} | ||||||
4124 | $item->children) { | ||||||
4125 | 0 | 0 | next if $elt->tag =~ /^#/; # text | ||||
4126 | 0 | 0 | next if App::RSS2Leafnode::XML::Twig::Other::elt_is_empty($elt); | ||||
4127 | 0 | my $path = $elt->path; | |||||
4128 | 0 | $path =~ s{^/(rss|channel)/channel}{/channel}; | |||||
4129 | 0 | $path =~ s{^/(feed|rdf:RDF)}{/channel}; | |||||
4130 | 0 | $path =~ s{^/channel/entry}{/channel/item}; | |||||
4131 | 0 | 0 | next if $path =~ m{/xhtml}; | ||||
4132 | 0 | 0 | next if $path =~ m{^/channel/item/(description|content:encoded)/}; | ||||
4133 | 0 | 0 | next if exists $known{$path}; | ||||
4134 | ### unknown path: $path | ||||||
4135 | |||||||
4136 | 0 | require Text::Wrap; | |||||
4137 | 0 | my $part = do { | |||||
4138 | 0 | local $Text::Wrap::columns = $self->{'render_width'} + 1 + 4; | |||||
4139 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long words | |||||
4140 | 0 | local $Text::Wrap::unexpand = 0; # no tabs in output | |||||
4141 | 0 | $elt->sprint | |||||
4142 | }; | ||||||
4143 | 0 | $part =~ s/^ //mg; # indentation from element depth | |||||
4144 | 0 | $part =~ s/^\n+//; # leading blank lines | |||||
4145 | 0 | $xml .= $part; | |||||
4146 | } | ||||||
4147 | 0 | 0 | if ($xml eq '') { | ||||
4148 | 0 | return ''; | |||||
4149 | } | ||||||
4150 | ### $xml | ||||||
4151 | |||||||
4152 | 0 | 0 | if ($want_html) { | ||||
4153 | 0 | return "\n \n" . __('Further feed XML:') . " |
|||||
4154 | . "$Entitize{$xml}\n\n"; |
||||||
4155 | } else { | ||||||
4156 | 0 | return "\n" . __('Further feed XML:') . "\n" . $xml; | |||||
4157 | } | ||||||
4158 | } | ||||||
4159 | |||||||
4160 | @known{qw(/channel/item/media:group/media:title | ||||||
4161 | /channel/item/media:group/media:description | ||||||
4162 | /channel/item/media:group/media:credit | ||||||
4163 | /channel/item/media:group/media:player | ||||||
4164 | /channel/item/media:group/media:thumbnail | ||||||
4165 | /channel/item/media:group/media:content | ||||||
4166 | /channel/item/media:group/media:copyright | ||||||
4167 | |||||||
4168 | --ENHANCE-ME--nothing-for-these-yet | ||||||
4169 | /channel/item/media:group/media:category | ||||||
4170 | /channel/item/media:group/media:rating | ||||||
4171 | )} = (); # hash slice | ||||||
4172 | |||||||
4173 | sub media_group_to_html { | ||||||
4174 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
4175 | ### media_group_to_html(): "$group" | ||||||
4176 | |||||||
4177 | 0 | my $ret = " \n"; |
|||||
4178 | 0 | my @lines; | |||||
4179 | |||||||
4180 | 0 | foreach my $elt ($group->children('media:title'), | |||||
4181 | $group->children('media:description')) { | ||||||
4182 | 0 | push @lines, elt_to_html($elt); | |||||
4183 | } | ||||||
4184 | |||||||
4185 | 0 | foreach my $elt ($group->children('media:credit')) { | |||||
4186 | 0 | my $html = elt_to_html($elt); | |||||
4187 | 0 | 0 | if (defined (my $role = non_empty($elt->att('role')))) { | ||||
4188 | 0 | $html .= " ($Entitize{$role})"; | |||||
4189 | } | ||||||
4190 | 0 | push @lines, $html; | |||||
4191 | } | ||||||
4192 | 0 | foreach my $elt ($group->children('media:player'), | |||||
4193 | $group->children('media:thumbnail'), | ||||||
4194 | $group->children('media:content')) { | ||||||
4195 | 0 | 0 | my $url = $elt->att('url') // next; | ||||
4196 | 0 | my $abs_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri | |||||
4197 | ($group, $url); | ||||||
4198 | |||||||
4199 | 0 | my $html = " | |||||
4200 | 0 | 0 | if (defined (my $type = non_empty($elt->att('type')))) { | ||||
4201 | 0 | $html .= " type=\"$Entitize{$type}\""; | |||||
4202 | } | ||||||
4203 | 0 | 0 | if (defined (my $lang = non_empty($elt->att('lang')))) { | ||||
4204 | 0 | $html .= " hreflang=\"$Entitize{$lang}\""; | |||||
4205 | } | ||||||
4206 | 0 | $html .= ">$Entitize{$url}$url"; | |||||
4207 | { | ||||||
4208 | 0 | my @paren; | |||||
0 | |||||||
4209 | 0 | 0 | if (defined (my $size = non_empty($elt->att('fileSize')))) { | ||||
4210 | 0 | push @paren, $self->format_size_in_bytes($size); | |||||
4211 | } | ||||||
4212 | 0 | 0 | if (defined (my $duration = non_empty($elt->att('duration')))) { | ||||
4213 | 0 | 0 | if ($duration !~ /:/) { | ||||
4214 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
4215 | duration => $duration); | ||||||
4216 | } | ||||||
4217 | 0 | push @paren, $duration; | |||||
4218 | } | ||||||
4219 | 0 | 0 | if (@paren) { | ||||
4220 | 0 | $html .= $Entitize{' (' . join(', ',@paren). ')'}; | |||||
4221 | } | ||||||
4222 | } | ||||||
4223 | 0 | $html .= "\n"; | |||||
4224 | 0 | push @lines, $html; | |||||
4225 | } | ||||||
4226 | |||||||
4227 | 0 | foreach my $elt ($group->children('media:copyright')) { | |||||
4228 | 0 | push @lines, "Copyright: ".elt_to_html($elt); | |||||
4229 | } | ||||||
4230 | |||||||
4231 | ### total lines: scalar(@lines) | ||||||
4232 | 0 | return " \n" . join(" |
|||||
4233 | } | ||||||
4234 | |||||||
4235 | sub elt_to_html { | ||||||
4236 | 0 | 0 | 0 | my ($elt) = @_; | |||
4237 | 0 | 0 | defined $elt or return; | ||||
4238 | |||||||
4239 | 0 | my $type = elt_content_type ($elt); | |||||
4240 | 0 | 0 | if ($type eq 'xhtml') { | ||||
4241 | 0 | return elt_xhtml_to_html($elt); | |||||
4242 | } | ||||||
4243 | 0 | my $str = elt_subtext($elt); | |||||
4244 | 0 | 0 | if ($type eq 'html') { | ||||
4245 | 0 | return $str; | |||||
4246 | } else { | ||||||
4247 | 0 | return $Entitize{$str}; | |||||
4248 | } | ||||||
4249 | } | ||||||
4250 | |||||||
4251 | # $body construction below | ||||||
4252 | @known{qw(/channel/item/description | ||||||
4253 | /channel/item/dc:description | ||||||
4254 | /channel/item/itunes:summary | ||||||
4255 | /channel/item/content:encoded | ||||||
4256 | /channel/item/summary | ||||||
4257 | )} = (); | ||||||
4258 | |||||||
4259 | # $item is an XML::Twig::Elt | ||||||
4260 | # | ||||||
4261 | sub fetch_rss_process_one_item { | ||||||
4262 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4263 | 0 | my $subject = $self->item_to_subject ($item); | |||||
4264 | 0 | $self->verbose (1, ' ', __x('item: {subject}', subject => $subject)); | |||||
4265 | |||||||
4266 | 0 | my $msgid = $self->item_to_msgid ($item); | |||||
4267 | 0 | my $new = 0; | |||||
4268 | |||||||
4269 | 0 | 0 | if (! $self->nntp_message_id_exists ($msgid)) { | ||||
4270 | 0 | my $channel = elt_to_channel($item); | |||||
4271 | 0 | my ($from, $sender, @from_links) = $self->item_to_from($item); | |||||
4272 | 0 | my @links = ($self->item_to_links ($item), | |||||
4273 | @from_links); | ||||||
4274 | |||||||
4275 | # For comments feeds show "Re: Foo" as the subject. Haven't seen a | ||||||
4276 | # comments feed with anything useful in the |
||||||
4277 | # including it at the start of the message body if it was any good. | ||||||
4278 | # | ||||||
4279 | # | ||||||
4280 | # http://www.netzpolitik.org/feed/ has |
||||||
4281 | # just "Von: Foo" where Foo is the poster's name. | ||||||
4282 | # | ||||||
4283 | # my $dummy = $self->DUMMY_EMAIL_ADDRESS; | ||||||
4284 | # if ($from =~ /(.*) <\Q$dummy\E>$/ | ||||||
4285 | # && $subject eq "Von: $1") { | ||||||
4286 | # $subject = $self->{'getting_rss_comments'}; | ||||||
4287 | # } | ||||||
4288 | # | ||||||
4289 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4290 | 0 | $subject = $self->{'getting_rss_comments'}; | |||||
4291 | } | ||||||
4292 | |||||||
4293 | 0 | my $list_post = googlegroups_link_email(@links); | |||||
4294 | 0 | 0 | my $precedence = (defined $list_post ? 'list' : undef); | ||||
4295 | 0 | my $language = $self->item_to_language($item); | |||||
4296 | |||||||
4297 | # RSS |
||||||
4298 | # http://www.w3.org/TR/REC-PICS-labels | ||||||
4299 | # ENHANCE-ME: Maybe transform |
||||||
4300 | # PICS too maybe, unless it only applies to the enclosure as such. Maybe | ||||||
4301 | # |
||||||
4302 | 0 | my $pics_label = collapse_whitespace ($channel->first_child_text('rating')); | |||||
4303 | |||||||
4304 | # Crib: an undef value for a header means omit that header, which is good | ||||||
4305 | # for say the merely optional "Content-Language" | ||||||
4306 | # | ||||||
4307 | # there can be multiple "feed" links from Atom ... | ||||||
4308 | # 'X-RSS-Feed-Link:' => $channel->{'link'}, | ||||||
4309 | # | ||||||
4310 | my %headers | ||||||
4311 | = ('Path:' => scalar ($self->uri_to_host), | ||||||
4312 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
4313 | From => $from, | ||||||
4314 | Sender => $from, | ||||||
4315 | Subject => $subject, | ||||||
4316 | Keywords => scalar ($self->item_to_keywords($item)), | ||||||
4317 | Date => scalar ($self->item_to_date($item)), | ||||||
4318 | 'In-Reply-To:' => scalar ($self->item_to_in_reply_to($item)), | ||||||
4319 | References => $self->{'References:'}, | ||||||
4320 | 'Message-ID' => $msgid, | ||||||
4321 | 'Content-Language:' => $language, | ||||||
4322 | 'Importance:' => scalar ($self->item_to_importance($item)), | ||||||
4323 | 'Priority:' => scalar ($self->item_to_priority($item)), | ||||||
4324 | 'Face:' => scalar ($self->item_to_face($item)), | ||||||
4325 | 'List-Post:' => $list_post, | ||||||
4326 | 'Precedence:' => $precedence, | ||||||
4327 | 'PICS-Label:' => $pics_label, | ||||||
4328 | 'X-Copyright:' => scalar ($self->item_to_copyright($item)), | ||||||
4329 | 0 | 'X-RSS-URL:' => scalar ($self->{'uri'}->as_string), | |||||
4330 | 'X-RSS-Feedburner:' => scalar ($self->item_to_feedburner($item)), | ||||||
4331 | 'X-RSS-Generator:' => scalar ($self->item_to_generator($item)), | ||||||
4332 | ); | ||||||
4333 | |||||||
4334 | 0 | my $attach_elt; | |||||
4335 | |||||||
4336 | # |
||||||
4337 | # a copy of |
||||||
4338 | # | ||||||
4339 | # ENHANCE-ME: |
||||||
4340 | # as well as |
||||||
4341 | # | ||||||
4342 | my $body = ( | ||||||
4343 | # |
||||||
4344 | # |
||||||
4345 | $item->first_child('content:encoded') | ||||||
4346 | || $item->first_child('description') | ||||||
4347 | || $item->first_child('dc:description') | ||||||
4348 | || $item->first_child('itunes:summary') | ||||||
4349 | 0 | 0 | || do { | ||||
4350 | # Atom spec is for no more than one |
||||||
4351 | # Exclude "link", and leave "attach" to code below. | ||||||
4352 | my $elt = $item->first_child('content'); | ||||||
4353 | my $flavour = atom_content_flavour($elt); | ||||||
4354 | ($flavour eq 'link' ? undef | ||||||
4355 | : $flavour eq 'attach' ? do { $attach_elt = $elt; undef } | ||||||
4356 | : $elt) | ||||||
4357 | } | ||||||
4358 | || $item->first_child('summary')); # Atom | ||||||
4359 | |||||||
4360 | 0 | my $body_type = elt_content_type ($body); | |||||
4361 | 0 | $self->verbose (3, ' body_type from elt: ', $body_type); | |||||
4362 | 0 | my $body_charset = 'utf-8'; | |||||
4363 | 0 | my $body_base_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base ($body); | |||||
4364 | 0 | 0 | if (! defined $body_type) { # no $body element at all | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4365 | 0 | $body = ''; | |||||
4366 | 0 | $body_type = 'text/plain'; | |||||
4367 | |||||||
4368 | } elsif ($body_type eq 'xhtml') { # Atom | ||||||
4369 | 0 | $body = elt_xhtml_to_html ($body); | |||||
4370 | 0 | $body_type = 'html'; | |||||
4371 | |||||||
4372 | } elsif ($body_type eq 'html') { # RSS or Atom | ||||||
4373 | 0 | $body = elt_subtext($body); | |||||
4374 | |||||||
4375 | } elsif ($body_type eq 'text') { # Atom 'text' to be flowed | ||||||
4376 | # should be text-only, no sub-elements, but extract sub-elements to | ||||||
4377 | # cope with dodgy feeds with improperly escaped html etc | ||||||
4378 | 0 | $body = $self->text_wrap (elt_subtext ($body)); | |||||
4379 | 0 | $body_type = 'text/plain'; | |||||
4380 | } elsif ($body_type =~ m{^text/}) { # Atom mime text type | ||||||
4381 | 0 | $body = elt_subtext ($body); | |||||
4382 | |||||||
4383 | } else { # Atom base64 something | ||||||
4384 | 0 | $body = MIME::Base64::decode ($body->text); | |||||
4385 | 0 | $body_charset = undef; | |||||
4386 | } | ||||||
4387 | 0 | 0 | $self->verbose (3, " body: $body_type charset=", | ||||
4388 | $body_charset//'undef', "\n", | ||||||
4389 | "$body\n"); | ||||||
4390 | |||||||
4391 | 0 | 0 | my $body_is_html = ($body_type eq 'html'|| $body_type eq 'text/html'); | ||||
4392 | 0 | 0 | my $links_want_html = ($body_is_html && ! $self->{'render'}); | ||||
4393 | 0 | 0 | $self->verbose (3, " links_want_html: ", | ||||
4394 | ($links_want_html ? "yes" : "no")); | ||||||
4395 | |||||||
4396 | # sort downloadables to the start, then by "priority" | ||||||
4397 | 2 | 2 | 1083 | use sort 'stable'; | |||
2 | 758 | ||||||
2 | 9 | ||||||
4398 | 0 | @links = sort {($b->{'download'}||0) <=> ($a->{'download'}||0) | |||||
4399 | 0 | 0 | 0 | || ($b->{'priority'}||0) <=> ($a->{'priority'}||0)} | |||
0 | |||||||
0 | |||||||
0 | |||||||
4400 | @links; | ||||||
4401 | 0 | 0 | my $links_str = ($links_want_html | ||||
4402 | ? links_to_html(@links) | ||||||
4403 | : links_to_text(@links)); | ||||||
4404 | 0 | $links_str .= $self->item_common_alert_protocol($item, $links_want_html); | |||||
4405 | 0 | my @parts; | |||||
4406 | |||||||
4407 | # |
||||||
4408 | { | ||||||
4409 | 0 | my $content = join ("\n", | |||||
4410 | 0 | map {$self->media_group_to_html($_)} | |||||
0 | |||||||
4411 | $item->children('media:group')); | ||||||
4412 | 0 | 0 | if (is_non_empty($content)) { | ||||
4413 | 0 | ($content, my $charset) = html_wrap_fragment ($item, $content); | |||||
4414 | 0 | my $content_type = 'text/html'; | |||||
4415 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
4416 | = $self->render_maybe ($content, $content_type, $charset, | ||||||
4417 | $body_base_url); | ||||||
4418 | ### media group content: $content | ||||||
4419 | 0 | 0 | if ($content_type eq 'text/plain') { | ||||
4420 | 0 | $links_str .= $content; | |||||
4421 | } else { | ||||||
4422 | 0 | $content = Encode::encode ($charset, $content); | |||||
4423 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4424 | Type => $content_type, | ||||||
4425 | Charset => $charset, | ||||||
4426 | Data => $content); | ||||||
4427 | } | ||||||
4428 | } | ||||||
4429 | } | ||||||
4430 | |||||||
4431 | 0 | 0 | if (is_non_empty(my $content | ||||
4432 | = $self->item_unknowns($item, $links_want_html))) { | ||||||
4433 | 0 | 0 | my $content_type = ($links_want_html ? 'text/html' : 'text/plain'); | ||||
4434 | 0 | 0 | if (@parts) { | ||||
4435 | 0 | 0 | my $charset = (is_ascii($content) ? 'us-ascii' : 'utf-8'); | ||||
4436 | 0 | $content = Encode::encode ($charset, $content); | |||||
4437 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4438 | Type => $content_type, | ||||||
4439 | Charset => $charset, | ||||||
4440 | Data => $content); | ||||||
4441 | } else { | ||||||
4442 | 0 | $links_str .= $content; | |||||
4443 | } | ||||||
4444 | } | ||||||
4445 | |||||||
4446 | 0 | 0 | if ($self->{'rss_get_links'}) { | ||||
4447 | 0 | foreach my $l (@links) { | |||||
4448 | 0 | 0 | next if ! $l->{'download'}; | ||||
4449 | 0 | my $url = $l->{'uri'}; | |||||
4450 | $self->verbose (1, ' ', __x('link: "{name}" {url}', | ||||||
4451 | 0 | name => $l->{'name'}, | |||||
4452 | url => $url)); | ||||||
4453 | 0 | require HTTP::Request; | |||||
4454 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4455 | 0 | my $resp = $self->ua->request($req); | |||||
4456 | 0 | $resp = $self->aireview_follow ($url, $resp); | |||||
4457 | |||||||
4458 | 0 | 0 | if (! $resp->is_success) { | ||||
4459 | print __x("rss2leafnode: {url}\n {status}\n", | ||||||
4460 | 0 | url => $l->{'uri'}, | |||||
4461 | status => $resp->status_line); | ||||||
4462 | my $msg = __x("Cannot download link {url}\n {status}", | ||||||
4463 | 0 | url => $l->{'uri'}, | |||||
4464 | status => $resp->status_line); | ||||||
4465 | 0 | 0 | if ($links_want_html) { | ||||
4466 | 0 | $msg = $Entitize{$msg}; | |||||
4467 | 0 | $msg =~ s/\n/ /; |
|||||
4468 | 0 | $links_str .= " $msg\n \n"; |
|||||
4469 | } else { | ||||||
4470 | 0 | $links_str .= "\n$msg\n"; | |||||
4471 | } | ||||||
4472 | 0 | next; | |||||
4473 | } | ||||||
4474 | |||||||
4475 | # suspect little value in a description when inlined | ||||||
4476 | # 'Content-Description:' => mimewords_non_ascii($l->{'title'}) | ||||||
4477 | # favicon used for Face if nothing in the item | ||||||
4478 | # | ||||||
4479 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
4480 | 0 | 0 | $headers{'Face:'} ||= $self->http_resp_to_face($resp); | ||||
4481 | 0 | $self->http_resp_extract_main($resp); | |||||
4482 | 0 | push @parts, $self->mime_part_from_response($resp); | |||||
4483 | } | ||||||
4484 | } | ||||||
4485 | 0 | 0 | 0 | if ($links_want_html && $body_type eq 'html') { | |||
4486 | # append to html fragment | ||||||
4487 | 0 | $body .= $links_str; | |||||
4488 | 0 | undef $links_str; | |||||
4489 | } | ||||||
4490 | |||||||
4491 | 0 | 0 | if ($body_type eq 'html') { | ||||
4492 | 0 | ($body, $body_charset) = html_wrap_fragment ($item, $body, $language); | |||||
4493 | 0 | $body_type = 'text/html'; | |||||
4494 | } | ||||||
4495 | 0 | 0 | if (defined $body_charset) { | ||||
4496 | 0 | $body = Encode::encode ($body_charset, $body); | |||||
4497 | } | ||||||
4498 | |||||||
4499 | 0 | ($body, $body_type, $body_charset) | |||||
4500 | = $self->render_maybe ($body, $body_type, $body_charset, $body_base_url); | ||||||
4501 | |||||||
4502 | 0 | 0 | if ($body_type eq 'text/plain') { | ||||
4503 | # remove trailing whitespace from any text | ||||||
4504 | 0 | $body =~ s/\s+$//; | |||||
4505 | 0 | $body .= "\n"; | |||||
4506 | |||||||
4507 | 0 | 0 | if (! $links_want_html) { | ||||
4508 | # append to text/plain, either atom type=text or rendered html | ||||||
4509 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4510 | 0 | $links_str = Encode::encode ($body_charset, $links_str); | |||||
4511 | 0 | $body .= "\n$links_str\n"; | |||||
4512 | } | ||||||
4513 | 0 | undef $links_str; | |||||
4514 | } | ||||||
4515 | } | ||||||
4516 | |||||||
4517 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4518 | 0 | my $links_type; | |||||
4519 | my $links_charset; | ||||||
4520 | 0 | 0 | if ($links_want_html) { | ||||
4521 | 0 | $links_type = 'text/html'; | |||||
4522 | 0 | ($links_str, $links_charset) = html_wrap_fragment ($item, $links_str); | |||||
4523 | } else { | ||||||
4524 | 0 | $links_type = 'text/plain'; | |||||
4525 | 0 | 0 | $links_charset = (is_ascii($links_str) ? 'us-ascii' : 'utf-8'); | ||||
4526 | } | ||||||
4527 | 0 | $links_str = Encode::encode ($links_charset, $links_str); | |||||
4528 | 0 | unshift @parts, $self->mime_build ({}, | |||||
4529 | Type => $links_type, | ||||||
4530 | Encoding => $links_charset, | ||||||
4531 | Data => $links_str); | ||||||
4532 | } | ||||||
4533 | |||||||
4534 | |||||||
4535 | 0 | my $top = $self->mime_build (\%headers, | |||||
4536 | Top => 1, | ||||||
4537 | Type => $body_type, | ||||||
4538 | Charset => $body_charset, | ||||||
4539 | Data => $body); | ||||||
4540 | |||||||
4541 | # Atom |
||||||
4542 | 0 | 0 | if ($attach_elt) { | ||||
4543 | # ENHANCE-ME: this decodes base64 from the xml and then re-encodes for | ||||||
4544 | # the mime, is it possible to pass straight in? | ||||||
4545 | unshift @parts, $self->mime_build | ||||||
4546 | 0 | 0 | ({ 'Content-Location:' => $self->{'uri'}->as_string }, | ||||
4547 | Type => scalar ($attach_elt->att('atom:type') | ||||||
4548 | // $attach_elt->att('type')), | ||||||
4549 | Encoding => 'base64', | ||||||
4550 | Data => MIME::Base64::decode($attach_elt->text)); | ||||||
4551 | } | ||||||
4552 | |||||||
4553 | 0 | $self->verbose (2, 'parts count: ',scalar(@parts)); | |||||
4554 | 0 | foreach my $part (@parts) { | |||||
4555 | 0 | $top->make_multipart; | |||||
4556 | 0 | $top->add_part ($part); | |||||
4557 | } | ||||||
4558 | |||||||
4559 | 0 | mime_entity_lines($top); | |||||
4560 | 0 | 0 | $self->nntp_post($top) || return 0; | ||||
4561 | 0 | $self->verbose (1, ' ', __('posted')); | |||||
4562 | 0 | $new++; | |||||
4563 | } | ||||||
4564 | |||||||
4565 | # ENHANCE-ME: check the replies count to see if more to fetch | ||||||
4566 | 0 | 0 | if ($self->{'rss_get_comments'}) { | ||||
4567 | 0 | my ($comments_rss_url, $comments_count) | |||||
4568 | = $self->item_to_comments_rss($item); | ||||||
4569 | ### rss_get_comments: $comments_rss_url, $comments_count | ||||||
4570 | 0 | 0 | if (defined $comments_rss_url) { | ||||
4571 | |||||||
4572 | # ENHANCE-ME: There's also a thr:updated in RFC 4685, but haven't seen | ||||||
4573 | # that ever actually used. | ||||||
4574 | 0 | my $status = $self->status_geturl ($comments_rss_url); | |||||
4575 | 0 | 0 | 0 | if (defined $status->{'comments_count'} | |||
0 | |||||||
4576 | && defined $comments_count | ||||||
4577 | && $status->{'comments_count'} == $comments_count) { | ||||||
4578 | 0 | $self->verbose (1, ' ', __x('comments count unchanged: {count}', | |||||
4579 | count => $comments_count)); | ||||||
4580 | |||||||
4581 | } else { | ||||||
4582 | 0 | local $self->{'rss_get_links'} = 0; | |||||
4583 | 0 | local $self->{'rss_get_comments'} = 0; | |||||
4584 | 0 | local $self->{'comments_count'} = $comments_count; | |||||
4585 | # "Re:" is not translated, variants of that are very annoying | ||||||
4586 | 0 | local $self->{'getting_rss_comments'} = "Re: $subject"; | |||||
4587 | 0 | local $self->{'References:'} = $msgid; | |||||
4588 | 0 | $new += fetch_rss ($self, $self->{'nntp_group'}, $comments_rss_url); | |||||
4589 | } | ||||||
4590 | } | ||||||
4591 | } | ||||||
4592 | 0 | return $new; | |||||
4593 | } | ||||||
4594 | |||||||
4595 | sub item_to_comments_rss { | ||||||
4596 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4597 | 0 | my ($url, $url_elt); | |||||
4598 | |||||||
4599 | # Atom | ||||||
4600 | # href='http:/...' /> | ||||||
4601 | 0 | foreach my $elt ($item->children('link')) { | |||||
4602 | 0 | 0 | my $rel = ($elt->att('rel') | ||||
0 | |||||||
4603 | // $elt->att('atom:rel') | ||||||
4604 | // next); | ||||||
4605 | 0 | 0 | $rel eq 'replies' or next; | ||||
4606 | 0 | 0 | $self->atom_link_is_rss($elt) or next; | ||||
4607 | 0 | 0 | my $href = ($elt->att('href') | ||||
4608 | // $elt->att('atom:href')); | ||||||
4609 | 0 | 0 | if (is_non_empty ($href)) { | ||||
4610 | 0 | $url = $href; | |||||
4611 | 0 | $elt = $url_elt; | |||||
4612 | } | ||||||
4613 | } | ||||||
4614 | |||||||
4615 | # |
||||||
4616 | # it appeared in the spec page as wfw:commentRSS, so ignore case | ||||||
4617 | 0 | 0 | if (! defined $url) { | ||||
4618 | 0 | my $u = $item->first_child_trimmed_text (qr/^wfw:commentRss$/i); | |||||
4619 | 0 | 0 | if (is_non_empty ($u)) { | ||||
4620 | 0 | $url = $u; | |||||
4621 | } | ||||||
4622 | } | ||||||
4623 | |||||||
4624 | 0 | 0 | return ($url, | ||||
4625 | (defined($url) && $self->item_elt_comments_count($item,$url_elt))); | ||||||
4626 | } | ||||||
4627 | |||||||
4628 | # |
||||||
4629 | # are which are replies, there's no comments link as such for it to refer | ||||||
4630 | # to, it seems | ||||||
4631 | sub item_elt_comments_count { | ||||||
4632 | 0 | 0 | 0 | my ($self, $item, $elt) = @_; | |||
4633 | 0 | 0 | return (($elt && $elt->att('thr:count')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4634 | // ($elt && $elt->att('count')) | ||||||
4635 | // ($elt && $elt->att('atom:count')) | ||||||
4636 | // non_empty ($item->first_child_trimmed_text('thr:total')) | ||||||
4637 | // non_empty ($item->first_child_trimmed_text('slash:comments'))); | ||||||
4638 | } | ||||||
4639 | @known{qw(/channel/item/jf:replyCount | ||||||
4640 | )} = (); | ||||||
4641 | |||||||
4642 | # $group is a string, the name of a local newsgroup | ||||||
4643 | # $url is a string, an RSS feed to be read | ||||||
4644 | # | ||||||
4645 | sub fetch_rss { | ||||||
4646 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
4647 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
0 | |||||||
4648 | 0 | $self->verbose (2, "fetch_rss: $group $url"); | |||||
4649 | |||||||
4650 | 0 | my $group_uri = URI->new($group,'news'); | |||||
4651 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
4652 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
4653 | 0 | 0 | $self->nntp_group_check($group) or return 0; | ||||
4654 | |||||||
4655 | # an in-memory cookie jar, used only per-RSS feed and then discarded, | ||||||
4656 | # which means only kept for fetching for $self->{'rss_get_links'} from a | ||||||
4657 | # feed | ||||||
4658 | 0 | $self->ua->cookie_jar({}); | |||||
4659 | |||||||
4660 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4661 | 0 | $self->verbose (1, ' ', __x('rss comments: {url}', url => $url)); | |||||
4662 | } else { | ||||||
4663 | 0 | $self->verbose (1, __x('feed: {url}', url => $url)); | |||||
4664 | } | ||||||
4665 | 0 | require HTTP::Request; | |||||
4666 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4667 | 0 | 0 | $self->status_etagmod_req($req,1) || return 0; | ||||
4668 | |||||||
4669 | # $req->uri can be a URI object or a string | ||||||
4670 | 0 | local $self->{'uri'} = URI->new ($req->uri); | |||||
4671 | |||||||
4672 | 0 | my $resp = $self->ua->request($req); | |||||
4673 | 0 | 0 | if ($resp->code == 304) { | ||||
4674 | 0 | $self->status_unchanged ($url); | |||||
4675 | 0 | return 0; | |||||
4676 | } | ||||||
4677 | 0 | 0 | if (! $resp->is_success) { | ||||
4678 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
4679 | url => $url, | ||||||
4680 | status => $resp->status_line); | ||||||
4681 | 0 | return 0; | |||||
4682 | } | ||||||
4683 | 0 | local $self->{'resp'} = $resp; | |||||
4684 | 0 | my $bytes = length($resp->as_string); | |||||
4685 | |||||||
4686 | 0 | $self->verbose (3, "response:", $resp->dump, "\n"); # extra newline | |||||
4687 | 0 | 0 | $resp->decode | ||||
4688 | or die "Oops, cannot decode Content-Encoding: ", | ||||||
4689 | $self->header("Content-Encoding"); | ||||||
4690 | |||||||
4691 | 0 | my $xml = $resp->content; # raw bytes | |||||
4692 | 0 | $xml = $self->enforce_rss_charset_override ($xml); | |||||
4693 | |||||||
4694 | 0 | my ($twig, $err) = $self->twig_parse($xml); | |||||
4695 | 0 | 0 | if (defined $err) { | ||||
4696 | 0 | my $message = __x("XML::Twig parse error on\n\n {url}\n\n", | |||||
4697 | url => $url); | ||||||
4698 | 0 | 0 | if ($resp->request->uri ne $url) { | ||||
4699 | 0 | $message .= __x("which redirected to\n\n {url}\n\n", | |||||
4700 | url => $resp->request->uri); | ||||||
4701 | } | ||||||
4702 | 0 | $message .= $err . "\n\n" . __("Raw XML below.\n") . "\n"; | |||||
4703 | 0 | $self->error_message | |||||
4704 | (__x("Error parsing {url}", url => $url), | ||||||
4705 | $message, $xml); | ||||||
4706 | # after successful error message to news | ||||||
4707 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
4708 | 0 | return 0; | |||||
4709 | } | ||||||
4710 | 0 | 0 | if ($self->{'verbose'} >= 3) { | ||||
4711 | 0 | require Data::Dumper; | |||||
4712 | 0 | $self->verbose (3, | |||||
4713 | Data::Dumper->new([$twig->root],['root']) | ||||||
4714 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
4715 | } | ||||||
4716 | |||||||
4717 | # "item" for RSS/RDF, "entry" for Atom | ||||||
4718 | 0 | my @items = $twig->descendants(qr/^(item|entry)$/); | |||||
4719 | |||||||
4720 | 0 | @items = $self->rss_newest_only_items(@items); | |||||
4721 | |||||||
4722 | 0 | my $new = 0; | |||||
4723 | 0 | foreach my $item (@items) { | |||||
4724 | 0 | $new += $self->fetch_rss_process_one_item ($item); | |||||
4725 | } | ||||||
4726 | |||||||
4727 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
4728 | 0 | my $jar = $self->ua->cookie_jar; | |||||
4729 | 0 | 0 | 0 | if ($jar && (my $str = $jar->as_string ne '')) { | |||
4730 | 0 | $self->verbose (2, "accumulated cookies from this feed:\n", $str); | |||||
4731 | } else { | ||||||
4732 | 0 | $self->verbose (2, 'no cookies from this feed'); | |||||
4733 | } | ||||||
4734 | } | ||||||
4735 | 0 | $self->ua->cookie_jar (undef); | |||||
4736 | |||||||
4737 | 0 | $self->status_etagmod_resp ($url, $resp, $twig); | |||||
4738 | 0 | say __xn('{group}: {count} new article, from {bytes} bytes feed', | |||||
4739 | '{group}: {count} new articles, from {bytes} bytes feed', | ||||||
4740 | $new, | ||||||
4741 | group => $group, | ||||||
4742 | count => $new, | ||||||
4743 | bytes => $bytes); | ||||||
4744 | |||||||
4745 | 0 | return $new; | |||||
4746 | } | ||||||
4747 | |||||||
4748 | 1; | ||||||
4749 | __END__ |