blib/lib/App/RSS2Leafnode.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 75 | 1586 | 4.7 |
branch | 0 | 718 | 0.0 |
condition | 0 | 532 | 0.0 |
subroutine | 26 | 162 | 16.0 |
pod | 4 | 123 | 3.2 |
total | 105 | 3121 | 3.3 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Kevin Ryde | ||||||
2 | # | ||||||
3 | # This file is part of RSS2Leafnode. | ||||||
4 | # | ||||||
5 | # RSS2Leafnode is free software; you can redistribute it and/or modify it | ||||||
6 | # under the terms of the GNU General Public License as published by the Free | ||||||
7 | # Software Foundation; either version 3, or (at your option) any later | ||||||
8 | # version. | ||||||
9 | # | ||||||
10 | # RSS2Leafnode is distributed in the hope that it will be useful, but | ||||||
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
12 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||||||
13 | # for more details. | ||||||
14 | # | ||||||
15 | # You should have received a copy of the GNU General Public License along | ||||||
16 | # with RSS2Leafnode. If not, see |
||||||
17 | |||||||
18 | |||||||
19 | # maybe: | ||||||
20 | # location links | ||||||
21 | # http://maps.google.com/maps?ll=-35.066667,148.1 | ||||||
22 | # http://maps.google.com/maps?ll=-35.066667,148.1&spn=0.01,0.01&t=m | ||||||
23 | # | ||||||
24 | # | ||||||
25 | # when site has mutliple names for a page, relative or absolute | ||||||
26 | |||||||
27 | |||||||
28 | |||||||
29 | package App::RSS2Leafnode; | ||||||
30 | 1 | 1 | 1476 | use 5.010; | |||
1 | 4 | ||||||
1 | 37 | ||||||
31 | 1 | 1 | 4 | use strict; | |||
1 | 2 | ||||||
1 | 25 | ||||||
32 | 1 | 1 | 5 | use warnings; | |||
1 | 1 | ||||||
1 | 20 | ||||||
33 | 1 | 1 | 4 | use Carp; | |||
1 | 1 | ||||||
1 | 110 | ||||||
34 | 1 | 1 | 1173 | use Encode; | |||
1 | 12210 | ||||||
1 | 88 | ||||||
35 | 1 | 1 | 6753 | use Hash::Util::FieldHash; | |||
1 | 1138 | ||||||
1 | 46 | ||||||
36 | 1 | 1 | 7 | use List::Util 'min', 'max'; | |||
1 | 1 | ||||||
1 | 94 | ||||||
37 | 1 | 1 | 886 | use List::MoreUtils; | |||
1 | 1222 | ||||||
1 | 43 | ||||||
38 | 1 | 1 | 1036 | use POSIX (); # ENOENT, etc | |||
1 | 6409 | ||||||
1 | 26 | ||||||
39 | 1 | 1 | 17 | use Scalar::Util; | |||
1 | 2 | ||||||
1 | 48 | ||||||
40 | 1 | 1 | 861 | use Text::Trim 1.02; # version 1.02 for undef support | |||
1 | 588 | ||||||
1 | 68 | ||||||
41 | 1 | 1 | 798 | use URI; | |||
1 | 8036 | ||||||
1 | 28 | ||||||
42 | 1 | 1 | 791 | use HTML::Entities::Interpolate; | |||
1 | 7548 | ||||||
1 | 5 | ||||||
43 | |||||||
44 | 1 | 1 | 545 | use App::RSS2Leafnode::XML::Twig::Other; | |||
1 | 2 | ||||||
1 | 43 | ||||||
45 | |||||||
46 | # version 1.17 for __p(), and version 1.16 for turn_utf_8_on() | ||||||
47 | 1 | 1 | 943 | use Locale::TextDomain 1.17; | |||
1 | 9373 | ||||||
1 | 7 | ||||||
48 | 1 | 1 | 9041 | use Locale::TextDomain ('App-RSS2Leafnode'); | |||
1 | 3 | ||||||
1 | 5 | ||||||
49 | BEGIN { | ||||||
50 | 1 | 1 | 25 | use Locale::Messages; | |||
1 | 2 | ||||||
1 | 59 | ||||||
51 | 1 | 1 | 5 | Locale::Messages::bind_textdomain_codeset ('App-RSS2Leafnode','UTF-8'); | |||
52 | 1 | 19 | Locale::Messages::bind_textdomain_filter ('App-RSS2Leafnode', | ||||
53 | \&Locale::Messages::turn_utf_8_on); | ||||||
54 | } | ||||||
55 | |||||||
56 | # uncomment this to run the ### lines | ||||||
57 | # use Smart::Comments; | ||||||
58 | |||||||
59 | our $VERSION; | ||||||
60 | BEGIN { | ||||||
61 | 1 | 1 | 750 | $VERSION = 77; | |||
62 | } | ||||||
63 | |||||||
64 | ## no critic (ProhibitFixedStringMatches) | ||||||
65 | |||||||
66 | |||||||
67 | # Cribs: | ||||||
68 | # | ||||||
69 | # RSS | ||||||
70 | # http://my.netscape.com/publish/help/ | ||||||
71 | # RSS 0.9 spec. | ||||||
72 | # http://my.netscape.com/publish/help/mnn20/quickstart.html | ||||||
73 | # RSS 0.91 spec. | ||||||
74 | # http://purl.org/rss/1.0/ | ||||||
75 | # RSS 1.0 spec. | ||||||
76 | # http://www.rssboard.org/rss-specification | ||||||
77 | # http://www.rssboard.org/files/rss-2.0-sample.xml | ||||||
78 | # RSS 2.0 spec and sample. | ||||||
79 | # | ||||||
80 | # http://www.rssboard.org/rss-profile | ||||||
81 | # "Best practices." | ||||||
82 | # | ||||||
83 | # Dublin Core | ||||||
84 | # RFC 5013 -- summary | ||||||
85 | # http://dublincore.org/documents/dcmi-terms/ -- dc/terms | ||||||
86 | # | ||||||
87 | # Atom | ||||||
88 | # RFC 4287 -- Atom spec | ||||||
89 | # RFC 3339 -- ISO timestamps as used in Atom | ||||||
90 | # RFC 4685 -- "thr" threading extensions | ||||||
91 | # RFC 4946 -- | ||||||
92 | # RFC 5005 -- etc paging and archiving | ||||||
93 | # http://diveintomark.org/archives/2004/05/28/howto-atom-id | ||||||
94 | # Making an |
||||||
95 | # http://www.iana.org/assignments/link-relations/link-relations.xhtml | ||||||
96 | # assigned values | ||||||
97 | # | ||||||
98 | # RSS Modules: | ||||||
99 | # http://www.meatballwiki.org/wiki/ModWiki -- wiki | ||||||
100 | # http://web.resource.org/rss/1.0/modules/slash/ | ||||||
101 | # http://code.google.com/apis/feedburner/feedburner_namespace_reference.html | ||||||
102 | # http://backend.userland.com/creativeCommonsRSSModule | ||||||
103 | # | ||||||
104 | # http://web.resource.org/rss/1.0/modules/content/ | ||||||
105 | # http://www.rssboard.org/rss-profile#namespace-elements-content | ||||||
106 | # http://validator.w3.org/feed/docs/warning/NeedDescriptionBeforeContent.html | ||||||
107 | # |
||||||
108 | # | ||||||
109 | # http://www.apple.com/itunes/podcasts/specs.html | ||||||
110 | # http://www.feedforall.com/itunes.htm | ||||||
111 | # http://www.w3.org/2003/01/geo/wgs84_pos -- |
||||||
112 | # http://www.georss.org/ | ||||||
113 | # http://www.georss.org/Encodings | ||||||
114 | # http://www.georss.org/atom | ||||||
115 | # http://www.georss.org/rdf_rss1 | ||||||
116 | # | ||||||
117 | # http://activitystrea.ms/specs/atom/1.0/ | ||||||
118 | # activity: | ||||||
119 | # http://prismstandard.org/namespaces/basic/2.0/ | ||||||
120 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
121 | # Prism | ||||||
122 | # | ||||||
123 | # URIs | ||||||
124 | # RFC 1738, RFC 2396, RFC 3986 -- URI formats (news/nntp in 1738) | ||||||
125 | # draft-ellermann-news-nntp-uri-11.txt -- news/nntp update | ||||||
126 | # RFC 2732 -- ipv6 "[]" hostnames | ||||||
127 | # RFC 2141 -- urn: | ||||||
128 | # RFC 4122 -- uuid format (as under urn:uuid:) | ||||||
129 | # RFC 4151 -- tag: | ||||||
130 | # RFC 1034, RFC 1123 -- domain names | ||||||
131 | # RFC 2606 -- reserved domain names ".invalid" | ||||||
132 | # | ||||||
133 | # XML | ||||||
134 | # http://www.w3.org/TR/xmlbase/ -- xml:base | ||||||
135 | # RFC 3023 text/xml etc media types | ||||||
136 | # | ||||||
137 | # Mail Messages | ||||||
138 | # RFC 850, RFC 1036 | ||||||
139 | # -- News message format, inc headers and rnews format | ||||||
140 | # RFC 2822, RFC 5322, RFC 5536 | ||||||
141 | # -- Email message format. | ||||||
142 | # RFC 2076, RFC 4021 -- headers summary. | ||||||
143 | # RFC 2557 -- MHTML Content-Location | ||||||
144 | # RFC 1864 -- Content-MD5 header | ||||||
145 | # RFC 2369 -- List-Post header and friends | ||||||
146 | # http://www.ietf.org/proceedings/98dec/I-D/draft-ietf-drums-mail-followup-to-00.txt | ||||||
147 | # Draft "Mail-Followup-To" header. | ||||||
148 | # | ||||||
149 | # RFC 1327 -- X.400 to RFC822 introducing Language header | ||||||
150 | # RFC 3282 -- Content-Language header | ||||||
151 | # RFC 1766, RFC 3066, RFC 4646 -- language tag form | ||||||
152 | # | ||||||
153 | # | ||||||
154 | # NNTP | ||||||
155 | # RFC 977 -- NNTP | ||||||
156 | # RFC 2616 -- HTTP/1.1 Accept-Encoding header | ||||||
157 | # RFC 2980 -- NNTP extensions | ||||||
158 | # | ||||||
159 | # RFC 4642 -- NNTP with SSL | ||||||
160 | # | ||||||
161 | # For XML in Perl there's several ways to do it! | ||||||
162 | # - XML::Parser looks likely for stream/event processing, but its builtin | ||||||
163 | # tree mode is very basic. | ||||||
164 | # - XML::Twig extends XML::Parser to a good tree, though the docs are | ||||||
165 | # slightly light on. It only does a subset of "XPath" but the | ||||||
166 | # functions/regexps are more perl-like for matching and there's various | ||||||
167 | # handy shortcuts for common operations. | ||||||
168 | # - XML::LibXML is the full blown libxml and is rather a lot to learn. | ||||||
169 | # Because it's mainly C it's not easy to find where or how you're going | ||||||
170 | # wrong when your code doesn't work. libxml also seems stricter about | ||||||
171 | # namespace matters than XML::Parser/XML::Twig. | ||||||
172 | # - XML::RSS uses XML::Parser to build its own style tree of RSS, | ||||||
173 | # including unifying differences among RSS/RDF 0.91, 1.0 and 2.0. | ||||||
174 | # Nested elements seem to need specific handling in its code, which can | ||||||
175 | # make it tricky for sub-element oddities. A fair amount of it is about | ||||||
176 | # writing RSS too. | ||||||
177 | # - XML::RSS::LibXML uses libxml for XML::RSS compatible reading and | ||||||
178 | # writing. It seems to do better on unrecognised sub-elements. | ||||||
179 | # - XML::Atom offers the basic Atom elements but doesn't seem to give | ||||||
180 | # access to extra stuff that might be in a feed. | ||||||
181 | # - XML::Feed tries to unify XML::RSS and XML::Atom but again doesn't seem | ||||||
182 | # to go much beyond the basics. It too is geared towards writing as | ||||||
183 | # well as reading. | ||||||
184 | # - XML::TreePP pure perl parser to a hash tree. | ||||||
185 | # | ||||||
186 | # The choice of XML::Twig is based on wanting both RSS and Atom, but | ||||||
187 | # XML::Feed not going far enough. Tree processing is easier than stream, | ||||||
188 | # and an RSS isn't meant to be huge. A tree may help if channel fields | ||||||
189 | # follow items or something equally unnatural, but will probably assume that | ||||||
190 | # doesn't happen and look at the twig partial-tree mode. Between the tree | ||||||
191 | # styles XML::LibXML is harder to get into than Twig. | ||||||
192 | # | ||||||
193 | |||||||
194 | #------------------------------------------------------------------------------ | ||||||
195 | # mostly generic | ||||||
196 | |||||||
197 | # return $str with a newline at the end, if it doesn't already have one | ||||||
198 | sub str_ensure_newline { | ||||||
199 | 0 | 0 | 0 | my ($str) = @_; | |||
200 | 0 | 0 | if ($str !~ /\n$/) { $str .= "\n" } | ||||
0 | |||||||
201 | 0 | return $str; | |||||
202 | } | ||||||
203 | |||||||
204 | sub md5_of_utf8 { | ||||||
205 | 0 | 0 | 0 | my ($str) = @_; | |||
206 | 0 | require Digest::MD5; | |||||
207 | 0 | return Digest::MD5::md5_base64 (Encode::encode_utf8 ($str)); | |||||
208 | } | ||||||
209 | |||||||
210 | sub is_empty { | ||||||
211 | 0 | 0 | 0 | my ($str) = @_; | |||
212 | 0 | 0 | return (! defined $str || $str =~ /^\s*$/); | ||||
213 | } | ||||||
214 | sub is_non_empty { | ||||||
215 | 0 | 0 | 0 | my ($str) = @_; | |||
216 | 0 | return ! is_empty($str); | |||||
217 | } | ||||||
218 | sub non_empty { | ||||||
219 | 0 | 0 | 0 | my ($str) = @_; | |||
220 | 0 | 0 | return (is_non_empty($str) ? $str : ()); | ||||
221 | } | ||||||
222 | |||||||
223 | sub join_non_empty { | ||||||
224 | 0 | 0 | 0 | my $sep = shift; | |||
225 | 0 | return non_empty (join($sep, map {non_empty($_)} @_)); | |||||
0 | |||||||
226 | } | ||||||
227 | |||||||
228 | sub collapse_whitespace { | ||||||
229 | 0 | 0 | 0 | my ($str) = @_; | |||
230 | 0 | 0 | defined $str or return undef; | ||||
231 | 0 | 0 | $str =~ s/(\s+)/($1 eq ' ' ? $1 : ' ')/ge; | ||||
0 | |||||||
232 | 0 | return Text::Trim::trim($str); | |||||
233 | } | ||||||
234 | |||||||
235 | # return true if $str is entirely ascii chars 0 to 127 | ||||||
236 | sub is_ascii { | ||||||
237 | 0 | 0 | 0 | my ($str) = @_; | |||
238 | 0 | return ($str !~ /[^[:ascii:]]/); | |||||
239 | } | ||||||
240 | |||||||
241 | # Return the number of lines in $str. | ||||||
242 | # If $str ends with a newline then that counts as the last line, so "xyz\n" | ||||||
243 | # is one line. If $str doesn't end with a newline then the final chars are | ||||||
244 | # a line, so "abc\ndef" is two lines. | ||||||
245 | sub str_count_lines { | ||||||
246 | 0 | 0 | 0 | my ($str) = @_; | |||
247 | 0 | 0 | return scalar($str =~ tr/\n//) + (length($str) && substr($str,-1) ne "\n"); | ||||
248 | } | ||||||
249 | |||||||
250 | sub File_Temp_DEBUG_saver { | ||||||
251 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
252 | 0 | require Scope::Guard; | |||||
253 | 0 | require File::Temp; | |||||
254 | 0 | my $oldval = $File::Temp::DEBUG; | |||||
255 | 0 | 0 | my $ret = Scope::Guard->new (sub { $File::Temp::DEBUG = $oldval }); | ||||
0 | |||||||
256 | 0 | $File::Temp::DEBUG = $newval; | |||||
257 | 0 | return $ret; | |||||
258 | } | ||||||
259 | sub MIME_Tools_debugging { | ||||||
260 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
261 | 0 | require Scope::Guard; | |||||
262 | 0 | require MIME::Tools; | |||||
263 | 0 | my $oldval = MIME::Tools->debugging; | |||||
264 | 0 | 0 | my $ret = Scope::Guard->new (sub { MIME::Tools->debugging($oldval) }); | ||||
0 | |||||||
265 | 0 | MIME::Tools->debugging ($newval); | |||||
266 | 0 | return $ret; | |||||
267 | } | ||||||
268 | |||||||
269 | sub homedir { | ||||||
270 | # my ($self) = @_; | ||||||
271 | 0 | 0 | 0 | require File::HomeDir; | |||
272 | # call each time just in case playing tricks with $ENV{HOME} in conf file | ||||||
273 | 0 | 0 | return File::HomeDir->my_home | ||||
274 | // croak 'File::HomeDir says you have no home directory'; | ||||||
275 | } | ||||||
276 | |||||||
277 | #------------------------------------------------------------------------------ | ||||||
278 | # Number::Format for sizes in bytes | ||||||
279 | |||||||
280 | use constant::defer NUMBER_FORMAT => sub { | ||||||
281 | 0 | 0 | require Number::Format; | ||||
282 | 0 | 0 | Number::Format->VERSION(1.5); # for format_bytes() options params | ||||
283 | 0 | 0 | return Number::Format->new | ||||
284 | (-kilo_suffix => __p('number-format-kilobytes','K'), | ||||||
285 | -mega_suffix => __p('number-format-megabytes','M'), | ||||||
286 | -giga_suffix => __p('number-format-gigabytes','G')); | ||||||
287 | 1 | 1 | 820 | }; | |||
1 | 689 | ||||||
1 | 9 | ||||||
288 | |||||||
289 | sub format_size_in_bytes { | ||||||
290 | 0 | 0 | 0 | my ($self, $length) = @_; | |||
291 | 0 | 0 | if ($length >= 2000) { | ||||
292 | 0 | return $self->NUMBER_FORMAT()->format_bytes ($length, precision => 1); | |||||
293 | } else { | ||||||
294 | 0 | return __x('{size} bytes', size => $length); | |||||
295 | } | ||||||
296 | } | ||||||
297 | |||||||
298 | #------------------------------------------------------------------------------ | ||||||
299 | |||||||
300 | sub new { | ||||||
301 | 0 | 0 | 1 | my $class = shift; | |||
302 | 0 | return bless { | |||||
303 | # config variables | ||||||
304 | verbose => 0, | ||||||
305 | render => 0, | ||||||
306 | render_width => 60, | ||||||
307 | rss_get_links => 0, | ||||||
308 | rss_get_comments => 0, | ||||||
309 | rss_newest_only => 0, | ||||||
310 | get_icon => 0, | ||||||
311 | html_charset_from_content => 0, | ||||||
312 | |||||||
313 | # secret extra | ||||||
314 | msgidextra => '', | ||||||
315 | |||||||
316 | @_, | ||||||
317 | }, $class; | ||||||
318 | } | ||||||
319 | |||||||
320 | sub command_line { | ||||||
321 | 0 | 0 | 1 | my ($self) = @_; | |||
322 | |||||||
323 | 0 | my $done_version; | |||||
324 | 0 | require Getopt::Long; | |||||
325 | 0 | Getopt::Long::Configure ('no_ignore_case'); | |||||
326 | Getopt::Long::GetOptions | ||||||
327 | ('config=s' => \$self->{'config_filename'}, | ||||||
328 | 'verbose:1' => \$self->{'verbose'}, | ||||||
329 | 'version' => sub { | ||||||
330 | 0 | 0 | say __x("RSS2Leafnode version {version}", version => $VERSION); | ||||
331 | 0 | $done_version = 1; | |||||
332 | }, | ||||||
333 | 'bareversion' => sub { | ||||||
334 | 0 | 0 | say $VERSION; | ||||
335 | 0 | $done_version = 1; | |||||
336 | }, | ||||||
337 | 'msgid=s' => \$self->{'msgidextra'}, | ||||||
338 | 'help|?' => sub { | ||||||
339 | 0 | 0 | say __x("rss2leafnode [--options]"); | ||||
340 | 0 | say __x(" --config=filename configuration file (default ~/.rss2leafnode.conf)"); | |||||
341 | 0 | say __x(" --help print this help"); | |||||
342 | 0 | say __x(" --verbose describe what's done"); | |||||
343 | 0 | say __x(" --verbose=2 show technical details of what's done"); | |||||
344 | 0 | say __x(" --version print program version number"); | |||||
345 | 0 | exit 0; | |||||
346 | 0 | 0 | }) or return 1; | ||||
347 | 0 | 0 | if (! $done_version) { | ||||
348 | 0 | $self->do_config_file; | |||||
349 | 0 | $self->nntp_close; | |||||
350 | } | ||||||
351 | 0 | return 0; | |||||
352 | } | ||||||
353 | |||||||
354 | sub verbose { | ||||||
355 | 0 | 0 | 0 | my $self = shift; | |||
356 | 0 | my $count = shift; | |||||
357 | 0 | 0 | if ($self->{'verbose'} >= $count) { | ||||
358 | 0 | say @_; | |||||
359 | } | ||||||
360 | } | ||||||
361 | |||||||
362 | sub config_filename { | ||||||
363 | 0 | 0 | 0 | my ($self) = @_; | |||
364 | 0 | 0 | return $self->{'config_filename'} // do { | ||||
365 | 0 | require File::Spec; | |||||
366 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.conf'); | |||||
367 | }; | ||||||
368 | } | ||||||
369 | sub status_filename { | ||||||
370 | 0 | 0 | 0 | my ($self) = @_; | |||
371 | 0 | 0 | return $self->{'status_filename'} // do { | ||||
372 | 0 | require File::Spec; | |||||
373 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.status'); | |||||
374 | }; | ||||||
375 | } | ||||||
376 | |||||||
377 | sub do_config_file { | ||||||
378 | 0 | 0 | 0 | my ($self) = @_; | |||
379 | 0 | my @guards; | |||||
380 | |||||||
381 | 0 | 0 | open STDERR, '>&STDOUT' or die "Oops, can't join STDERR to STDOUT"; | ||||
382 | |||||||
383 | # File::Temp::DEBUG for possible temp files used by HTML::FormatExternal | ||||||
384 | # these debugs turned on only for the duration of running the config file | ||||||
385 | # and the downloading etc in it | ||||||
386 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
387 | 0 | push @guards, $self->File_Temp_DEBUG_saver(1); | |||||
388 | 0 | push @guards, $self->MIME_Tools_debugging(1); | |||||
389 | } | ||||||
390 | |||||||
391 | 0 | my $config_filename = $self->config_filename; | |||||
392 | 0 | $self->verbose (1, "config: ", $config_filename); | |||||
393 | |||||||
394 | 0 | require App::RSS2Leafnode::Conf; | |||||
395 | 0 | local $App::RSS2Leafnode::Conf::r2l = $self; | |||||
396 | 0 | 0 | if (! defined (do { package App::RSS2Leafnode::Conf; | ||||
397 | 0 | do $config_filename; | |||||
398 | })) { | ||||||
399 | 0 | 0 | if (! -e $config_filename) { | ||||
400 | 0 | croak "rss2leafnode: config file $config_filename doesn't exist\n"; | |||||
401 | } else { | ||||||
402 | 0 | croak $@; | |||||
403 | } | ||||||
404 | } | ||||||
405 | } | ||||||
406 | |||||||
407 | #------------------------------------------------------------------------------ | ||||||
408 | # LWP stuff | ||||||
409 | |||||||
410 | sub user_agent { | ||||||
411 | 0 | 0 | 0 | my ($self) = @_; | |||
412 | 0 | 0 | if (defined $self->{'user_agent'}) { | ||||
413 | 0 | return $self->{'user_agent'}; | |||||
414 | } else { | ||||||
415 | 0 | return 'RSS2leafnode/' . $self->VERSION . ' '; | |||||
416 | } | ||||||
417 | } | ||||||
418 | |||||||
419 | sub ua { | ||||||
420 | 0 | 0 | 0 | my ($self) = @_; | |||
421 | 0 | 0 | return ($self->{'ua'} ||= do { | ||||
422 | 0 | require LWP::UserAgent; | |||||
423 | 0 | LWP::UserAgent->VERSION(5.832); # 5.832 for content_charset() | |||||
424 | |||||||
425 | # one connection kept alive | ||||||
426 | 0 | my $ua = LWP::UserAgent->new (keep_alive => 1); | |||||
427 | 0 | Scalar::Util::weaken ($ua->{(__PACKAGE__)} = $self); | |||||
428 | 0 | $ua->agent ($self->user_agent); | |||||
429 | |||||||
430 | 0 | Scalar::Util::weaken (my $weak_self = $self); | |||||
431 | 0 | $ua->add_handler (request_send => \&lwp_request_send__verbose); | |||||
432 | $ua->add_handler (response_done => sub { | ||||||
433 | 0 | 0 | lwp_response_done__check_md5 ($weak_self, @_); | ||||
434 | 0 | }); | |||||
435 | |||||||
436 | # ask for everything $resp->decode() / $resp->decoded_content() can cope | ||||||
437 | # with, in particular "gzip" and "deflate" compression if Compress::Zlib | ||||||
438 | # or whatever is available | ||||||
439 | # | ||||||
440 | 0 | require HTTP::Message; | |||||
441 | 0 | my $decodable = HTTP::Message::decodable(); | |||||
442 | 0 | $self->verbose (2, "HTTP decodable: ", $decodable); | |||||
443 | 0 | $ua->default_header ('Accept-Encoding' => $decodable); | |||||
444 | |||||||
445 | 0 | $ua | |||||
446 | }); | ||||||
447 | } | ||||||
448 | |||||||
449 | sub lwp_request_send__verbose { | ||||||
450 | 0 | 0 | 0 | my ($req, $ua, $h) = @_; | |||
451 | 0 | my $self = $ua->{(__PACKAGE__)}; | |||||
452 | 0 | $self->verbose (2, "request_send:", $req->dump, "\n"); # extra newline | |||||
453 | 0 | return; # continue processing | |||||
454 | } | ||||||
455 | |||||||
456 | sub lwp_response_done__check_md5 { | ||||||
457 | 0 | 0 | 0 | my ($self, $resp, $ua, $h) = @_; | |||
458 | 0 | 0 | $self || return; | ||||
459 | ### lwp_response_done__check_md5() ... | ||||||
460 | 0 | 0 | my $want = $resp->header('Content-MD5') // do { | ||||
461 | 0 | $self->verbose (2, 'no Content-MD5 header'); | |||||
462 | 0 | return; | |||||
463 | }; | ||||||
464 | 0 | $resp->decode; | |||||
465 | 0 | my $cref = $resp->content_ref; | |||||
466 | 0 | require Digest::MD5; | |||||
467 | 0 | my $got = Digest::MD5::md5_hex($$cref); | |||||
468 | 0 | 0 | if ($got ne $want) { | ||||
469 | 0 | print __x("Warning, MD5 checksum mismatch on download {url}\n", | |||||
470 | url => $resp->request->uri); | ||||||
471 | } else { | ||||||
472 | 0 | $self->verbose(2, 'Content-MD5 ok'); | |||||
473 | } | ||||||
474 | } | ||||||
475 | |||||||
476 | # $resp is a HTTP::Response object. Modify its headers to apply our | ||||||
477 | # $html_charset_from_content option, which means if it's set then prefer the | ||||||
478 | # document's Content-Type over what the server says. | ||||||
479 | # | ||||||
480 | # The LWP::UserAgent parse_head option appends the document bits to | ||||||
481 | # the message headers. If the server and the document both offer a | ||||||
482 | # Content-Type then there's two, with the document one last, so all we have | ||||||
483 | # to do is change to make the last one the only one. | ||||||
484 | # | ||||||
485 | sub enforce_html_charset_from_content { | ||||||
486 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
487 | 0 | 0 | 0 | if ($self->{'html_charset_from_content'} | |||
488 | && $resp->headers->content_is_html) { | ||||||
489 | 0 | my $old = $resp->header('Content-Type'); | |||||
490 | 0 | $resp->header('Content-Type' => $resp->headers->content_type); | |||||
491 | |||||||
492 | 0 | $self->verbose (2, 'html_charset_from_content mangled Content-Type from'); | |||||
493 | 0 | $self->verbose (2, " from ", $old); | |||||
494 | 0 | $self->verbose (2, " to ", $resp->header('Content-Type')); | |||||
495 | 0 | $self->verbose (2, " giving charset ", $resp->content_charset); | |||||
496 | } | ||||||
497 | } | ||||||
498 | |||||||
499 | |||||||
500 | #------------------------------------------------------------------------------ | ||||||
501 | my %known; | ||||||
502 | |||||||
503 | # |
||||||
504 | # ENHANCE-ME: is this something to work into the skipdays? or a message expiry? | ||||||
505 | # | ||||||
506 | $known{'/channel/item/dcterms:valid'} = undef; | ||||||
507 | |||||||
508 | # |
||||||
509 | $known{'/channel/item/dcterms:audience'} = undef; | ||||||
510 | |||||||
511 | # |
||||||
512 | # |
||||||
513 | @known{qw(/channel/item/eq:depth | ||||||
514 | /channel/item/eq:seconds)} = (); | ||||||
515 | |||||||
516 | # rdf structure stuff | ||||||
517 | @known{qw(/channel/items | ||||||
518 | /channel/items/rdf:Seq | ||||||
519 | /channel/items/rdf:Seq/rdf:li)} = (); | ||||||
520 | |||||||
521 | @known{('/channel/cloud', | ||||||
522 | '/channel/link', | ||||||
523 | '/channel/docs', | ||||||
524 | '/channel/generator', | ||||||
525 | '/channel/rating', | ||||||
526 | '/channel/id', | ||||||
527 | '/channel/description', | ||||||
528 | '/channel/tagline', | ||||||
529 | '/channel/info', # atom something freeform | ||||||
530 | '/channel/itunes:summary', | ||||||
531 | '/channel/feedburner:info', | ||||||
532 | |||||||
533 | # nothing much in these as yet eg. rssboard | ||||||
534 | '/channel/item/sitemap:priority', | ||||||
535 | '/channel/item/sitemap:changefreq', | ||||||
536 | |||||||
537 | # feedburner junk | ||||||
538 | '/channel/feedburner:feedFlare', | ||||||
539 | |||||||
540 | # images | ||||||
541 | '/channel/itunes:owner', | ||||||
542 | '/channel/itunes:owner/itunes:name', | ||||||
543 | '/channel/itunes:owner/itunes:email', | ||||||
544 | |||||||
545 | '/channel/textInput', | ||||||
546 | '/channel/textInput/description', | ||||||
547 | '/channel/textInput/link', | ||||||
548 | '/channel/textInput/name', | ||||||
549 | '/channel/textInput/title', | ||||||
550 | '/channel/textinput', | ||||||
551 | '/channel/textinput/title', | ||||||
552 | '/channel/textinput/description', | ||||||
553 | '/channel/textinput/name', | ||||||
554 | '/channel/textinput/link', | ||||||
555 | |||||||
556 | '/channel/openSearch:totalResults', | ||||||
557 | '/channel/openSearch:startIndex', | ||||||
558 | '/channel/openSearch:itemsPerPage', | ||||||
559 | |||||||
560 | '/channel/item', | ||||||
561 | '/channel/item/source', | ||||||
562 | |||||||
563 | '/channel/item/twitter:source', | ||||||
564 | |||||||
565 | # something from radio free france | ||||||
566 | # eg. http://radiofrance-podcast.net/podcast09/rss_10193.xml | ||||||
567 | '/channel/item/podcastRF:businessReference', | ||||||
568 | |||||||
569 | # google documents stuff | ||||||
570 | '/channel/item/gd:extendedProperty', | ||||||
571 | |||||||
572 | # |
||||||
573 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
574 | 'channel/item/cb:statistics', | ||||||
575 | |||||||
576 | # |
||||||
577 | # eg. Fed Reserve http://www.federalreserve.gov/feeds/press_taf.xml | ||||||
578 | '/channel/item/cb:news', | ||||||
579 | |||||||
580 | # FIXME: |
||||||
581 | # Fed eg. http://www.federalreserve.gov/feeds/speeches.xml | ||||||
582 | '/channel/item/cb:speech', | ||||||
583 | |||||||
584 | # |
||||||
585 | # |
||||||
586 | # Fed eg. http://www.federalreserve.gov/feeds/ifdp.xml | ||||||
587 | # FIXME: except |
||||||
588 | '/channel/item/cb:paper', | ||||||
589 | '/channel/item/cb:event', | ||||||
590 | |||||||
591 | # |
||||||
592 | '/channel/item/media:hash', | ||||||
593 | |||||||
594 | # not sure what these are, but don't seem very interesting | ||||||
595 | '/channel/item/slate:slate_plus', # |
||||||
596 | '/channel/item/slate:paywall', # |
||||||
597 | '/channel/item/slate:sponsored', # |
||||||
598 | )} = (); | ||||||
599 | |||||||
600 | # weather | ||||||
601 | # '/channel/item/w:current', | ||||||
602 | # '/channel/item/w:forecast', | ||||||
603 | # '/channel/yweather:location', | ||||||
604 | # '/channel/yweather:units', | ||||||
605 | # '/channel/yweather:wind', | ||||||
606 | # '/channel/yweather:atmosphere', | ||||||
607 | # '/channel/yweather:astronomy', | ||||||
608 | # '/channel/item/yweather:condition', | ||||||
609 | # '/channel/item/yweather:forecast', | ||||||
610 | |||||||
611 | # --central-bank | ||||||
612 | # /channel/item/cb:statistics | ||||||
613 | # /channel/item/cb:statistics/cb:country | ||||||
614 | # /channel/item/cb:statistics/cb:institutionAbbrev | ||||||
615 | # /channel/item/cb:statistics/cb:exchangeRate | ||||||
616 | # /channel/item/cb:statistics/cb:exchangeRate/cb:value | ||||||
617 | # /channel/item/cb:statistics/cb:exchangeRate/cb:baseCurrency | ||||||
618 | # /channel/item/cb:statistics/cb:exchangeRate/cb:targetCurrency | ||||||
619 | # /channel/item/cb:statistics/cb:exchangeRate/cb:rateType | ||||||
620 | # /channel/item/cb:statistics/cb:exchangeRate/cb:observationPeriod | ||||||
621 | # /channel/item/cb:speech | ||||||
622 | # /channel/item/cb:speech/cb:simpleTitle | ||||||
623 | # /channel/item/cb:speech/cb:occurrenceDate | ||||||
624 | # /channel/item/cb:speech/cb:person | ||||||
625 | # /channel/item/cb:speech/cb:person/cb:givenName | ||||||
626 | # /channel/item/cb:speech/cb:person/cb:surname | ||||||
627 | # /channel/item/cb:speech/cb:person/cb:personalTitle | ||||||
628 | # /channel/item/cb:speech/cb:person/cb:nameAsWritten | ||||||
629 | # /channel/item/cb:speech/cb:person/cb:role | ||||||
630 | # /channel/item/cb:speech/cb:person/cb:role/cb:jobTitle | ||||||
631 | # /channel/item/cb:speech/cb:person/cb:role/cb:affiliation | ||||||
632 | # /channel/item/cb:speech/cb:venue | ||||||
633 | |||||||
634 | |||||||
635 | #------------------------------------------------------------------------------ | ||||||
636 | # dates | ||||||
637 | |||||||
638 | 1 | 1 | 1486 | use constant RFC822_STRFTIME_FORMAT => '%a, %d %b %Y %H:%M:%S %z'; | |||
1 | 3 | ||||||
1 | 2122 | ||||||
639 | |||||||
640 | # return a string which is current time in RFC 822 format | ||||||
641 | sub rfc822_time_now { | ||||||
642 | 0 | 0 | 0 | return POSIX::strftime (RFC822_STRFTIME_FORMAT, localtime(time())); | |||
643 | } | ||||||
644 | |||||||
645 | sub isodate_to_rfc822 { | ||||||
646 | 0 | 0 | 0 | my ($isodate) = @_; | |||
647 | 0 | 0 | if (! defined $isodate) { return undef; } | ||||
0 | |||||||
648 | 0 | my $date = $isodate; # the original goes through if unrecognised | |||||
649 | |||||||
650 | 0 | 0 | 0 | if ($isodate =~ /\dT\d/ || $isodate =~ /^\d{4}-\d{2}-\d{2}$/) { | |||
651 | # eg. "2000-01-01T12:00+00:00" | ||||||
652 | # "2000-01-01T12:00:00Z" | ||||||
653 | # "2000-01-01" | ||||||
654 | 0 | 0 | my $zonestr = ($isodate =~ s/([+-][0-9][0-9]):([0-9][0-9])$// ? " $1$2" | ||||
0 | |||||||
655 | : $isodate =~ s/Z$// ? ' +0000' | ||||||
656 | : ''); | ||||||
657 | 0 | require Date::Parse; | |||||
658 | 0 | my $time_t = Date::Parse::str2time($isodate); | |||||
659 | 0 | 0 | if (defined $time_t) { | ||||
660 | 0 | $date = POSIX::strftime ("%a, %d %b %Y %H:%M:%S$zonestr", | |||||
661 | localtime ($time_t)); | ||||||
662 | } | ||||||
663 | } | ||||||
664 | 0 | return $date; | |||||
665 | } | ||||||
666 | |||||||
667 | # Return an RFC822 date string, or undef if nothing known. | ||||||
668 | # This gets a sensible sort-by-date in the newsreader. | ||||||
669 | # |
||||||
670 | # redundant. | ||||||
671 | # | ||||||
672 | sub item_to_date { | ||||||
673 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
674 | 0 | my $date; | |||||
675 | 0 | foreach my $elt ($item, item_to_channel($item)) { | |||||
676 | 0 | 0 | $date = (non_empty ($elt->first_child_trimmed_text('pubDate')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
677 | // non_empty ($elt->first_child_trimmed_text('dc:date')) | ||||||
678 | // non_empty ($elt->first_child_trimmed_text('jf:creationDate')) | ||||||
679 | # Atom | ||||||
680 | // non_empty ($elt->first_child_trimmed_text('modified')) | ||||||
681 | // non_empty ($elt->first_child_trimmed_text('updated')) | ||||||
682 | // non_empty ($elt->first_child_trimmed_text('issued')) | ||||||
683 | // non_empty ($elt->first_child_trimmed_text('dcterms:issued')) | ||||||
684 | // non_empty ($elt->first_child_trimmed_text('created')) | ||||||
685 | # channel | ||||||
686 | // non_empty ($elt->first_child_trimmed_text('lastBuildDate')) | ||||||
687 | # Atom | ||||||
688 | // non_empty ($elt->first_child_trimmed_text('published')) | ||||||
689 | ); | ||||||
690 | 0 | 0 | last if defined $date; | ||||
691 | } | ||||||
692 | 0 | return isodate_to_rfc822($date); | |||||
693 | } | ||||||
694 | @known{qw(/channel/dc:date | ||||||
695 | /channel/lastBuildDate | ||||||
696 | /channel/pubDate | ||||||
697 | /channel/updated | ||||||
698 | /channel/modified | ||||||
699 | |||||||
700 | /channel/item/dc:date | ||||||
701 | /channel/item/pubDate | ||||||
702 | /channel/item/updated | ||||||
703 | /channel/item/published | ||||||
704 | /channel/item/modified | ||||||
705 | /channel/item/created | ||||||
706 | /channel/item/issued | ||||||
707 | /channel/item/dcterms:issued | ||||||
708 | |||||||
709 | /channel/item/jf:creationDate --java-locale-human-readable | ||||||
710 | /channel/item/jf:modificationDate | ||||||
711 | /channel/item/jf:date --free-form | ||||||
712 | )} = (); | ||||||
713 | |||||||
714 | |||||||
715 | sub item_to_timet { | ||||||
716 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
717 | ### item_to_timet() ... | ||||||
718 | 0 | 0 | my $str = $self->item_to_date($item) | ||||
719 | // return - POSIX::DBL_MAX(); # no date fields | ||||||
720 | |||||||
721 | 0 | require Date::Parse; | |||||
722 | ### $str | ||||||
723 | # print Date::Parse::str2time($str)," $str\n"; | ||||||
724 | return (Date::Parse::str2time($str) | ||||||
725 | 0 | 0 | // do { | ||||
726 | 0 | say __x('Unrecognised date "{date}" from {url}', | |||||
727 | date => $str, | ||||||
728 | url => $self->{'uri'}); | ||||||
729 | 0 | - POSIX::DBL_MAX(); | |||||
730 | }); | ||||||
731 | } | ||||||
732 | |||||||
733 | #----------------------------------------------------------------------------- | ||||||
734 | # Message-ID | ||||||
735 | |||||||
736 | # Return a message ID for something at $uri, optionally uniquified by $str. | ||||||
737 | # $uri is either a URI object or a url string. | ||||||
738 | # Weird chars in $uri or $str are escaped as necessary. | ||||||
739 | # Secret $self->{'msgidextra'} can make different message ids for the same | ||||||
740 | # content when testing. | ||||||
741 | # | ||||||
742 | # The path from $uri is incorporated in the result. fetch_html() needs this | ||||||
743 | # since the ETag identifier is only per-url, not globally unique. Suspect | ||||||
744 | # fetch_rss() needs it for a guid too (a non-permaLink one), as think the | ||||||
745 | # guid is only unique within the particular $uri feed, not globally and not | ||||||
746 | # even across multiple feeds on the same server. | ||||||
747 | # | ||||||
748 | sub url_to_msgid { | ||||||
749 | 0 | 0 | 0 | my ($self, $url, $str) = @_; | |||
750 | |||||||
751 | 0 | my $host; | |||||
752 | 0 | my $pathbit = $url; | |||||
753 | |||||||
754 | 0 | 0 | if (my $uri = eval { URI->new($url) }) { | ||||
0 | |||||||
755 | 0 | $uri = $uri->canonical; | |||||
756 | 0 | 0 | if ($uri->can('host')) { | ||||
0 | |||||||
757 | 0 | $host = $uri->host; | |||||
758 | 0 | $uri->host(''); | |||||
759 | 0 | $pathbit = $uri->as_string; | |||||
760 | |||||||
761 | # If the $uri schema has a host part but it's empty or "localhost" | ||||||
762 | # then try expanding that to hostname(). | ||||||
763 | # | ||||||
764 | # $uri schemas without a host part, like "urn:" in an Atom |
||||||
765 | # get hostname(), since want the generated msgid to come out the same | ||||||
766 | # if such a urn: appears from different downloaded locations. | ||||||
767 | # | ||||||
768 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
769 | 0 | require Sys::Hostname; | |||||
770 | 0 | eval { $host = Sys::Hostname::hostname() }; | |||||
0 | |||||||
771 | } | ||||||
772 | |||||||
773 | } elsif ($uri->can('authority')) { | ||||||
774 | # the "authority" part of a "tag:" schema | ||||||
775 | 0 | $host = $uri->authority; | |||||
776 | 0 | $uri->authority(''); | |||||
777 | 0 | $pathbit = $uri->as_string; | |||||
778 | } | ||||||
779 | } | ||||||
780 | |||||||
781 | # $host can be empty if running from a file:/// | ||||||
782 | # "localhost" is a bit bogus and in particular leafnode won't accept it. | ||||||
783 | # ".invalid" as per RFC 2606 | ||||||
784 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
785 | 0 | $host = 'rss2leafnode.invalid'; | |||||
786 | } | ||||||
787 | |||||||
788 | # ipv6 dotted hostname "[1234:5678::0000]" -> "1234.5678.0000..ipv6", | ||||||
789 | # because [ and : are not allowed (RFC 2822 "Atom" atext) | ||||||
790 | # $uri->canonical above lower cases any hex, for consistency | ||||||
791 | 0 | 0 | if (($host =~ s/^\[|\]$//g) | ($host =~ tr/:/./)) { | ||||
792 | 0 | $host .= '.ipv6'; | |||||
793 | } | ||||||
794 | |||||||
795 | # leafnode 2.0.0.alpha20070602a seems to insist on a "." in the host name | ||||||
796 | 0 | 0 | unless ($host =~ /\./) { | ||||
797 | 0 | $host .= '.withadot'; | |||||
798 | } | ||||||
799 | |||||||
800 | 0 | return ('<' | |||||
801 | . msgid_chars(join_non_empty('.', | ||||||
802 | "rss2leafnode" . $self->{'msgidextra'}, | ||||||
803 | $pathbit, | ||||||
804 | $str)) | ||||||
805 | . '@' | ||||||
806 | . msgid_chars($host) | ||||||
807 | . '>'); | ||||||
808 | } | ||||||
809 | # msgid_chars($str) returns $str with invalid Message-ID characters munged. | ||||||
810 | # Per RFC850 must be printing ascii and not < > or whitespace, but for | ||||||
811 | # safety reduce that a bit, in particular excluding ' and ". | ||||||
812 | sub msgid_chars { | ||||||
813 | 0 | 0 | 0 | my ($str) = @_; | |||
814 | 0 | require URI::Escape; | |||||
815 | 0 | return URI::Escape::uri_escape_utf8 ($str, "^A-Za-z0-9\\-_.!~*/:"); | |||||
816 | } | ||||||
817 | |||||||
818 | #------------------------------------------------------------------------------ | ||||||
819 | # news posting | ||||||
820 | # | ||||||
821 | # This used to run the "rnews" program, which in leafnode 2 does some direct | ||||||
822 | # writing to the spool. But that requires user "news" perms, and as of the | ||||||
823 | # June 2007 leafnode beta it tends to be a good deal slower because it reads | ||||||
824 | # the whole groupinfo file. It has the advantage of not being picky about | ||||||
825 | # message ID hostnames, and allowing read-only groups to be filled. But | ||||||
826 | # apart from that plain POST seems much easier for being "server neutral". | ||||||
827 | # | ||||||
828 | # IHAVE instead of POST would be a possibility, when available, though POST | ||||||
829 | # is probably more accurate in the sense it's a new article coming into the | ||||||
830 | # news system. | ||||||
831 | # | ||||||
832 | # Net::NNTP looks at $ENV{NNTPSERVER}, $ENV{NEWSHOST} and Net::Config | ||||||
833 | # nntp_hosts list for the news server. Maybe could have that here too, | ||||||
834 | # instead of always defaulting to localhost (in $self->{'nntp_host'}). | ||||||
835 | # Would want to find out the name chosen to show in diagnostics though. | ||||||
836 | |||||||
837 | # return a string "host:port", suitable for the Host arg to Net::NNTP->new | ||||||
838 | sub uri_to_nntp_host { | ||||||
839 | 0 | 0 | 0 | my ($uri) = @_; | |||
840 | 0 | 0 | return (non_empty($uri->host) // 'localhost') . ':' . $uri->port; | ||||
841 | } | ||||||
842 | |||||||
843 | sub nntp { | ||||||
844 | 0 | 0 | 0 | my ($self) = @_; | |||
845 | # reopen if different 'nntp_host' | ||||||
846 | 0 | 0 | 0 | if (! $self->{'nntp'} | |||
847 | || $self->{'nntp'}->host ne $self->{'nntp_host'}) { | ||||||
848 | 0 | my $host = $self->{'nntp_host'}; | |||||
849 | 0 | $self->verbose (1, __x("nntp: {host}", host => $host)); | |||||
850 | 0 | require Net::NNTP; | |||||
851 | 0 | 0 | my $nntp = $self->{'nntp'} | ||||
852 | = Net::NNTP->new ($host, ($self->{'verbose'} >= 2 | ||||||
853 | ? (Debug => 1) | ||||||
854 | : ())); | ||||||
855 | 0 | 0 | if (! $nntp) { | ||||
856 | 0 | croak __x("Cannot connect to NNTP on \"{host}\"\n", host => $host); | |||||
857 | } | ||||||
858 | 0 | 0 | if (! $nntp->postok) { | ||||
859 | 0 | $self->verbose (1, "Hmm, ", $nntp->host, " doesn't say \"posting ok\" ..."); | |||||
860 | } | ||||||
861 | } | ||||||
862 | 0 | return $self->{'nntp'}; | |||||
863 | } | ||||||
864 | |||||||
865 | sub nntp_close { | ||||||
866 | 0 | 0 | 0 | my ($self) = @_; | |||
867 | 0 | 0 | if (my $nntp = delete $self->{'nntp'}) { | ||||
868 | 0 | 0 | if (! $nntp->quit) { | ||||
869 | 0 | say "Error closing nntp: ",$nntp->message; | |||||
870 | } | ||||||
871 | } | ||||||
872 | } | ||||||
873 | |||||||
874 | # check that $group exists in the NNTP, return 1 if so, or 0 if not | ||||||
875 | sub nntp_group_check { | ||||||
876 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
877 | 0 | my $nntp = $self->nntp; | |||||
878 | 0 | 0 | if (! $nntp->group($group)) { | ||||
879 | 0 | print __x("rss2leafnode: no group \"{group}\" on host \"{host}\" | |||||
880 | (See the rss2leafnode man page for notes on creating groups.) | ||||||
881 | ", | ||||||
882 | host => $nntp->host, | ||||||
883 | group => $group); | ||||||
884 | 0 | return 0; | |||||
885 | } | ||||||
886 | |||||||
887 | 0 | return 1; | |||||
888 | } | ||||||
889 | |||||||
890 | sub nntp_message_id_exists { | ||||||
891 | 0 | 0 | 0 | my ($self, $msgid) = @_; | |||
892 | 0 | my $ret = $self->nntp->nntpstat($msgid); | |||||
893 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
0 | |||||||
894 | 0 | 0 | $self->verbose (2, "'$msgid' ", ($ret ? 'exists already' : 'new')); | ||||
895 | } elsif ($self->{'verbose'} >= 1) { | ||||||
896 | 0 | 0 | if ($ret) { | ||||
897 | 0 | $self->verbose (1, ' ', __('exists already')); | |||||
898 | } | ||||||
899 | } | ||||||
900 | 0 | return $ret; | |||||
901 | } | ||||||
902 | |||||||
903 | # post $msg to NNTP, return true if successful | ||||||
904 | sub nntp_post { | ||||||
905 | 0 | 0 | 0 | my ($self, $msg) = @_; | |||
906 | 0 | my $nntp = $self->nntp; | |||||
907 | 0 | 0 | if (! $nntp->post ($msg->as_string)) { | ||||
908 | 0 | say __x('Cannot post: {message}', | |||||
909 | message => scalar($nntp->message)); | ||||||
910 | 0 | return 0; | |||||
911 | } | ||||||
912 | 0 | return 1; | |||||
913 | } | ||||||
914 | |||||||
915 | |||||||
916 | #------------------------------------------------------------------------------ | ||||||
917 | # HTML title | ||||||
918 | |||||||
919 | # extra data associated against a HTTP::Response object | ||||||
920 | Hash::Util::FieldHash::fieldhash (my %resp_exiftool_info); | ||||||
921 | |||||||
922 | # return hashref { Title => $str, ... }, or empty {} if no exiftool etc | ||||||
923 | sub resp_exiftool_info { | ||||||
924 | 0 | 0 | 0 | my ($resp) = @_; | |||
925 | 0 | 0 | defined $resp or return {}; | ||||
926 | 0 | 0 | if (! exists $resp_exiftool_info{$resp}) { | ||||
927 | 0 | $resp_exiftool_info{$resp} = _resp_exiftool_info($resp); | |||||
928 | ### exiftool info: $resp_exiftool_info{$resp} | ||||||
929 | } | ||||||
930 | 0 | return $resp_exiftool_info{$resp}; | |||||
931 | } | ||||||
932 | sub _resp_exiftool_info { | ||||||
933 | 0 | 0 | my ($resp) = @_; | ||||
934 | |||||||
935 | # Want ExifTool 8.22 to have PNG tEXt returned as utf8, but don't bother | ||||||
936 | # to enforce that. | ||||||
937 | # | ||||||
938 | # The returned fields from image formats with a defined charset are | ||||||
939 | # converted to the exiftool default "Charset" of utf8, and from other | ||||||
940 | # image formats the fields are bytes of something unknown. Might slightly | ||||||
941 | # like to know which is the case, and show raw bytes different from "bytes | ||||||
942 | # which ought to be utf8", but for now just Encode::decode_utf8() and let | ||||||
943 | # its Encode::FB_DEFAULT() put substitution chars for non-ascii non-utf8. | ||||||
944 | # | ||||||
945 | 0 | 0 | eval { require Image::ExifTool; 1 } || return {}; | ||||
0 | |||||||
0 | |||||||
946 | 0 | $resp->decode; | |||||
947 | 0 | my $cref = $resp->content_ref; | |||||
948 | 0 | return Image::ExifTool::ImageInfo | |||||
949 | ($cref, | ||||||
950 | ['Title','Author','Copyright','ImageSize'], # just these tags | ||||||
951 | {List => 0}); # get list values as comma separated | ||||||
952 | } | ||||||
953 | |||||||
954 | # $resp is a HTTP::Response, return title | ||||||
955 | sub html_title { | ||||||
956 | 0 | 0 | 0 | my ($resp) = @_; | |||
957 | |||||||
958 | return (# for images prefer filename+size over URI::Title just filename | ||||||
959 | 0 | 0 | non_empty (html_title_exiftool_image($resp)) | ||||
0 | |||||||
0 | |||||||
960 | |||||||
961 | // non_empty (html_title_urititle($resp)) | ||||||
962 | // non_empty (html_title_exiftool($resp)) | ||||||
963 | // $resp->title); | ||||||
964 | } | ||||||
965 | sub html_title_urititle { | ||||||
966 | 0 | 0 | 0 | my ($resp) = @_; | |||
967 | 0 | 0 | eval { require URI::Title } or return undef; | ||||
0 | |||||||
968 | |||||||
969 | # suppress some dodginess in URI::Title 1.82 | ||||||
970 | local $SIG{'__WARN__'} = sub { | ||||||
971 | 0 | 0 | my ($msg) = @_; | ||||
972 | 0 | 0 | $msg =~ /Use of uninitialized value/ or warn @_; | ||||
973 | 0 | }; | |||||
974 | 0 | $resp->decode; | |||||
975 | 0 | 0 | return URI::Title::title | ||||
976 | ({ url => ($resp->request->uri // ''), | ||||||
977 | data => $resp->content}); | ||||||
978 | } | ||||||
979 | sub html_title_exiftool_image { | ||||||
980 | 0 | 0 | 0 | my ($resp) = @_; | |||
981 | 0 | 0 | $resp->content_type =~ m{^image/} or return; | ||||
982 | 0 | 0 | if (defined (my $title = html_title_exiftool($resp))) { | ||||
983 | 0 | return $title; | |||||
984 | } | ||||||
985 | 0 | 0 | my $info = resp_exiftool_info($resp) // return; | ||||
986 | ### html_title_exiftool_image() on: $info | ||||||
987 | 0 | 0 | defined $info->{'ImageSize'} or return; | ||||
988 | 0 | return $resp->filename.' '.$info->{'ImageSize'}; | |||||
989 | } | ||||||
990 | sub html_title_exiftool { | ||||||
991 | 0 | 0 | 0 | my ($resp) = @_; | |||
992 | 0 | 0 | my $title = resp_exiftool_info($resp)->{'Title'} // return; | ||||
993 | 0 | return Encode::decode_utf8 ($title); | |||||
994 | } | ||||||
995 | |||||||
996 | |||||||
997 | #------------------------------------------------------------------------------ | ||||||
998 | # mime | ||||||
999 | |||||||
1000 | # prepended to "X-Mailer" header | ||||||
1001 | 1 | 1 | 6 | use constant mime_mailer_extra => "RSS2Leafnode $VERSION"; | |||
1 | 2 | ||||||
1 | 1777 | ||||||
1002 | |||||||
1003 | # $body is a MIME::Body object, append $str to it | ||||||
1004 | sub mime_body_append { | ||||||
1005 | 0 | 0 | 0 | my ($body, $str) = @_; | |||
1006 | 0 | $str = $body->as_string . "\n" . str_ensure_newline ($str); | |||||
1007 | 0 | 0 | my $IO = $body->open('w') | ||||
1008 | or die "rss2leafnode: body I/O open: $!"; | ||||||
1009 | 0 | $IO->print ($str); | |||||
1010 | 0 | 0 | $IO->close | ||||
1011 | or die "rss2leafnode: body I/O close: $!"; | ||||||
1012 | } | ||||||
1013 | |||||||
1014 | # if $str is not ascii then apply encode_mimewords() | ||||||
1015 | sub mimewords_non_ascii { | ||||||
1016 | 0 | 0 | 0 | my ($str) = @_; | |||
1017 | 0 | 0 | 0 | if (defined $str && ! is_ascii($str)) { | |||
1018 | 0 | require MIME::Words; | |||||
1019 | 0 | $str = MIME::Words::encode_mimewords (Encode::encode_utf8($str), | |||||
1020 | Charset => 'UTF-8'); | ||||||
1021 | } | ||||||
1022 | 0 | return $str; | |||||
1023 | } | ||||||
1024 | |||||||
1025 | sub mime_build { | ||||||
1026 | 0 | 0 | 0 | my ($self, $headers, @args) = @_; | |||
1027 | |||||||
1028 | # Headers in utf-8, the same as other text. The docs of | ||||||
1029 | # encode_mimewords() isn't clear, but seems to expect bytes of the | ||||||
1030 | # specified charset. | ||||||
1031 | 0 | foreach my $key (sort keys %$headers) { | |||||
1032 | 0 | $headers->{$key} | |||||
1033 | = mimewords_non_ascii(Text::Trim::trim($headers->{$key})); | ||||||
1034 | } | ||||||
1035 | |||||||
1036 | 0 | %$headers = (%$headers, @args); | |||||
1037 | 0 | 0 | $headers->{'Top'} //= 0; # default to a part not a toplevel | ||||
1038 | 0 | 0 | $headers->{'Encoding'} //= '-SUGGEST'; | ||||
1039 | |||||||
1040 | 0 | 0 | if ($headers->{'Top'}) { | ||||
1041 | 0 | my $now822 = rfc822_time_now(); | |||||
1042 | 0 | 0 | $headers->{'Date'} //= $now822; | ||||
1043 | 0 | $headers->{'Date-Received:'} = $now822; | |||||
1044 | } | ||||||
1045 | |||||||
1046 | 0 | 0 | if (utf8::is_utf8($headers->{'Data'})) { | ||||
1047 | 0 | warn 'Oops, mime_build() data should be bytes'; | |||||
1048 | } | ||||||
1049 | |||||||
1050 | # downgrade utf-8 to us-ascii if possible | ||||||
1051 | 0 | 0 | 0 | if ($headers->{'Type'} eq 'text/plain' | |||
0 | |||||||
0 | |||||||
1052 | && lc($headers->{'Charset'}||0) eq 'utf-8' | ||||||
1053 | && is_ascii ($headers->{'Data'})) { | ||||||
1054 | 0 | $headers->{'Charset'} = 'us-ascii'; | |||||
1055 | |||||||
1056 | # not sure mangling text/html body content is a good idea -- would only | ||||||
1057 | # want it on generated html, not downloaded | ||||||
1058 | # | ||||||
1059 | # if ($headers->{'Type'} eq 'text/html') { | ||||||
1060 | # $headers->{'Data'} =~ s{( | ||||||
1061 | # } | ||||||
1062 | } | ||||||
1063 | |||||||
1064 | 0 | @args = map {$_,$headers->{$_}} sort keys %$headers; | |||||
0 | |||||||
1065 | 0 | 0 | if ($self->{'verbose'} >= 4) { | ||||
1066 | 0 | require Data::Dumper; | |||||
1067 | 0 | $self->verbose (4, Data::Dumper->new([\@args],['mime headers'])->Dump); | |||||
1068 | } | ||||||
1069 | |||||||
1070 | 0 | require MIME::Entity; | |||||
1071 | 0 | my $top = MIME::Entity->build (Disposition => 'inline', @args); | |||||
1072 | |||||||
1073 | 0 | 0 | 0 | if ($headers->{'Top'} && ! defined $headers->{'X-Mailer:'}) { | |||
1074 | 0 | my $head = $top->head; | |||||
1075 | 0 | $head->set('X-Mailer', join_non_empty (', ', | |||||
1076 | $self->mime_mailer_extra, | ||||||
1077 | $head->get('X-Mailer'))); | ||||||
1078 | } | ||||||
1079 | |||||||
1080 | 0 | return $top; | |||||
1081 | } | ||||||
1082 | |||||||
1083 | # $resp is a HTTP::Response | ||||||
1084 | # Return a MIME::Entity which contains the response, and any further @headers. | ||||||
1085 | # If $self->{'render'} is true then render HTML to plain text. | ||||||
1086 | # | ||||||
1087 | sub mime_part_from_response { | ||||||
1088 | 0 | 0 | 0 | my ($self, $resp, @headers) = @_; | |||
1089 | |||||||
1090 | 0 | my $content_type = $resp->content_type; | |||||
1091 | 0 | $self->verbose (2, ' content-type: ',$content_type); | |||||
1092 | 0 | $resp->decode; | |||||
1093 | 0 | my $content = $resp->content; # the bytes | |||||
1094 | 0 | my $charset = $resp->content_charset; # and their charset | |||||
1095 | 0 | my $url = $resp->request->uri->as_string; | |||||
1096 | 0 | my $content_md5 = $resp->header('Content-MD5'); | |||||
1097 | |||||||
1098 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
1099 | = $self->render_maybe ($content, $content_type, $charset, $url); | ||||||
1100 | 0 | 0 | if ($rendered) { | ||||
1101 | 0 | undef $content_md5; | |||||
1102 | } | ||||||
1103 | |||||||
1104 | 0 | return $self->mime_build | |||||
1105 | ({ 'Content-Language:' => scalar($resp->header('Content-Language')), | ||||||
1106 | 'Content-Location:' => $url, | ||||||
1107 | 'Content-MD5:' => $content_md5, | ||||||
1108 | @headers, | ||||||
1109 | }, | ||||||
1110 | Type => $content_type, | ||||||
1111 | Charset => $charset, | ||||||
1112 | Data => $content, | ||||||
1113 | Filename => $resp->filename); | ||||||
1114 | } | ||||||
1115 | |||||||
1116 | |||||||
1117 | # set "Lines:" header per RFC 1036 | ||||||
1118 | # MIME::Entity 5.428 doesn't seem to have anything for this itself | ||||||
1119 | # this is after qp or base64, is that right? the actual message lines | ||||||
1120 | sub mime_entity_lines { | ||||||
1121 | 0 | 0 | 0 | my ($top) = @_; | |||
1122 | 0 | $top->head->set('Lines', str_count_lines ($top->stringify_body)); | |||||
1123 | } | ||||||
1124 | |||||||
1125 | #------------------------------------------------------------------------------ | ||||||
1126 | # XML::Twig stuff | ||||||
1127 | |||||||
1128 | # Return the text of $elt and treat child elements as improperly escaped | ||||||
1129 | # parts of the text too. | ||||||
1130 | # | ||||||
1131 | # This is good for elements which are supposed to be HTML with etc |
||||||
1132 | # escaped as <p>, but copes with feeds that don't have the necessary | ||||||
1133 | # escapes and thus come out with xml child elements under $elt. | ||||||
1134 | # | ||||||
1135 | # For elements which are supposed to be plain text with no markup and no | ||||||
1136 | # sub-elements this will at least make improper child text visible, though | ||||||
1137 | # it might not look very good. | ||||||
1138 | # | ||||||
1139 | # As of June 2010 http://www.drweil.com/drw/ecs/rss.xml is an example of | ||||||
1140 | # improperly escaped html. | ||||||
1141 | # | ||||||
1142 | # FIXME: Any need to watch out for |
||||||
1143 | # | ||||||
1144 | sub elt_subtext { | ||||||
1145 | 0 | 0 | 0 | my ($elt) = @_; | |||
1146 | 0 | 0 | defined $elt or return undef; | ||||
1147 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
0 | |||||||
1148 | 0 | return join ('', map {_elt_subtext_with_tags($_)} $elt->children); | |||||
0 | |||||||
1149 | } | ||||||
1150 | sub _elt_subtext_with_tags { | ||||||
1151 | 0 | 0 | my ($elt) = @_; | ||||
1152 | 0 | 0 | defined $elt or return undef; | ||||
1153 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
0 | |||||||
1154 | 0 | return ($elt->start_tag | |||||
1155 | 0 | . join ('', map {_elt_subtext_with_tags($_)} $elt->children) | |||||
1156 | . $elt->end_tag); | ||||||
1157 | } | ||||||
1158 | |||||||
1159 | # $elt contains xhtml etc sub-elements. Return a plain html string. |
||||||
1160 | # Prefixes like |
||||||
1161 | # This relies on the map_xmlns mapping to give prefix "xhtml:" | ||||||
1162 | # | ||||||
1163 | sub elt_xhtml_to_html { | ||||||
1164 | 0 | 0 | 0 | my ($elt) = @_; | |||
1165 | |||||||
1166 | # could probably do it destructively, but just in case | ||||||
1167 | 0 | $elt = $elt->copy; | |||||
1168 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($elt, 'xhtml'); | |||||
1169 | |||||||
1170 | # lose xmlns:xhtml="http://www.w3.org/1999/xhtml" | ||||||
1171 | 0 | $elt->strip_att('xmlns:xhtml'); | |||||
1172 | |||||||
1173 | # something fishy turns "href" to "xhtml:href", drop any "xhtml:" | ||||||
1174 | # bare "href" also gets turned into atom:href as the default namespace, | ||||||
1175 | # drop any "atom:" | ||||||
1176 | 0 | foreach my $child ($elt->descendants) { | |||||
1177 | 0 | foreach my $attname ($child->att_names) { | |||||
1178 | 0 | 0 | if ($attname =~ /^(xhtml|atom):(.*)/) { | ||||
1179 | 0 | $child->change_att_name($attname, $2); | |||||
1180 | } | ||||||
1181 | } | ||||||
1182 | } | ||||||
1183 | |||||||
1184 | 0 | my $old_pretty = $elt->set_pretty_print ('none'); | |||||
1185 | ### $old_pretty | ||||||
1186 | 0 | my $ret = $elt->xml_string; | |||||
1187 | 0 | $elt->set_pretty_print ($old_pretty); | |||||
1188 | 0 | return $ret; | |||||
1189 | |||||||
1190 | } | ||||||
1191 | |||||||
1192 | # elt_content_type() returns 'text', 'html', 'xhtml' or a mime type. | ||||||
1193 | # If no type="" attribute the default is 'text', except for RSS | ||||||
1194 | # |
||||||
1195 | # | ||||||
1196 | # RSS http://www.debian.org/News/weekly/dwn.en.rdf circa Feb 2010 had some | ||||||
1197 | # html in its |
||||||
1198 | # plain text) and that RSS is all plain text outside |
||||||
1199 | # | ||||||
1200 | # |
||||||
1201 | # not the formatting as html vs text. | ||||||
1202 | # | ||||||
1203 | @known{'/channel/item/dc:type'} = undef; | ||||||
1204 | # | ||||||
1205 | sub elt_content_type { | ||||||
1206 | 0 | 0 | 0 | my ($elt) = @_; | |||
1207 | 0 | 0 | if (! defined $elt) { return undef; } | ||||
0 | |||||||
1208 | |||||||
1209 | 0 | 0 | 0 | if (defined (my $type = ($elt->att('atom:type') // $elt->att('type')))) { | |||
1210 | # type="application/xhtml+xml" at http://xmltwig.com/blog/index.atom, | ||||||
1211 | # dunno if it should be just "xhtml", but recognise it anyway | ||||||
1212 | 0 | 0 | if ($type eq 'application/xhtml+xml') { return 'xhtml'; } | ||||
0 | |||||||
1213 | 0 | return $type; | |||||
1214 | } | ||||||
1215 | 0 | 0 | if ($elt->root->tag eq 'feed') { | ||||
1216 | 0 | return 'text'; # Atom |
|||||
1217 | } | ||||||
1218 | 0 | my $tag = $elt->tag; | |||||
1219 | 0 | 0 | if ($tag =~ /^itunes:/) { | ||||
1220 | # itunes spec is for text-only, no html markup | ||||||
1221 | 0 | return 'text'; | |||||
1222 | } | ||||||
1223 | 0 | 0 | 0 | if ($tag eq 'description' # RSS |
|||
1224 | || $tag eq 'content:encoded') { # same in content:encoded | ||||||
1225 | 0 | return 'html'; | |||||
1226 | } | ||||||
1227 | # other RSS is text | ||||||
1228 | 0 | return 'text'; | |||||
1229 | } | ||||||
1230 | |||||||
1231 | # $elt is an XML::Twig::Elt of an RSS or Atom text element. | ||||||
1232 | # Atom has a type="" attribute, RSS is html. Html or xhtml are rendered to | ||||||
1233 | # a single long line of plain text. | ||||||
1234 | # | ||||||
1235 | sub elt_to_rendered_line { | ||||||
1236 | 0 | 0 | 0 | my ($elt) = @_; | |||
1237 | 0 | 0 | defined $elt or return; | ||||
1238 | |||||||
1239 | 0 | my $str; | |||||
1240 | 0 | my $type = elt_content_type ($elt); | |||||
1241 | 0 | 0 | if ($type eq 'xhtml') { | ||||
1242 | 0 | $str = elt_xhtml_to_html ($elt); | |||||
1243 | 0 | $type = 'html'; | |||||
1244 | } else { | ||||||
1245 | 0 | $str = elt_subtext($elt); | |||||
1246 | } | ||||||
1247 | 0 | 0 | if ($type eq 'html') { | ||||
1248 | 0 | $str = html_to_rendered_line($str); | |||||
1249 | } | ||||||
1250 | # plain 'text' or anything unrecognised collapsed too | ||||||
1251 | 0 | return non_empty(collapse_whitespace($str)); | |||||
1252 | } | ||||||
1253 | |||||||
1254 | sub html_to_rendered_line { | ||||||
1255 | 0 | 0 | 0 | my ($html) = @_; | |||
1256 | 0 | require HTML::FormatText; | |||||
1257 | 0 | return collapse_whitespace | |||||
1258 | (HTML::FormatText->format_string ($html, | ||||||
1259 | leftmargin => 0, | ||||||
1260 | rightmargin => 999)); | ||||||
1261 | } | ||||||
1262 | |||||||
1263 | |||||||
1264 | #------------------------------------------------------------------------------ | ||||||
1265 | # XML::RSS::Timing | ||||||
1266 | |||||||
1267 | sub twig_to_timingfields { | ||||||
1268 | 0 | 0 | 0 | my ($self, $twig) = @_; | |||
1269 | 0 | 0 | return if ! defined $twig; | ||||
1270 | 0 | my $root = $twig->root; | |||||
1271 | 0 | my %timingfields; | |||||
1272 | |||||||
1273 | 0 | 0 | if (my $ttl = $root->first_descendant('ttl')) { | ||||
1274 | 0 | $timingfields{'ttl'} = $ttl->trimmed_text; | |||||
1275 | } | ||||||
1276 | 0 | 0 | if (my $skipHours = $root->first_descendant('skipHours')) { | ||||
1277 | 0 | $timingfields{'skipHours'} = [map {$_->trimmed_text} $skipHours->children('hour')]; | |||||
0 | |||||||
1278 | } | ||||||
1279 | 0 | 0 | if (my $skipDays = $root->first_descendant('skipDays')) { | ||||
1280 | 0 | $timingfields{'skipDays'} = [map {$_->trimmed_text} $skipDays->children('day')]; | |||||
0 | |||||||
1281 | } | ||||||
1282 | |||||||
1283 | # "syn:updatePeriod" etc | ||||||
1284 | 0 | foreach my $key (qw(updatePeriod updateFrequency updateBase)) { | |||||
1285 | 0 | 0 | if (my $update = $root->first_descendant("syn:$key")) { | ||||
1286 | 0 | $timingfields{$key} = $update->trimmed_text; | |||||
1287 | } | ||||||
1288 | } | ||||||
1289 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
1290 | 0 | require Data::Dumper; | |||||
1291 | 0 | $self->verbose (2, | |||||
1292 | Data::Dumper->new([\%timingfields],['timingfields']) | ||||||
1293 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
1294 | } | ||||||
1295 | 0 | 0 | if (! %timingfields) { | ||||
1296 | 0 | return; # no info | |||||
1297 | } | ||||||
1298 | |||||||
1299 | # if XML::RSS::Timing doesn't like the values then don't record them | ||||||
1300 | 0 | 0 | return unless $self->timingfields_to_timing(\%timingfields); | ||||
1301 | |||||||
1302 | 0 | return \%timingfields; | |||||
1303 | } | ||||||
1304 | @known{qw(/channel/skipDays | ||||||
1305 | /channel/skipDays/day | ||||||
1306 | /channel/skipHours | ||||||
1307 | /channel/skipHours/hour | ||||||
1308 | /channel/ttl | ||||||
1309 | /channel/syn:updateBase | ||||||
1310 | /channel/syn:updatePeriod | ||||||
1311 | /channel/syn:updateFrequency)} = (); | ||||||
1312 | |||||||
1313 | # return an XML::RSS::Timing object, or undef | ||||||
1314 | sub timingfields_to_timing { | ||||||
1315 | 0 | 0 | 0 | my ($self, $timingfields) = @_; | |||
1316 | 0 | 0 | $timingfields // return undef; | ||||
1317 | |||||||
1318 | 0 | 0 | eval { require XML::RSS::Timing } || return undef; | ||||
0 | |||||||
1319 | 0 | my $timing = XML::RSS::Timing->new; | |||||
1320 | 0 | $timing->use_exceptions(0); | |||||
1321 | 0 | while (my ($key, $value) = each %$timingfields) { | |||||
1322 | 0 | 0 | if (ref $value) { | ||||
1323 | 0 | $timing->$key (@$value); | |||||
1324 | } else { | ||||||
1325 | 0 | $timing->$key ($value); | |||||
1326 | } | ||||||
1327 | } | ||||||
1328 | 0 | 0 | if (my @complaints = $timing->complaints) { | ||||
1329 | 0 | say __x('XML::RSS::Timing complains about {url}', | |||||
1330 | url => $self->{'uri'}); | ||||||
1331 | 0 | foreach my $complaint (@complaints) { | |||||
1332 | 0 | say " $complaint"; | |||||
1333 | } | ||||||
1334 | 0 | return undef; | |||||
1335 | } | ||||||
1336 | 0 | return $timing; | |||||
1337 | } | ||||||
1338 | |||||||
1339 | |||||||
1340 | #------------------------------------------------------------------------------ | ||||||
1341 | # rss2leafnode.status file | ||||||
1342 | |||||||
1343 | # $self->{'global_status'} is a hashref containing entries URL => STATUS, | ||||||
1344 | # where URL is a string and STATUS is a sub-hashref of information | ||||||
1345 | |||||||
1346 | 1 | 1 | 6 | use constant STATUS_EXPIRE_DAYS => 45; | |||
1 | 1 | ||||||
1 | 3407 | ||||||
1347 | |||||||
1348 | # read $status_filename into $self->{'global_status'} | ||||||
1349 | sub status_read { | ||||||
1350 | 0 | 0 | 0 | my ($self) = @_; | |||
1351 | 0 | $self->{'global_status'} = {}; | |||||
1352 | 0 | my $status_filename = $self->status_filename; | |||||
1353 | 0 | $self->verbose (2, 'read status: ', $status_filename); | |||||
1354 | |||||||
1355 | 0 | $! = 0; | |||||
1356 | 0 | my $global_status = do $status_filename; | |||||
1357 | 0 | 0 | if (! defined $global_status) { | ||||
1358 | 0 | 0 | if ($! == POSIX::ENOENT()) { | ||||
1359 | 0 | $self->verbose (2, "status file doesn't exist"); | |||||
1360 | } else { | ||||||
1361 | 0 | say "rss2leafnode: error in $status_filename\n$@"; | |||||
1362 | 0 | say "ignoring that file"; | |||||
1363 | } | ||||||
1364 | 0 | $global_status = {}; | |||||
1365 | } | ||||||
1366 | 0 | $self->{'global_status'} = $global_status; | |||||
1367 | } | ||||||
1368 | |||||||
1369 | # delete old entries from $self->{'global_status'} | ||||||
1370 | sub status_prune { | ||||||
1371 | 0 | 0 | 0 | my ($self) = @_; | |||
1372 | 0 | 0 | my $global_status = $self->{'global_status'} // return; | ||||
1373 | 0 | my $pruned = 0; | |||||
1374 | 0 | my $old_time = time() - STATUS_EXPIRE_DAYS * 86400; | |||||
1375 | 0 | foreach my $key (keys %$global_status) { | |||||
1376 | 0 | 0 | if ($global_status->{$key}->{'status-time'} < $old_time) { | ||||
1377 | 0 | $self->verbose (2, __x("discard old status {url}\n", url => $key)); | |||||
1378 | 0 | delete $global_status->{$key}; | |||||
1379 | 0 | $pruned++; | |||||
1380 | } | ||||||
1381 | } | ||||||
1382 | 0 | 0 | if ($pruned) { | ||||
1383 | 0 | $self->verbose (1, __xn("discard {count} old status entry\n", | |||||
1384 | "discard {count} old status entries\n", | ||||||
1385 | $pruned, | ||||||
1386 | count => $pruned)); | ||||||
1387 | } | ||||||
1388 | } | ||||||
1389 | |||||||
1390 | # save $self->{'global_status'} into the $status_filename | ||||||
1391 | sub status_save { | ||||||
1392 | 0 | 0 | 0 | my ($self, $status) = @_; | |||
1393 | 0 | $status->{'status-time'} = time(); | |||||
1394 | 0 | 0 | if ($status->{'timingfields'}) { | ||||
1395 | 0 | $status->{'timingfields'}->{'lastPolled'} = $status->{'status-time'}; | |||||
1396 | } | ||||||
1397 | |||||||
1398 | 0 | $self->status_prune; | |||||
1399 | |||||||
1400 | 0 | require Data::Dumper; | |||||
1401 | 0 | my $str = Data::Dumper->new([$self->{'global_status'}],['global_status']) | |||||
1402 | ->Indent(1)->Sortkeys(1)->Terse(1)->Useqq(1)->Dump; | ||||||
1403 | 0 | $str = <<"HERE"; | |||||
1404 | # rss2leafnode status file -- automatically generated -- DO NOT EDIT | ||||||
1405 | # | ||||||
1406 | # (If there seems to be something very wrong then you can delete this file | ||||||
1407 | # and it'll be started afresh on the next run.) | ||||||
1408 | |||||||
1409 | $str | ||||||
1410 | |||||||
1411 | |||||||
1412 | # Local variables: | ||||||
1413 | # mode: perl-mode | ||||||
1414 | # End: | ||||||
1415 | HERE | ||||||
1416 | |||||||
1417 | 0 | my $status_filename = $self->status_filename; | |||||
1418 | 0 | my $out; | |||||
1419 | 0 | 0 | 0 | (open $out, '>', $status_filename | |||
0 | |||||||
1420 | and print $out $str | ||||||
1421 | and close $out) | ||||||
1422 | or croak "rss2leafnode: cannot write to $status_filename: $!\n"; | ||||||
1423 | } | ||||||
1424 | |||||||
1425 | # return a hashref which has status information about $url, or undef if | ||||||
1426 | # nothing recorded about $url | ||||||
1427 | sub status_geturl { | ||||||
1428 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
1429 | 0 | 0 | $self->status_read if ! $self->{'global_status'}; | ||||
1430 | 0 | 0 | if (! $self->{'global_status'}->{$url}) { | ||||
1431 | 0 | $self->{'global_status'}->{$url} = { 'status-time' => time() }; | |||||
1432 | } | ||||||
1433 | 0 | return $self->{'global_status'}->{$url}; | |||||
1434 | } | ||||||
1435 | |||||||
1436 | # $resp is a HTTP::Response object from retrieving $url. | ||||||
1437 | # Optional $twig is an XML::Twig. | ||||||
1438 | # Record against $url any ETag, Last-Modified and ttl from $resp and $twig. | ||||||
1439 | # If $resp is an error return, or is undef, then do nothing. | ||||||
1440 | sub status_etagmod_resp { | ||||||
1441 | 0 | 0 | 0 | my ($self, $url, $resp, $twig) = @_; | |||
1442 | 0 | 0 | 0 | if ($resp && $resp->is_success) { | |||
1443 | 0 | my $status = $self->status_geturl ($url); | |||||
1444 | 0 | $status->{'Last-Modified'} = $resp->header('Last-Modified'); | |||||
1445 | 0 | $status->{'ETag'} = $resp->header('ETag'); | |||||
1446 | 0 | $status->{'timingfields'} = $self->twig_to_timingfields ($twig); | |||||
1447 | |||||||
1448 | 0 | 0 | 0 | if (! defined $status->{'ETag'} && ! defined $status->{'Last-Modified'}) { | |||
1449 | 0 | $self->verbose (1, " no ETag or Last-Modified"); | |||||
1450 | } | ||||||
1451 | 0 | 0 | if (defined (my $comments_count = $self->{'comments_count'})) { | ||||
1452 | 0 | $status->{'comments_count'} = $comments_count; | |||||
1453 | } | ||||||
1454 | |||||||
1455 | 0 | 0 | if ($twig) { | ||||
1456 | # record previously applied newest option | ||||||
1457 | 0 | $status->{'rss_newest_only'} = $self->{'rss_newest_only'}; | |||||
1458 | |||||||
1459 | # if (rss_newest_cmp($self,$status) > 0) { | ||||||
1460 | # # the newest number increases | ||||||
1461 | # } | ||||||
1462 | } | ||||||
1463 | 0 | foreach my $key (keys %$status) { | |||||
1464 | 0 | 0 | if (! defined $status->{$key}) { delete $status->{$key} } | ||||
0 | |||||||
1465 | } | ||||||
1466 | 0 | $self->status_save($status); | |||||
1467 | } | ||||||
1468 | } | ||||||
1469 | |||||||
1470 | # update recorded status for a $url with unchanged contents | ||||||
1471 | sub status_unchanged { | ||||||
1472 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
1473 | 0 | $self->verbose (1, ' ', __('unchanged')); | |||||
1474 | 0 | $self->status_save ($self->status_geturl ($url)); | |||||
1475 | } | ||||||
1476 | |||||||
1477 | # $req is a HTTP::Request object. | ||||||
1478 | # Add "If-None-Match" and/or "If-Modified-Since" headers to it based on what | ||||||
1479 | # the status file has recorded from when we last fetched the url in $req. | ||||||
1480 | # Return 1 to download, 0 if nothing expected yet by RSS timing fields | ||||||
1481 | # | ||||||
1482 | sub status_etagmod_req { | ||||||
1483 | 0 | 0 | 0 | my ($self, $req, $for_rss) = @_; | |||
1484 | 0 | 0 | $self->{'global_status'} or $self->status_read; | ||||
1485 | |||||||
1486 | 0 | my $url = $req->uri->as_string; | |||||
1487 | my $status = $self->{'global_status'}->{$url} | ||||||
1488 | 0 | 0 | // do { | ||||
1489 | 0 | $self->verbose (2, __x("no status info for {url}\n", url => $url)); | |||||
1490 | 0 | return 1; # want download | |||||
1491 | }; | ||||||
1492 | |||||||
1493 | 0 | 0 | if ($for_rss) { | ||||
1494 | # if status says the last download was for only a certain number of | ||||||
1495 | # newest, then force a re-download if that option now different | ||||||
1496 | 0 | 0 | if (! str_equal($self->{'rss_newest_only'}, | ||||
1497 | $status->{'rss_newest_only'})) { | ||||||
1498 | 0 | return 1; # want download | |||||
1499 | } | ||||||
1500 | } | ||||||
1501 | |||||||
1502 | 0 | 0 | if (my $timing = $self->timingfields_to_timing ($status->{'timingfields'})) { | ||||
1503 | 0 | my $next = $timing->nextUpdate; | |||||
1504 | 0 | my $now = time(); | |||||
1505 | 0 | 0 | if ($next > $now) { | ||||
1506 | 0 | $self->verbose (1, ' ', | |||||
1507 | __x('timing: next update {time} (local time)', | ||||||
1508 | time => POSIX::strftime ("%H:%M:%S %a %d %b %Y", | ||||||
1509 | localtime($next)))); | ||||||
1510 | 0 | 0 | 0 | if (eval 'use Time::Duration::Locale; 1' | |||
1511 | || eval 'use Time::Duration; 1') { | ||||||
1512 | 0 | $self->verbose (1, ' ', __x('which is {duration} from now', | |||||
1513 | duration => duration($next-$now))); | ||||||
1514 | } | ||||||
1515 | 0 | return 0; # no update yet | |||||
1516 | } | ||||||
1517 | } | ||||||
1518 | 0 | 0 | if (defined (my $lastmod = $status->{'Last-Modified'})) { | ||||
1519 | 0 | $req->header('If-Modified-Since' => $lastmod); | |||||
1520 | } | ||||||
1521 | 0 | 0 | if (defined (my $etag = $status->{'ETag'})) { | ||||
1522 | 0 | $req->header('If-None-Match' => $etag); | |||||
1523 | } | ||||||
1524 | 0 | return 1; | |||||
1525 | } | ||||||
1526 | |||||||
1527 | # return -1 if x |
||||||
1528 | # sub rss_newest_cmp { | ||||||
1529 | # my ($x, $y) = @_; | ||||||
1530 | # if ($x->{'rss_newest_only'}) { | ||||||
1531 | # if (! $y->{'rss_newest_only'}) { | ||||||
1532 | # return -1; # x finite, y infinite | ||||||
1533 | # } | ||||||
1534 | # # x and y finite | ||||||
1535 | # return ($x->{'rss_newest_only'} <=> $y->{'rss_newest_only'}); | ||||||
1536 | # } else { | ||||||
1537 | # # x infinite, so 1 if y finite, 0 if y infinite too | ||||||
1538 | # return !! $y->{'rss_newest_only'}; | ||||||
1539 | # } | ||||||
1540 | # } | ||||||
1541 | sub str_equal { | ||||||
1542 | 0 | 0 | 0 | my ($x, $y) = @_; | |||
1543 | 0 | 0 | return ((defined $x && defined $y && $x eq $y) | ||||
1544 | || (! defined $x && ! defined $y)); | ||||||
1545 | } | ||||||
1546 | |||||||
1547 | #------------------------------------------------------------------------------ | ||||||
1548 | # render html | ||||||
1549 | |||||||
1550 | # $content_type is a string like "text/html" or "text/plain". | ||||||
1551 | # $content is data as raw bytes. | ||||||
1552 | # $charset is the character set of those bytes, eg. "utf-8". | ||||||
1553 | # | ||||||
1554 | # If the $render option is set, and $content_type is 'text/html', then | ||||||
1555 | # render $content down to 'text/plain', using either HTML::FormatText or | ||||||
1556 | # Lynx. | ||||||
1557 | # The return is a new triplet ($content, $content_type, $charset). | ||||||
1558 | # | ||||||
1559 | sub render_maybe { | ||||||
1560 | 0 | 0 | 0 | my ($self, $content, $content_type, $charset, $base_url) = @_; | |||
1561 | 0 | my $rendered = 0; | |||||
1562 | 0 | 0 | 0 | if ($self->{'render'} && $content_type eq 'text/html') { | |||
1563 | |||||||
1564 | 0 | my $class = $self->{'render'}; | |||||
1565 | 0 | 0 | if ($class !~ /^HTML::/) { $class = "HTML::FormatText::\u$class"; } | ||||
0 | |||||||
1566 | 0 | $class =~ s/::1$//; # "::1" is $render=1 for plain HTML::FormatText | |||||
1567 | 0 | require Module::Load; | |||||
1568 | 0 | Module::Load::load ($class); | |||||
1569 | |||||||
1570 | 0 | 0 | if ($class =~ /^HTML::FormatText($|::WithLinks)/) { | ||||
1571 | # Believe HTML::FormatText (as of version 2.04) doesn't know much | ||||||
1572 | # about input or output charsets, but it can be tricked into getting | ||||||
1573 | # pretty reasonable results by putting wide chars through it. | ||||||
1574 | # Likewise HTML::FormatText::WithLinks (as of version 0.11). | ||||||
1575 | # | ||||||
1576 | # FIXME: decode() can error out on bad charset | ||||||
1577 | 0 | $content = Encode::decode ($charset, $content); | |||||
1578 | 0 | local $SIG{'__WARN__'} = \&_warn_suppress_unknown_base; | |||||
1579 | 0 | $content = $class->format_string | |||||
1580 | ($content, | ||||||
1581 | base => $base_url, | ||||||
1582 | leftmargin => 0, | ||||||
1583 | rightmargin => $self->{'render_width'}); | ||||||
1584 | 0 | $content = Encode::encode_utf8 ($content); | |||||
1585 | |||||||
1586 | } else { | ||||||
1587 | # HTML::FormatExternal style charset specs | ||||||
1588 | 0 | $content = $class->format_string | |||||
1589 | ($content, | ||||||
1590 | base => $base_url, | ||||||
1591 | leftmargin => 0, | ||||||
1592 | rightmargin => $self->{'render_width'}, | ||||||
1593 | input_charset => $charset, | ||||||
1594 | output_charset => 'utf-8'); | ||||||
1595 | } | ||||||
1596 | 0 | $charset = 'UTF-8'; | |||||
1597 | 0 | $content_type = 'text/plain'; | |||||
1598 | 0 | $rendered = 1; | |||||
1599 | } | ||||||
1600 | 0 | return ($content, $content_type, $charset, $rendered); | |||||
1601 | } | ||||||
1602 | sub _warn_suppress_unknown_base { | ||||||
1603 | 0 | 0 | my ($msg) = @_; | ||||
1604 | 0 | 0 | $msg =~ /^Unknown configure option 'base'/ | ||||
1605 | or warn $msg; | ||||||
1606 | } | ||||||
1607 | |||||||
1608 | # $str is a wide-char string of text | ||||||
1609 | sub text_wrap { | ||||||
1610 | 0 | 0 | 0 | my ($self, $str, $prefix) = @_; | |||
1611 | 0 | 0 | if (! defined $prefix) { $prefix = ''; } | ||||
0 | |||||||
1612 | 0 | require Text::WrapI18N; | |||||
1613 | 0 | local $Text::WrapI18N::columns = $self->{'render_width'} + 1; | |||||
1614 | 0 | local $Text::WrapI18N::unexpand = 0; # no tabs in output | |||||
1615 | 0 | local $Text::WrapI18N::huge = 'overflow'; # don't break long words | |||||
1616 | 0 | $str =~ tr/\n/ /; | |||||
1617 | 0 | 0 | my $second_prefix = (length($prefix) ? ' 'x(length($prefix)+2) : ''); | ||||
1618 | 0 | return Text::WrapI18N::wrap($prefix, $second_prefix, $str); | |||||
1619 | } | ||||||
1620 | |||||||
1621 | #------------------------------------------------------------------------------ | ||||||
1622 | # Face icons | ||||||
1623 | |||||||
1624 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1625 | # return a string value for the Face: header, or undef if no icon | ||||||
1626 | sub item_to_face { | ||||||
1627 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
1628 | 0 | 0 | $self->{'get_icon'} || return; | ||||
1629 | 0 | 0 | my ($uri, $width, $height) = $self->item_image_uwh ($item) | ||||
1630 | or return; | ||||||
1631 | 0 | 0 | $self->face_wh_ok ($width, $height) || return; | ||||
1632 | 0 | return $self->download_face ($uri, $width, $height); | |||||
1633 | } | ||||||
1634 | |||||||
1635 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
1636 | # return values ($uri, $width, $height) of the |
||||||
1637 | # | ||||||
1638 | sub item_image_uwh { | ||||||
1639 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
1640 | ### item_image_uwh() ... | ||||||
1641 | |||||||
1642 | 0 | foreach my $where ($item, | |||||
1643 | item_to_channel($item)) { | ||||||
1644 | ### image text: $where->first_child_text('image') | ||||||
1645 | |||||||
1646 | # identi.ca | ||||||
1647 | 0 | 0 | if (my $actor = $where->first_child('activity:actor')) { | ||||
1648 | 0 | my ($url, $width, $height); | |||||
1649 | 0 | foreach my $link_elt ($actor->children('link')) { | |||||
1650 | 0 | 0 | 0 | ($link_elt->att('rel')||$link_elt->att('atom:rel')||'') | |||
1651 | eq 'avatar' or next; | ||||||
1652 | 0 | 0 | $url = $link_elt->att('href') // $link_elt->att('atom:href') // next; | ||||
0 | |||||||
1653 | 0 | my $this_width = $link_elt->att('media:width'); | |||||
1654 | 0 | 0 | 0 | next if (defined $width | |||
0 | |||||||
1655 | && defined $this_width | ||||||
1656 | && $width < $this_width); # prefer smallest | ||||||
1657 | 0 | $url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($link_elt, $url); | |||||
1658 | 0 | 0 | $width = ($this_width || 0); | ||||
1659 | 0 | 0 | $height = ($link_elt->att('media:height') || 0); | ||||
1660 | ### $url | ||||||
1661 | ### $width | ||||||
1662 | ### $height | ||||||
1663 | } | ||||||
1664 | 0 | 0 | if (defined $url) { | ||||
1665 | 0 | return ($url, $width, $height); | |||||
1666 | } | ||||||
1667 | } | ||||||
1668 | |||||||
1669 | # RSS | ||||||
1670 | # |
||||||
1671 | # |
||||||
1672 | # |
||||||
1673 | # |
||||||
1674 | # | ||||||
1675 | 0 | 0 | if (my $image_elt = $where->first_child('image')) { | ||||
1676 | 0 | my $url_elt; # XML::Twig::Elt where the url came from | |||||
1677 | my $url; # url string | ||||||
1678 | 0 | 0 | if ($url_elt = $image_elt->first_child('url')) { | ||||
1679 | 0 | $url = $url_elt->trimmed_text; | |||||
1680 | } else { | ||||||
1681 | # Cooper Hewitt museum http://blog.cooperhewitt.org/rss/?limit=10 | ||||||
1682 | # item |
||||||
1683 | # |
||||||
1684 | # ]]> | ||||||
1685 | # | ||||||
1686 | # don't want to encourage dodginess like this, but picking it out | ||||||
1687 | # isn't too hard | ||||||
1688 | 0 | 0 | if ($image_elt->text =~ /]*\ssrc="([^"]*)/) { | ||||
1689 | ### image from html: $1 | ||||||
1690 | 0 | $url_elt = $image_elt; | |||||
1691 | 0 | $url = $1; | |||||
1692 | } | ||||||
1693 | } | ||||||
1694 | 0 | 0 | if (is_non_empty ($url)) { | ||||
1695 | 0 | my $width = $image_elt->first_child_text('width'); | |||||
1696 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
1697 | 0 | $width = 0; | |||||
1698 | } | ||||||
1699 | 0 | my $height = $image_elt->first_child_text('height'); | |||||
1700 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
1701 | 0 | $height = 0; | |||||
1702 | } | ||||||
1703 | ### item_image_uwh() RSS: $url | ||||||
1704 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($url_elt, $url), | |||||
1705 | $width, $height); | ||||||
1706 | } | ||||||
1707 | } | ||||||
1708 | |||||||
1709 | # Atom channel |
||||||
1710 | # or channel |
||||||
1711 | # | ||||||
1712 | # |
||||||
1713 | # is bigger than the RSS 48x48, so would probably need shrinking. Rate | ||||||
1714 | # it below |
||||||
1715 | # | ||||||
1716 | # |
||||||
1717 | # Is it better to show the channel icon, being the From person? | ||||||
1718 | { | ||||||
1719 | 0 | my $elt; | |||||
0 | |||||||
1720 | 0 | my ($width, $height); | |||||
1721 | my $url = ((($elt = $where->first_child('icon')) | ||||||
1722 | && non_empty ($elt->text)) | ||||||
1723 | || (($elt = $where->first_child('logo')) | ||||||
1724 | && non_empty ($elt->text)) | ||||||
1725 | || (($elt = $where->first_child('itunes:image')) | ||||||
1726 | && non_empty ($elt->att('href'))) | ||||||
1727 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1728 | && is_non_empty ($elt->att('url')) | ||||||
1729 | && do { | ||||||
1730 | $width = $elt->att('width'); | ||||||
1731 | $height = $elt->att('height'); | ||||||
1732 | $elt->att('url') }) | ||||||
1733 | # seen att('atom:url' rather than plain 'url' ... | ||||||
1734 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
1735 | && is_non_empty ($elt->att('atom:url')) | ||||||
1736 | 0 | 0 | && do { | ||||
1737 | $width = $elt->att('width'); | ||||||
1738 | $height = $elt->att('height'); | ||||||
1739 | $elt->att('atom:url') })); | ||||||
1740 | ### $url | ||||||
1741 | 0 | 0 | if ($url) { | ||||
1742 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
1743 | 0 | $width = 0; | |||||
1744 | } | ||||||
1745 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
1746 | 0 | $height = 0; | |||||
1747 | } | ||||||
1748 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
1749 | $width, | ||||||
1750 | $height); | ||||||
1751 | } | ||||||
1752 | } | ||||||
1753 | |||||||
1754 | # status.net for rss 1.0 | ||||||
1755 | # |
||||||
1756 | 0 | 0 | if (my $elt = $where->first_child('statusnet:postIcon')) { | ||||
1757 | 0 | 0 | if (is_non_empty (my $url = $elt->att('rdf:resource'))) { | ||||
1758 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
1759 | 0, 0); # unknown size | ||||||
1760 | } | ||||||
1761 | } | ||||||
1762 | |||||||
1763 | # |
||||||
1764 | # eg. from blogger.com | ||||||
1765 | # |
||||||
1766 | { | ||||||
1767 | 0 | my $elt; | |||||
0 | |||||||
1768 | 0 | 0 | 0 | if (($elt = $where->first_child('author')) | |||
0 | |||||||
0 | |||||||
1769 | && ($elt = $elt->first_child('gd:image')) | ||||||
1770 | && (is_non_empty (my $url = $elt->att('src') // $elt->att('atom:src')))) { | ||||||
1771 | ### $url | ||||||
1772 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | ||||
0 | |||||||
1773 | $elt->att('width') || $elt->att('atom:width') || 0, | ||||||
1774 | $elt->att('height') || $elt->att('atom:height') || 0); | ||||||
1775 | } | ||||||
1776 | } | ||||||
1777 | } | ||||||
1778 | 0 | return; | |||||
1779 | } | ||||||
1780 | @known{qw(/channel/logo | ||||||
1781 | /channel/icon | ||||||
1782 | /channel/image | ||||||
1783 | /channel/image/url | ||||||
1784 | /channel/image/width | ||||||
1785 | /channel/image/height | ||||||
1786 | /channel/image/title | ||||||
1787 | /channel/image/link | ||||||
1788 | /channel/image/description | ||||||
1789 | /channel/itunes:image | ||||||
1790 | /channel/statusnet:postIcon | ||||||
1791 | |||||||
1792 | /channel/item/image | ||||||
1793 | /channel/item/media:thumbnail | ||||||
1794 | /channel/item/statusnet:postIcon | ||||||
1795 | )} = (); | ||||||
1796 | |||||||
1797 | # $resp is a HTTP::Response | ||||||
1798 | # return a string value for the Face: header, or undef if no icon | ||||||
1799 | sub http_resp_to_face { | ||||||
1800 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
1801 | 0 | 0 | $self->{'get_icon'} || return; | ||||
1802 | |||||||
1803 | 0 | 0 | my $uri = http_resp_favicon_uri($resp) || return; | ||||
1804 | 0 | $self->verbose (2, ' response favicon URI: ', $uri); | |||||
1805 | 0 | return $self->download_face ($uri, 0, 0); | |||||
1806 | } | ||||||
1807 | |||||||
1808 | # $resp is a HTTP::Response | ||||||
1809 | # if it's a html with a favicon link return a URI object of that image | ||||||
1810 | # | ||||||
1811 | # http://www.w3.org/2005/10/howto-favicon | ||||||
1812 | # | ||||||
1813 | sub http_resp_favicon_uri { | ||||||
1814 | 0 | 0 | 0 | my ($resp) = @_; | |||
1815 | 0 | 0 | $resp->headers->content_is_html || return; | ||||
1816 | 0 | require HTML::Parser; | |||||
1817 | 0 | my $href; | |||||
1818 | my $p; | ||||||
1819 | $p = HTML::Parser->new (api_version => 3, | ||||||
1820 | start_h => [ sub { | ||||||
1821 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
1822 | 0 | 0 | 0 | if ($tagname eq 'link' | |||
1823 | && $attr->{'rel'} eq 'icon') { | ||||||
1824 | 0 | $href = $attr->{'href'}; | |||||
1825 | 0 | $p->eof; | |||||
1826 | } | ||||||
1827 | 0 | }, "tagname, attr"]); | |||||
1828 | 0 | $resp->decode; | |||||
1829 | 0 | $p->parse ($resp->content); | |||||
1830 | 0 | 0 | return $href && URI->new_abs ($href, $resp->base); | ||||
1831 | } | ||||||
1832 | |||||||
1833 | # return base64 string value for "Face:" header | ||||||
1834 | sub download_face { | ||||||
1835 | 0 | 0 | 0 | my ($self, $uri, $width, $height) = @_; | |||
1836 | 0 | my $key = $uri->canonical->as_string; | |||||
1837 | 0 | 0 | if (! exists $self->{'download_face'}->{$key}) { | ||||
1838 | 0 | $self->{'download_face'}->{$key} | |||||
1839 | = $self->download_face_uncached ($uri, $width, $height); | ||||||
1840 | } | ||||||
1841 | 0 | return $self->{'download_face'}->{$key}; | |||||
1842 | } | ||||||
1843 | sub download_face_uncached { | ||||||
1844 | 0 | 0 | 0 | my ($self, $url, $width, $height) = @_; | |||
1845 | |||||||
1846 | 0 | $self->{'download_face_uncached'} = $url; | |||||
1847 | 0 | $self->verbose (1, ' image download: ', $url); | |||||
1848 | |||||||
1849 | 0 | require HTTP::Request; | |||||
1850 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
1851 | 0 | my $resp = $self->ua->request($req); | |||||
1852 | 0 | 0 | if (! $resp->is_success) { | ||||
1853 | 0 | print __x(" no image: {status}\n", | |||||
1854 | status => $resp->status_line); | ||||||
1855 | 0 | return; | |||||
1856 | } | ||||||
1857 | |||||||
1858 | 0 | my $type = $resp->content_type; | |||||
1859 | ### $type | ||||||
1860 | # FIXME: is mime=>$type the right way? could give it a look at the url | ||||||
1861 | # basename or server's suggested filename too, for Read() to use the | ||||||
1862 | # extension. | ||||||
1863 | 0 | 0 | 0 | if ($type eq 'image/vnd.microsoft.icon' || $type eq 'image/x-icon') { | |||
0 | |||||||
1864 | # mime.xml of imagemagick 6.6.0 only has "image/x-ico", and nothing for | ||||||
1865 | # ico in magic.xml | ||||||
1866 | 0 | $type = 'ico'; | |||||
1867 | } elsif ($type =~ m{^image/(.*)$}i) { | ||||||
1868 | 0 | $type = $1; | |||||
1869 | } else { | ||||||
1870 | 0 | $self->verbose (2, 'ignore non-image icon type: ',$type); | |||||
1871 | 0 | return; | |||||
1872 | } | ||||||
1873 | |||||||
1874 | 0 | $resp->decode; | |||||
1875 | 0 | my $data = $resp->content; | |||||
1876 | 0 | 0 | 0 | if ($type ne 'png' | |||
0 | |||||||
0 | |||||||
0 | |||||||
1877 | || $width == 0 || $height == 0 | ||||||
1878 | || $width > 48 || $height > 48) { | ||||||
1879 | 0 | 0 | $data = $self->imagemagick_to_png($type,$data) // return; | ||||
1880 | } | ||||||
1881 | 0 | $self->verbose (2, " image for Face ",length($data)," bytes"); | |||||
1882 | |||||||
1883 | # use a space as a separator since MIME::Entity will collapse out a | ||||||
1884 | # newline and make an enormous long word which then can't be split across | ||||||
1885 | # header lines and will likely exceed the nntp 998 char single-line limit | ||||||
1886 | 0 | require MIME::Base64; | |||||
1887 | 0 | $data = MIME::Base64::encode_base64($data, " "); | |||||
1888 | ### $data | ||||||
1889 | |||||||
1890 | 0 | return $data; | |||||
1891 | } | ||||||
1892 | |||||||
1893 | sub face_wh_ok { | ||||||
1894 | 0 | 0 | 0 | my ($self, $width, $height) = @_; | |||
1895 | |||||||
1896 | 0 | 0 | 0 | if ($width > 0 && $width > 2*$height) { | |||
1897 | # some obnoxious banner | ||||||
1898 | 0 | $self->verbose (1, ' ', | |||||
1899 | __x('image is a banner ({width}x{height}), ignore', | ||||||
1900 | width => $width, height => $height)); | ||||||
1901 | 0 | return 0; | |||||
1902 | } | ||||||
1903 | 0 | return 1; | |||||
1904 | } | ||||||
1905 | |||||||
1906 | #------------------------------------------------------------------------------ | ||||||
1907 | # ImageMagick bits | ||||||
1908 | |||||||
1909 | # $type is "gif", "ico" etc, $data is an image in a byte string | ||||||
1910 | # return a byte string of png, or undef if $data unrecognised | ||||||
1911 | sub imagemagick_to_png { | ||||||
1912 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
1913 | ### $type | ||||||
1914 | 0 | 0 | my $image = $self->imagemagick_from_data($type,$data) // return; | ||||
1915 | |||||||
1916 | 0 | my $width = $image->Get('width'); | |||||
1917 | 0 | my $height = $image->Get('height'); | |||||
1918 | ### compress: $image->Get('compression') | ||||||
1919 | 0 | $self->verbose (2, " image ${width}x${height}"); | |||||
1920 | 0 | 0 | 0 | if ($width == 0 || $height == 0) { | |||
1921 | 0 | return; | |||||
1922 | } | ||||||
1923 | 0 | 0 | 0 | if ($width <= 48 && $height <= 48 && $type eq 'png') { | |||
0 | |||||||
1924 | 0 | return $data; | |||||
1925 | } | ||||||
1926 | |||||||
1927 | # having downloaded the image is it better to keep a banner but shrink it, | ||||||
1928 | # or discard as no good? | ||||||
1929 | # | ||||||
1930 | # $self->face_wh_ok ($width, $height) || return; | ||||||
1931 | |||||||
1932 | 0 | 0 | 0 | if ($width > 48 || $height > 48) { | |||
1933 | 0 | my $factor; | |||||
1934 | 0 | 0 | 0 | if ($width <= 2*48 && $height <= 2*48) { | |||
1935 | 0 | $factor = 0.5; | |||||
1936 | } else { | ||||||
1937 | 0 | $factor = min (48 / $width, 48 / $height); | |||||
1938 | } | ||||||
1939 | 0 | $width = POSIX::ceil ($width * $factor); | |||||
1940 | 0 | $height = POSIX::ceil ($height * $factor); | |||||
1941 | 0 | $self->verbose (2, " image shrink by $factor to ${width}x${height}"); | |||||
1942 | # cf LiquidResize() or plain Resize() | ||||||
1943 | 0 | $image->AdaptiveResize (width => $width, height => $height); | |||||
1944 | } | ||||||
1945 | |||||||
1946 | 0 | my $ret = $image->Set (magick => 'PNG8'); | |||||
1947 | ### ret: "$ret" | ||||||
1948 | ### ret: $ret+0 | ||||||
1949 | 0 | 0 | if ($ret != 0) { | ||||
1950 | 0 | print "oops, imagemagick doesn't like PNG8: $ret\n"; | |||||
1951 | 0 | return; | |||||
1952 | } | ||||||
1953 | ### compress: $image->Get('compression') | ||||||
1954 | |||||||
1955 | # $image->Write ('/tmp/x.png'); | ||||||
1956 | 0 | ($data) = $image->ImageToBlob (); | |||||
1957 | 0 | return $data; | |||||
1958 | } | ||||||
1959 | |||||||
1960 | |||||||
1961 | # $type is "png", "ico" etc, $data is an image in a byte string | ||||||
1962 | # return a Image::Magick object, or undef if Perl-Magick not available | ||||||
1963 | sub imagemagick_from_data { | ||||||
1964 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
1965 | ### imagemagick_from_data(): $type | ||||||
1966 | 0 | 0 | eval { require Image::Magick } or return; | ||||
0 | |||||||
1967 | |||||||
1968 | 0 | my $image = Image::Magick->new (magick=>$type); | |||||
1969 | # $image->Set(debug=>'All'); | ||||||
1970 | 0 | my $ret = $image->BlobToImage ($data); | |||||
1971 | ### ret: "$ret" | ||||||
1972 | ### ret: $ret+0 | ||||||
1973 | 0 | 0 | if ($ret == 1) { | ||||
1974 | 0 | return $image; | |||||
1975 | } | ||||||
1976 | |||||||
1977 | # try again without the $type forced, in case bad Content-Type from http | ||||||
1978 | 0 | $image = Image::Magick->new; | |||||
1979 | # $image->Set(debug=>'All'); | ||||||
1980 | 0 | $ret = $image->BlobToImage ($data); | |||||
1981 | ### ret: "$ret" | ||||||
1982 | ### ret: $ret+0 | ||||||
1983 | 0 | 0 | if ($ret == 1) { | ||||
1984 | 0 | return $image; | |||||
1985 | } | ||||||
1986 | |||||||
1987 | 0 | print __x(" imagemagick doesn't like image data ({length} bytes) from {url}: {error}\n", | |||||
1988 | length => length($data), | ||||||
1989 | url => $self->{'download_face_uncached'}, | ||||||
1990 | error => $ret); | ||||||
1991 | 0 | return undef; | |||||
1992 | } | ||||||
1993 | |||||||
1994 | |||||||
1995 | #------------------------------------------------------------------------------ | ||||||
1996 | # XML::Liberal | ||||||
1997 | |||||||
1998 | use constant::defer have_xml_liberal => sub { | ||||||
1999 | 0 | 0 | my ($self) = @_; | ||||
2000 | 0 | 0 | 0 | if (eval { require XML::Liberal; 1 }) { | |||
0 | 0 | ||||||
0 | 0 | ||||||
2001 | 0 | 0 | return 1; | ||||
2002 | } | ||||||
2003 | 0 | 0 | $self->verbose (3, __x('XML::Liberal not available: {error}', error => $@)); | ||||
2004 | 0 | 0 | return 0; | ||||
2005 | 1 | 1 | 7 | }; | |||
1 | 2 | ||||||
1 | 9 | ||||||
2006 | |||||||
2007 | # try to correct $xmlstr | ||||||
2008 | # if successful return a new xml string, otherwise return undef | ||||||
2009 | sub xml_liberal_correction { | ||||||
2010 | 0 | 0 | 0 | my ($self, $xmlstr) = @_; | |||
2011 | 0 | 0 | $self->have_xml_liberal or return; | ||||
2012 | |||||||
2013 | ### try XML-Liberal ... | ||||||
2014 | 0 | my $liberal = XML::Liberal->new('LibXML'); | |||||
2015 | 0 | 0 | if (my $doc = eval { $liberal->parse_string($xmlstr) }) { | ||||
0 | |||||||
2016 | 0 | return $doc->toString; | |||||
2017 | } else { | ||||||
2018 | 0 | $self->verbose (2, __x('XML::Liberal parse error: {error}', error => $@)); | |||||
2019 | 0 | return undef; | |||||
2020 | } | ||||||
2021 | } | ||||||
2022 | |||||||
2023 | |||||||
2024 | #------------------------------------------------------------------------------ | ||||||
2025 | # error as news message | ||||||
2026 | |||||||
2027 | sub error_message { | ||||||
2028 | 0 | 0 | 0 | my ($self, $subject, $message, $attach_bytes) = @_; | |||
2029 | |||||||
2030 | 0 | require Encode; | |||||
2031 | 0 | my $charset = 'utf-8'; | |||||
2032 | 0 | $message = str_ensure_newline ($message); | |||||
2033 | 0 | $message = Encode::encode ($charset, $message, Encode::FB_DEFAULT()); | |||||
2034 | |||||||
2035 | 0 | my $date = rfc822_time_now(); | |||||
2036 | 0 | require Digest::MD5; | |||||
2037 | 0 | my $msgid = $self->url_to_msgid | |||||
2038 | ('http://localhost', | ||||||
2039 | Digest::MD5::md5_base64 ($date.$subject.$message)); | ||||||
2040 | |||||||
2041 | 0 | my $top = $self->mime_build | |||||
2042 | ({ | ||||||
2043 | 'Path:' => 'localhost', | ||||||
2044 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
2045 | From => __('RSS2Leafnode').' |
||||||
2046 | Subject => $subject, | ||||||
2047 | Date => $date, | ||||||
2048 | 'Message-ID' => $msgid, | ||||||
2049 | }, | ||||||
2050 | Top => 1, | ||||||
2051 | Type => 'text/plain', | ||||||
2052 | Charset => $charset, | ||||||
2053 | Data => $message); | ||||||
2054 | |||||||
2055 | 0 | 0 | if (defined $attach_bytes) { | ||||
2056 | 0 | $top->make_multipart; | |||||
2057 | 0 | my $part = $self->mime_build | |||||
2058 | ({}, | ||||||
2059 | Charset => 'none', | ||||||
2060 | Type => 'application/octet-stream', | ||||||
2061 | Data => $attach_bytes); | ||||||
2062 | 0 | $top->add_part ($part); | |||||
2063 | } | ||||||
2064 | |||||||
2065 | 0 | mime_entity_lines($top); | |||||
2066 | 0 | 0 | $self->nntp_post($top) || return; | ||||
2067 | 0 | say __x('{group} 1 new article', group => $self->{'nntp_group'}); | |||||
2068 | } | ||||||
2069 | |||||||
2070 | |||||||
2071 | #------------------------------------------------------------------------------ | ||||||
2072 | # fetch HTML | ||||||
2073 | |||||||
2074 | sub http_resp_to_from { | ||||||
2075 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2076 | ### http_resp_to_from() | ||||||
2077 | 0 | 0 | return $self->http_resp_exiftool_author($resp) | ||||
2078 | // 'nobody@'.$self->uri_to_host; | ||||||
2079 | } | ||||||
2080 | sub http_resp_exiftool_author { | ||||||
2081 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2082 | # PNG Author field, or HTML author | ||||||
2083 | 0 | 0 | my $author = resp_exiftool_info($resp)->{'Author'} // return; | ||||
2084 | 0 | return $self->email_format_maybe (Encode::decode_utf8($author), '', undef); | |||||
2085 | } | ||||||
2086 | |||||||
2087 | sub http_resp_to_copyright { | ||||||
2088 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2089 | ### http_http_resp_to_copyright() ... | ||||||
2090 | |||||||
2091 | 0 | my @copyrights = non_empty($resp->header('X-Meta-Copyright')); | |||||
2092 | 0 | 0 | unless ($resp->content_type =~ m{^text/}) { | ||||
2093 | # PNG Copyright field, perhaps other formats | ||||||
2094 | 0 | push @copyrights, non_empty(resp_exiftool_info($resp)->{'Copyright'}); | |||||
2095 | } | ||||||
2096 | 0 | return \@copyrights; | |||||
2097 | } | ||||||
2098 | |||||||
2099 | # return a "Keywords:" string, or undef if nothing | ||||||
2100 | sub http_resp_to_keywords { | ||||||
2101 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2102 | ### http_resp_to_keywords() ... | ||||||
2103 | |||||||
2104 | 0 | my @keywords = $resp->header('X-Meta-Keywords'); | |||||
2105 | |||||||
2106 | 0 | 0 | if ($resp->headers->content_is_html) { | ||||
2107 | 0 | $resp->decode; | |||||
2108 | 0 | require HTML::Parser; | |||||
2109 | my $p = HTML::Parser->new | ||||||
2110 | (api_version => 3, | ||||||
2111 | report_tags => ['meta'], | ||||||
2112 | start_h => [ sub { | ||||||
2113 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
2114 | # facebook thing | ||||||
2115 | 0 | 0 | 0 | if ($tagname eq 'meta' | |||
0 | |||||||
2116 | && lc($attr->{'property'}||'') eq 'og:type') { | ||||||
2117 | 0 | push @keywords, $attr->{'content'}; | |||||
2118 | } | ||||||
2119 | |||||||
2120 | 0 | }, "tagname, attr" ]); | |||||
2121 | 0 | $p->parse ($resp->decoded_content); | |||||
2122 | } | ||||||
2123 | ### @keywords | ||||||
2124 | |||||||
2125 | 0 | return join_non_empty | |||||
2126 | 0 | (', ', List::MoreUtils::uniq(map {collapse_whitespace($_)} | |||||
2127 | @keywords)); | ||||||
2128 | } | ||||||
2129 | |||||||
2130 | sub fetch_html { | ||||||
2131 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
2132 | ### fetch_html() ... | ||||||
2133 | |||||||
2134 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
0 | |||||||
2135 | 0 | $self->verbose (1, __x('page: {url}', url => $url)); | |||||
2136 | |||||||
2137 | 0 | my $group_uri = URI->new($group,'news'); | |||||
2138 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
2139 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
2140 | 0 | 0 | $self->nntp_group_check($group) or return; | ||||
2141 | |||||||
2142 | 0 | require HTTP::Request; | |||||
2143 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
2144 | 0 | $self->status_etagmod_req ($req); | |||||
2145 | 0 | my $resp = $self->ua->request($req); | |||||
2146 | 0 | 0 | if ($resp->code == 304) { | ||||
2147 | 0 | $self->status_unchanged ($url); | |||||
2148 | 0 | return; | |||||
2149 | } | ||||||
2150 | 0 | 0 | if (! $resp->is_success) { | ||||
2151 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
2152 | url => $url, | ||||||
2153 | status => $resp->status_line); | ||||||
2154 | 0 | return; | |||||
2155 | } | ||||||
2156 | 0 | $self->verbose (2, $resp->headers->as_string); | |||||
2157 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
2158 | |||||||
2159 | # message id is either the etag if present, or an md5 of the content if not | ||||||
2160 | my $msgid = $self->url_to_msgid | ||||||
2161 | ($url, | ||||||
2162 | 0 | 0 | $resp->header('ETag') // do { | ||||
2163 | 0 | require Digest::MD5; | |||||
2164 | 0 | $resp->decode; | |||||
2165 | 0 | my $content = $resp->content; | |||||
2166 | 0 | Digest::MD5::md5_base64($content) | |||||
2167 | }); | ||||||
2168 | 0 | 0 | return 0 if $self->nntp_message_id_exists ($msgid); | ||||
2169 | |||||||
2170 | 0 | 0 | my $subject = (html_title($resp) | ||||
0 | |||||||
2171 | // $resp->filename | ||||||
2172 | # show original url in subject, not anywhere redirected | ||||||
2173 | // __x('RSS2Leafnode {url}', url => $url)); | ||||||
2174 | |||||||
2175 | 0 | my $from = $self->http_resp_to_from($resp); | |||||
2176 | 0 | my $date = $resp->header('Last-Modified'); | |||||
2177 | 0 | my $face = $self->http_resp_to_face($resp); | |||||
2178 | 0 | my $copyright = $self->http_resp_to_copyright($resp); | |||||
2179 | 0 | my $keywords = $self->http_resp_to_keywords($resp); | |||||
2180 | |||||||
2181 | 0 | my $part = $self->http_resp_extract_main($resp); | |||||
2182 | |||||||
2183 | 0 | my $top = $self->mime_part_from_response | |||||
2184 | ($resp, | ||||||
2185 | Top => 1, | ||||||
2186 | 'Path:' => scalar($self->uri_to_host), | ||||||
2187 | 'Newsgroups:' => $group, | ||||||
2188 | From => $from, | ||||||
2189 | Subject => $subject, | ||||||
2190 | Date => $date, | ||||||
2191 | 'Message-ID' => $msgid, | ||||||
2192 | Keywords => $keywords, | ||||||
2193 | 'Face:' => $face, | ||||||
2194 | 'X-Copyright:' => $copyright); | ||||||
2195 | 0 | 0 | if ($part) { | ||||
2196 | ### attach full part ... | ||||||
2197 | 0 | $top->make_multipart; | |||||
2198 | 0 | $top->add_part ($part); | |||||
2199 | } | ||||||
2200 | |||||||
2201 | 0 | mime_entity_lines($top); | |||||
2202 | 0 | 0 | $self->nntp_post($top) || return; | ||||
2203 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
2204 | 0 | say __x("{group} 1 new article", group => $group); | |||||
2205 | } | ||||||
2206 | |||||||
2207 | # $resp is a HTTP::Response | ||||||
2208 | # If the $self->{'html_extract_main'} option is true and $resp is html then | ||||||
2209 | # resplace the $resp content with HTML::ExtractMain extracted part. | ||||||
2210 | # | ||||||
2211 | sub http_resp_extract_main { | ||||||
2212 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
2213 | |||||||
2214 | 0 | 0 | $self->{'html_extract_main'} or return; | ||||
2215 | 0 | 0 | $resp->headers->content_is_html() or return; | ||||
2216 | |||||||
2217 | 0 | 0 | my $full_part | ||||
2218 | = (defined $self->{'html_extract_main'} | ||||||
2219 | && $self->{'html_extract_main'} eq 'attach_full' | ||||||
2220 | && $self->mime_part_from_response($resp, | ||||||
2221 | Disposition => "attachment")); | ||||||
2222 | |||||||
2223 | 0 | require HTML::ExtractMain; | |||||
2224 | 0 | HTML::ExtractMain->VERSION(0.63); # for output_type=>'html' | |||||
2225 | 0 | $resp->decode; # expand any compression | |||||
2226 | 0 | my $content = $resp->decoded_content; # as wide-chars | |||||
2227 | |||||||
2228 | # Output type 'html' differs from the default xhtml by a few entities, in | ||||||
2229 | # particular it avoids ' which is an xml-ism not in the html standards. | ||||||
2230 | # Various browsers support &apos anyway, but not for example by w3m. | ||||||
2231 | 0 | $content = HTML::ExtractMain::extract_main_html($content, | |||||
2232 | output_type => 'html'); | ||||||
2233 | 0 | 0 | if (! defined $content) { | ||||
2234 | 0 | $self->verbose(1, __(" HTML::ExtractMain no main part found, posting whole")); | |||||
2235 | 0 | return; | |||||
2236 | } | ||||||
2237 | ### main extracted: $content | ||||||
2238 | 0 | $resp->remove_header('Content-MD5'); # since changed content | |||||
2239 | 0 | my $charset = $resp->content_charset; | |||||
2240 | 0 | $content = Encode::encode ($charset, $content); | |||||
2241 | 0 | $resp->content($content); | |||||
2242 | |||||||
2243 | 0 | return $full_part; | |||||
2244 | } | ||||||
2245 | |||||||
2246 | #------------------------------------------------------------------------------ | ||||||
2247 | # RSS hacks | ||||||
2248 | |||||||
2249 | # This is a hack for Yahoo Finance feed uniqification. | ||||||
2250 | # $item is a feed hashref. If it has 'link' field with a yahoo.com | ||||||
2251 | # redirection like | ||||||
2252 | # | ||||||
2253 | # http://au.rd.yahoo.com/finance/news/rss/financenews/*http://au.biz.yahoo.com/071003/30/1fdvx.html | ||||||
2254 | # | ||||||
2255 | # then return the last target url part. Otherwise return false. | ||||||
2256 | # | ||||||
2257 | # This allows the item to be identified by its final target link, so as to | ||||||
2258 | # avoid duplication when the item appears in multiple yahoo feeds with a | ||||||
2259 | # different leading part. (There's no guid in yahoo feeds, as of Oct 2007.) | ||||||
2260 | # | ||||||
2261 | sub item_yahoo_permalink { | ||||||
2262 | 0 | 0 | 0 | my ($item) = @_; | |||
2263 | 0 | 0 | my $url = $item->first_child_text('link') | ||||
2264 | // return undef; | ||||||
2265 | 0 | 0 | $url =~ m{^http://[^/]*yahoo\.com/.*\*(http://.*yahoo\.com.*)$} | ||||
2266 | or return undef; | ||||||
2267 | 0 | return $1; | |||||
2268 | } | ||||||
2269 | |||||||
2270 | # This is a special case for Google Groups RSS feeds. | ||||||
2271 | # The arguments are link elements [$name,$uri]. If there's a google groups | ||||||
2272 | # like "http://groups.google.com/group/cfcdev/msg/445d4ccfdabf086b" then | ||||||
2273 | # return a mailing list address like "cfcdev@googlegroups.com". If not in | ||||||
2274 | # that form then return undef. | ||||||
2275 | # | ||||||
2276 | sub googlegroups_link_email { | ||||||
2277 | ## no critic (RequireInterpolationOfMetachars) | ||||||
2278 | 0 | 0 | 0 | foreach my $l (@_) { | |||
2279 | 0 | 0 | 0 | if ($l->{'uri'} | |||
2280 | && $l->{'uri'}->canonical =~ m{^http://groups\.google\.com/group/([^/]+)/}) { | ||||||
2281 | 0 | return ($1 . '@googlegroups.com'); | |||||
2282 | } | ||||||
2283 | } | ||||||
2284 | 0 | return undef; | |||||
2285 | } | ||||||
2286 | |||||||
2287 | # This is a nasty hack for http://www.aireview.com.au/rss.php | ||||||
2288 | # $url is a link url string just fetched, $resp is a HTTP::Response. The | ||||||
2289 | # return is a possibly new HTTP::Response object. | ||||||
2290 | # | ||||||
2291 | # The first fetch of an item link from aireview gives back content like | ||||||
2292 | # | ||||||
2293 | # | ||||||
2294 | # | ||||||
2295 | # plus some cookies in the headers. The URL "zz=1" in that line seems very | ||||||
2296 | # dodgy, it ends up going to the home page with mozilla. In any case a | ||||||
2297 | # fresh fetch of the link url with the cookies provided is enough to get the | ||||||
2298 | # actual content. | ||||||
2299 | # | ||||||
2300 | # The LWP::UserAgent::FramesReady module on cpan has a similar match of a | ||||||
2301 | # Refresh, for use with frames. It works by turning the response into a | ||||||
2302 | # "302 Moved temporarily" for LWP to follow. urlcheck.pl at | ||||||
2303 | # http://www.cpan.org/authors/id/P/PH/PHILMI/urlcheck-1.00.pl likewise | ||||||
2304 | # follows. But alas both obey the URL given in the , which is no good | ||||||
2305 | # here. | ||||||
2306 | # | ||||||
2307 | sub aireview_follow { | ||||||
2308 | 0 | 0 | 0 | my ($self, $url, $resp) = @_; | |||
2309 | |||||||
2310 | 0 | 0 | if ($resp->is_success) { | ||||
2311 | 0 | $resp->decode; | |||||
2312 | 0 | my $content = $resp->content; | |||||
2313 | 0 | 0 | if ($content =~ /]*Refresh[^>]*checkForCookies/i) { | ||||
2314 | 0 | $self->verbose (1, ' following aireview META Refresh with cookies'); | |||||
2315 | 0 | require HTTP::Request; | |||||
2316 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
2317 | 0 | $resp = $self->ua->request($req); | |||||
2318 | } | ||||||
2319 | } | ||||||
2320 | 0 | return $resp; | |||||
2321 | } | ||||||
2322 | |||||||
2323 | |||||||
2324 | #------------------------------------------------------------------------------ | ||||||
2325 | # RSS links | ||||||
2326 | |||||||
2327 | |||||||
2328 | # WordPress (http://wordpress.org/) pre 2.5 had a bug | ||||||
2329 | # http://core.trac.wordpress.org/ticket/6579 where it gave | ||||||
2330 | # type="appication/atom+xml" missing the "l" in "application/". | ||||||
2331 | # Don't want to workaround every bad generator, but this one is GPL | ||||||
2332 | # free and the past versions still found for instance in the | ||||||
2333 | # language log http://languagelog.ldc.upenn.edu/nll/ in Feb 2011 | ||||||
2334 | # | ||||||
2335 | sub mime_type_is_rss { | ||||||
2336 | 0 | 0 | 0 | my ($self, $type) = @_; | |||
2337 | 0 | return ($type =~ m{^appl?ication/atom\+xml$}); | |||||
2338 | } | ||||||
2339 | sub atom_link_is_rss { | ||||||
2340 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
2341 | 0 | 0 | my $type = $elt->att('atom:type') // $elt->att('type') // return 0; | ||||
0 | |||||||
2342 | 0 | return $self->mime_type_is_rss($type); | |||||
2343 | } | ||||||
2344 | |||||||
2345 | |||||||
2346 | |||||||
2347 | # return list of hashrefs, each being | ||||||
2348 | # { name => $str, | ||||||
2349 | # uri => $uri_object, | ||||||
2350 | # download => $boolean, | ||||||
2351 | # priority => $number, | ||||||
2352 | # } | ||||||
2353 | # | ||||||
2354 | # Links are listed from highest to lowest priority. The current priority | ||||||
2355 | # levels are | ||||||
2356 | # 0 plain links | ||||||
2357 | # -10 comment RSS | ||||||
2358 | # -20 author home page | ||||||
2359 | # -100 geo location text-only | ||||||
2360 | # -101 statusnet geo location | ||||||
2361 | # -200 | ||||||
2362 | # | ||||||
2363 | sub item_to_links { | ||||||
2364 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2365 | |||||||
2366 | # |
||||||
2367 | # something has been expanded into the item, or should it be shown? | ||||||
2368 | |||||||
2369 | # FIXME: |
||||||
2370 | # |
||||||
2371 | # Can have a |
||||||
2372 | # | ||||||
2373 | # ENHANCE-ME: |
||||||
2374 | # | ||||||
2375 | # FIXME: |
||||||
2376 | # formats etc. Have seen this from archive.org just duplicating | ||||||
2377 | # |
||||||
2378 | # | ||||||
2379 | # |
||||||
2380 | # case. | ||||||
2381 | # | ||||||
2382 | 0 | my @elts = $item->children (qr/^(link | |||||
2383 | |enclosure | ||||||
2384 | |content | ||||||
2385 | |wiki:diff | ||||||
2386 | |wiki:history | ||||||
2387 | |comments | ||||||
2388 | |wfw:comment | ||||||
2389 | |wfw:commentRss | ||||||
2390 | |foaf:maker | ||||||
2391 | |sioc:has_creator | ||||||
2392 | |sioc:has_discussion | ||||||
2393 | |sioc:links_to | ||||||
2394 | |sioc:reply_of | ||||||
2395 | |statusnet:origin | ||||||
2396 | |dc:source | ||||||
2397 | )$/ix); | ||||||
2398 | ### link elts: "@elts" | ||||||
2399 | |||||||
2400 | 0 | my @links; | |||||
2401 | 0 | foreach my $elt (@elts) { | |||||
2402 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
2403 | 0 | require Text::Wrap; | |||||
2404 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long URLs etc | |||||
2405 | 0 | $self->verbose (2, "link\n", Text::Trim::trim($elt->sprint)); | |||||
2406 | } | ||||||
2407 | |||||||
2408 | 0 | my $tag = lc($elt->tag); | |||||
2409 | ### $tag | ||||||
2410 | 0 | 0 | 0 | if ($tag eq 'content' && atom_content_flavour($elt) ne 'link') { | |||
2411 | 0 | next; | |||||
2412 | } | ||||||
2413 | 0 | my $l = { download => 1 }; | |||||
2414 | |||||||
2415 | 0 | foreach my $name ('hreflang', 'title', 'type') { | |||||
2416 | 0 | 0 | $l->{$name} = ($elt->att("atom:$name") // $elt->att($name)); | ||||
2417 | } | ||||||
2418 | |||||||
2419 | 0 | 0 | my $rel = non_empty($elt->att('atom:rel') // $elt->att('rel')); | ||||
2420 | 0 | 0 | if (defined $rel) { | ||||
2421 | # Atom rel="..." | ||||||
2422 | # Maybe: if ($rel eq 'next') ... # not sure about "next" link | ||||||
2423 | |||||||
2424 | 0 | 0 | 0 | if ($rel eq 'self' # the feed itself (in the channel normally) | |||
0 | |||||||
0 | |||||||
2425 | || $rel eq 'edit' # to edit the item, maybe | ||||||
2426 | || $rel eq 'service.edit' # to edit the item | ||||||
2427 | || $rel eq 'license' # probably only in the channel part normally | ||||||
2428 | ) { | ||||||
2429 | 0 | $self->verbose (1, ' ', __x('skip link "{type}"', type => $rel)); | |||||
2430 | 0 | next; | |||||
2431 | } | ||||||
2432 | 0 | 0 | if ($rel eq 'alternate') { | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2433 | # "alternate" is supposed to be the content as the entry, but in a | ||||||
2434 | # web page or something. Not sure that's always quite true, so show | ||||||
2435 | # it as a plain link. If no |
||||||
2436 | # supposed to be mandatory. | ||||||
2437 | |||||||
2438 | } elsif ($rel eq 'enclosure') { | ||||||
2439 | 0 | $l->{'name'} = __('Encl'); | |||||
2440 | |||||||
2441 | } elsif ($rel eq 'ostatus:conversation') { | ||||||
2442 | 0 | $l->{'name'} = __('Conversation'); | |||||
2443 | 0 | $l->{'download'} = 0; | |||||
2444 | |||||||
2445 | } elsif ($rel eq 'ostatus:attention') { | ||||||
2446 | 0 | $l->{'name'} = __('Attention'); | |||||
2447 | 0 | $l->{'download'} = 0; | |||||
2448 | |||||||
2449 | } elsif ($rel eq 'related') { | ||||||
2450 | 0 | $l->{'name'} = __('Related'); | |||||
2451 | |||||||
2452 | } elsif ($rel eq 'replies') { | ||||||
2453 | # Atom | ||||||
2454 | 0 | my $count = $self->item_elt_comments_count($item,$elt); | |||||
2455 | 0 | 0 | if ($self->atom_link_is_rss($elt)) { | ||||
2456 | 0 | 0 | $l->{'name'} = (defined $count | ||||
2457 | ? __x('RSS Replies({count})', count => $count) | ||||||
2458 | : __('RSS Replies')); | ||||||
2459 | 0 | $l->{'priority'} = -10; | |||||
2460 | } else { | ||||||
2461 | 0 | 0 | $l->{'name'} = (defined $count | ||||
2462 | ? __x('Replies({count})', count => $count) | ||||||
2463 | : __('Replies')); | ||||||
2464 | } | ||||||
2465 | 0 | $l->{'download'} = 0; | |||||
2466 | |||||||
2467 | } elsif ($rel eq 'service.post') { | ||||||
2468 | 0 | $l->{'name'} = __('Comments'); | |||||
2469 | 0 | $l->{'download'} = 0; | |||||
2470 | |||||||
2471 | } elsif ($rel eq 'via') { | ||||||
2472 | 0 | $l->{'name'} = __('Via'); | |||||
2473 | 0 | $l->{'download'} = 0; | |||||
2474 | |||||||
2475 | } else { | ||||||
2476 | 0 | $l->{'name'} = __x('{linkrel}', linkrel => $rel); | |||||
2477 | } | ||||||
2478 | |||||||
2479 | } else { # ! defined $rel | ||||||
2480 | # tags without rel="" attribute | ||||||
2481 | # | ||||||
2482 | 0 | 0 | if ($tag eq 'enclosure') { | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2483 | 0 | $l->{'name'} = __('Encl'); | |||||
2484 | |||||||
2485 | } elsif ($tag eq 'dc:source') { | ||||||
2486 | 0 | $l->{'name'} = __('Source'); | |||||
2487 | 0 | $l->{'download'} = 0; | |||||
2488 | |||||||
2489 | } elsif ($tag eq 'wiki:diff') { | ||||||
2490 | 0 | $l->{'name'} = __('Diff'); | |||||
2491 | |||||||
2492 | } elsif ($tag eq 'wiki:history') { | ||||||
2493 | 0 | $l->{'name'} = __('History'); | |||||
2494 | 0 | $l->{'download'} = 0; | |||||
2495 | |||||||
2496 | } elsif ($tag =~ /foaf:maker|sioc:has_creator/) { | ||||||
2497 | 0 | $l->{'name'} = __('Author'); | |||||
2498 | 0 | $l->{'download'} = 0; | |||||
2499 | 0 | $l->{'priority'} = -20; # low | |||||
2500 | |||||||
2501 | } elsif ($tag eq 'statusnet:origin') { | ||||||
2502 | 0 | $l->{'name'} = __('Geo location'); | |||||
2503 | 0 | $l->{'download'} = 0; | |||||
2504 | 0 | $l->{'priority'} = -101; # just after Geo location | |||||
2505 | |||||||
2506 | } elsif ($tag eq 'sioc:has_discussion') { | ||||||
2507 | 0 | $l->{'name'} = __('Discussion'); | |||||
2508 | 0 | $l->{'download'} = 0; | |||||
2509 | |||||||
2510 | } elsif ($tag eq 'wfw:commentrss') { | ||||||
2511 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
2512 | 0 | $l->{'name'} = __x('RSS Comments({count})', count => $count); | |||||
2513 | } else { | ||||||
2514 | 0 | $l->{'name'} = __('RSS Comments'); | |||||
2515 | } | ||||||
2516 | 0 | $l->{'download'} = 0; | |||||
2517 | 0 | $l->{'priority'} = -10; | |||||
2518 | |||||||
2519 | } elsif ($tag =~ /comment/) { # |
||||||
2520 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
2521 | 0 | $l->{'name'} = __x('Comments({count})', count => $count); | |||||
2522 | } else { | ||||||
2523 | 0 | $l->{'name'} = __('Comments'); | |||||
2524 | } | ||||||
2525 | 0 | $l->{'download'} = 0; | |||||
2526 | } | ||||||
2527 | } | ||||||
2528 | |||||||
2529 | # Atom | ||||||
2530 | # RSS http:.. | ||||||
2531 | # RSS |
||||||
2532 | 0 | 0 | my $uri = (non_empty ($elt->att('atom:href')) # Atom | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
2533 | // non_empty ($elt->att('href')) # Atom | ||||||
2534 | // non_empty ($elt->att('atom:src')) # Atom |
||||||
2535 | // non_empty ($elt->att('src')) # Atom |
||||||
2536 | // non_empty ($elt->att('url')) # RSS |
||||||
2537 | # |
||||||
2538 | // non_empty ($elt->att('rdf:resource')) | ||||||
2539 | // non_empty ($elt->trimmed_text) # RSS | ||||||
2540 | // next); | ||||||
2541 | 0 | $uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $uri); | |||||
2542 | |||||||
2543 | 0 | $l->{'uri'} = $uri; | |||||
2544 | 0 | 0 | $l->{'name'} //= __('Link'); | ||||
2545 | |||||||
2546 | 0 | my @paren; | |||||
2547 | # show length if biggish, often provided on enclosures but not plain | ||||||
2548 | # links | ||||||
2549 | 0 | 0 | 0 | if (defined (my $length = ($elt->att('atom:length') | |||
2550 | // $elt->att('length')))) { | ||||||
2551 | 0 | push @paren, $self->format_size_in_bytes($length); | |||||
2552 | } | ||||||
2553 | # |
||||||
2554 | # seconds, otherwise MM:SS or HH:MM:SS. | ||||||
2555 | 0 | 0 | 0 | if ($tag eq 'enclosure' | |||
2556 | && defined (my $duration = non_empty ($item->first_child_text('itunes:duration')))) { | ||||||
2557 | 0 | 0 | if ($duration !~ /:/) { | ||||
2558 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
2559 | duration => $duration); | ||||||
2560 | } | ||||||
2561 | 0 | push @paren, $duration; | |||||
2562 | } | ||||||
2563 | 0 | 0 | if (@paren) { | ||||
2564 | 0 | $l->{'name'} .= '('.join(', ',@paren). ')'; | |||||
2565 | } | ||||||
2566 | |||||||
2567 | ### push link: $l | ||||||
2568 | 0 | push @links, $l; | |||||
2569 | } | ||||||
2570 | |||||||
2571 | # eg. RSS | ||||||
2572 | # Atom | ||||||
2573 | # |
||||||
2574 | # |
||||||
2575 | # | ||||||
2576 | # |
||||||
2577 | # | ||||||
2578 | # | ||||||
2579 | # | ||||||
2580 | 0 | foreach my $elt ($item->children('source')) { | |||||
2581 | 0 | 0 | my $str = non_empty (elt_to_rendered_line($elt->first_child('title'))) | ||||
2582 | // non_empty ($elt->trimmed_text); | ||||||
2583 | 0 | 0 | if (defined $str) { | ||||
2584 | ### source: $str | ||||||
2585 | 0 | push @links, { name => __('Source') . ": $str", | |||||
2586 | download => 0, | ||||||
2587 | priority => -200, | ||||||
2588 | }; | ||||||
2589 | } | ||||||
2590 | 0 | foreach my $subelt | |||||
0 | |||||||
2591 | ($elt, | ||||||
2592 | grep {$self->atom_link_is_rss($_)} $elt->children('link') | ||||||
2593 | ) { | ||||||
2594 | 0 | 0 | 0 | if (defined $subelt | |||
0 | |||||||
0 | |||||||
2595 | && defined (my $url = non_empty ($subelt->att('url')) | ||||||
2596 | // non_empty ($subelt->att('href')) | ||||||
2597 | // non_empty ($subelt->att('atom:href')))) { | ||||||
2598 | 0 | push @links, { name => __('Source RSS'), | |||||
2599 | uri => App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri($subelt,$url), | ||||||
2600 | download => 0, | ||||||
2601 | priority => -200, | ||||||
2602 | }; | ||||||
2603 | } | ||||||
2604 | } | ||||||
2605 | } | ||||||
2606 | |||||||
2607 | # Merge together duplicate urls, so as not to download two copies as | ||||||
2608 | # attachments, and so as to make it clear when there's only one | ||||||
2609 | # destination for two things. | ||||||
2610 | # | ||||||
2611 | # Have seen same url under and |
||||||
2612 | # http://sourceforge.net/export/rss2_keepsake.php?group_id=203650 | ||||||
2613 | # or same url under and |
||||||
2614 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2615 | { | ||||||
2616 | 0 | my %seen; | |||||
0 | |||||||
2617 | 0 | @links = grep { | |||||
2618 | 0 | my $l = $_; | |||||
2619 | 0 | my $want = 1; | |||||
2620 | 0 | 0 | if (my $uri = $l->{'uri'}) { | ||||
2621 | 0 | my $canonical = $uri->canonical; | |||||
2622 | 0 | $canonical->fragment(undef); # ignore #foo anchor for uniqueness | |||||
2623 | 0 | 0 | if (my $prev_l = $seen{$canonical}) { | ||||
2624 | 0 | $want = 0; | |||||
2625 | 0 | 0 | $prev_l->{'download'} ||= $l->{'download'}; | ||||
2626 | 0 | 0 | $l->{'priority'} = max ($l->{'priority'}||0, | ||||
0 | |||||||
2627 | $prev_l->{'priority'}||0); | ||||||
2628 | |||||||
2629 | # prefer no anchor if have both with and without | ||||||
2630 | 0 | 0 | if (is_empty($l->{'uri'}->fragment)) { | ||||
2631 | 0 | $prev_l->{'uri'} = $l->{'uri'}; | |||||
2632 | } | ||||||
2633 | |||||||
2634 | 0 | 0 | if ($prev_l->{'name'} eq __('Link')) { | ||||
0 | |||||||
0 | |||||||
2635 | # name "Link" doesn't say much, prefer the other over "Link" | ||||||
2636 | 0 | $prev_l->{'name'} = $l->{'name'}; | |||||
2637 | } elsif ($l->{'name'} eq __('Link')) { | ||||||
2638 | # don't append "Link" to the previous | ||||||
2639 | } elsif ($l->{'name'} eq $prev_l->{'name'}) { | ||||||
2640 | # don't double the same name | ||||||
2641 | } else { | ||||||
2642 | 0 | $prev_l->{'name'} .= ", $l->{'name'}"; | |||||
2643 | } | ||||||
2644 | } | ||||||
2645 | 0 | $seen{$canonical} = $l; | |||||
2646 | } | ||||||
2647 | $want | ||||||
2648 | 0 | } @links; | |||||
2649 | } | ||||||
2650 | 0 | foreach my $l (@links) { | |||||
2651 | 0 | 0 | if ($l->{'uri'}) { | ||||
2652 | 0 | $l->{'name'} .= ':'; | |||||
2653 | } | ||||||
2654 | } | ||||||
2655 | |||||||
2656 | 0 | 0 | if (defined (my $str = $self->item_to_lat_long_alt_str ($item))) { | ||||
2657 | 0 | push @links, { name => $str, | |||||
2658 | download => 0, | ||||||
2659 | priority => -100, # lat/long low priority | ||||||
2660 | }; | ||||||
2661 | } | ||||||
2662 | |||||||
2663 | # re:rank as for example from stackexchange.com | ||||||
2664 | # What does label="" usually show? Are parens like this good? | ||||||
2665 | 0 | foreach my $elt ($item->children('re:rank')) { | |||||
2666 | 0 | my $label = $elt->att('label'); | |||||
2667 | 0 | my $value = elt_to_rendered_line($elt); | |||||
2668 | 0 | 0 | push @links, { name => (defined $label | ||||
2669 | ? __x('Rank: {value} ({label})', value => $value, label => $label) | ||||||
2670 | : __x('Rank: {value}', value => $value)), | ||||||
2671 | download => 0, | ||||||
2672 | priority => -200, # low priority | ||||||
2673 | }; | ||||||
2674 | } | ||||||
2675 | |||||||
2676 | # eg. |
||||||
2677 | # is there any value in the role="" part? | ||||||
2678 | 0 | foreach my $elt ($item->children('media:credit')) { | |||||
2679 | 0 | push @links, { name => __x('Credit: {who}', | |||||
2680 | who => scalar(elt_to_rendered_line($elt))), | ||||||
2681 | download => 0, | ||||||
2682 | priority => -200, # very low priority | ||||||
2683 | }; | ||||||
2684 | } | ||||||
2685 | |||||||
2686 | # |
||||||
2687 | # | ||||||
2688 | # Allow for empty |
||||||
2689 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
2690 | # | ||||||
2691 | 0 | foreach my $elt ($item->children('itunes:explicit')) { | |||||
2692 | 0 | 0 | my $line = elt_to_rendered_line($elt) | ||||
2693 | // next; # skip empty |
||||||
2694 | 0 | push @links, { name => __x('Explicit: {value}', value => $line), | |||||
2695 | download => 0, | ||||||
2696 | priority => -200, # very low priority | ||||||
2697 | }; | ||||||
2698 | } | ||||||
2699 | |||||||
2700 | # |
||||||
2701 | # a fun kind of commentary thing | ||||||
2702 | 0 | foreach my $elt ($item->children('slash:department')) { | |||||
2703 | 0 | push @links, { name => __x('Department: {department}', | |||||
2704 | department => scalar(elt_to_rendered_line($elt))), | ||||||
2705 | download => 0, | ||||||
2706 | priority => -200, # very low priority | ||||||
2707 | }; | ||||||
2708 | } | ||||||
2709 | |||||||
2710 | 0 | return @links; | |||||
2711 | } | ||||||
2712 | @known{qw( | ||||||
2713 | /channel/item/pheedo:origLink | ||||||
2714 | /channel/item/feedburner:origLink | ||||||
2715 | |||||||
2716 | /channel/item/link | ||||||
2717 | /channel/item/enclosure | ||||||
2718 | /channel/item/dc:source | ||||||
2719 | /channel/item/comments | ||||||
2720 | /channel/item/wfw:comment | ||||||
2721 | /channel/item/wfw:commentRss | ||||||
2722 | /channel/item/slash:comments | ||||||
2723 | /channel/item/slash:hit_parade | ||||||
2724 | /channel/item/slash:department | ||||||
2725 | /channel/item/thr:total | ||||||
2726 | /channel/item/content --atom | ||||||
2727 | /channel/item/wiki:diff | ||||||
2728 | /channel/item/itunes:duration | ||||||
2729 | /channel/item/re:rank | ||||||
2730 | |||||||
2731 | /channel/wiki:interwiki | ||||||
2732 | /channel/wiki:interwiki/rdf:Description | ||||||
2733 | /channel/wiki:interwiki/rdf:Description/rdf:value | ||||||
2734 | /channel/item/wiki:version | ||||||
2735 | /channel/item/wiki:status | ||||||
2736 | /channel/item/wiki:history | ||||||
2737 | /channel/item/foaf:maker | ||||||
2738 | /channel/item/sioc:has_creator | ||||||
2739 | /channel/item/sioc:has_discussion | ||||||
2740 | /channel/item/sioc:links_to | ||||||
2741 | /channel/item/sioc:reply_of | ||||||
2742 | /channel/item/media:credit | ||||||
2743 | /channel/item/itunes:explicit | ||||||
2744 | /channel/item/itunes:block | ||||||
2745 | |||||||
2746 | --believed-to-be-duplicate-of-description | ||||||
2747 | /channel/item/media:content | ||||||
2748 | /channel/item/media:text | ||||||
2749 | )} = (); | ||||||
2750 | |||||||
2751 | # sub any_link_replies_nonfeed { | ||||||
2752 | # foreach my $elt (@_) { | ||||||
2753 | # # ### any_link_replies_nonfeed(): ref $elt, "$elt", $elt->tag | ||||||
2754 | # # my $rel = ($elt->att('atom:rel') // $elt->att('rel') // ''); | ||||||
2755 | # # my $type = ($elt->att('atom:type') // $elt->att('type') // ''); | ||||||
2756 | # # ### $rel | ||||||
2757 | # # ### $type | ||||||
2758 | # if ($elt->tag eq 'link' | ||||||
2759 | # && (($elt->att('atom:rel') // $elt->att('rel') // '') | ||||||
2760 | # eq 'replies') | ||||||
2761 | # && (($elt->atom_link_is_rss($elt))) { | ||||||
2762 | # return 1; | ||||||
2763 | # } | ||||||
2764 | # } | ||||||
2765 | # return 0; | ||||||
2766 | # } | ||||||
2767 | |||||||
2768 | # Return a string which is the latitude, longitude and possibly altitude | ||||||
2769 | # from the item. If no location in the item then return undef. | ||||||
2770 | # | ||||||
2771 | sub item_to_lat_long_alt_str { | ||||||
2772 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2773 | 0 | 0 | my ($lat, $long, $alt) = $self->item_to_lat_long_alt_values ($item) | ||||
2774 | or return; | ||||||
2775 | ### $lat | ||||||
2776 | ### $long | ||||||
2777 | ### $alt | ||||||
2778 | |||||||
2779 | 0 | 0 | if (Scalar::Util::looks_like_number($lat)) { | ||||
2780 | 0 | 0 | $lat = ($lat >= 0 | ||||
2781 | # TRANSLATORS: the latin1/unicode degree symbol can be used here | ||||||
2782 | # instead of " deg", if it will be recognised in translation, | ||||||
2783 | # etc. | ||||||
2784 | ? __x('{latitude} deg N', latitude => $lat) | ||||||
2785 | : __x('{latitude} deg S', latitude => -$lat)); | ||||||
2786 | } | ||||||
2787 | 0 | 0 | if (Scalar::Util::looks_like_number($long)) { | ||||
2788 | 0 | 0 | $long = ($long >= 0 | ||||
2789 | ? __x('{longitude} deg E', longitude => $long) | ||||||
2790 | : __x('{longitude} deg W', longitude => -$long)); | ||||||
2791 | } | ||||||
2792 | |||||||
2793 | 0 | 0 | if (is_non_empty ($alt)) { | ||||
2794 | 0 | return __x('Geo location: {latitude}, {longitude}, alt {altitude}m', | |||||
2795 | latitude => $lat, | ||||||
2796 | longitude => $long, | ||||||
2797 | altitude => $alt); | ||||||
2798 | } else { | ||||||
2799 | 0 | return __x('Geo location: {latitude}, {longitude}', | |||||
2800 | latitude => $lat, | ||||||
2801 | longitude => $long); | ||||||
2802 | } | ||||||
2803 | } | ||||||
2804 | |||||||
2805 | # Return a list of values which are the latitude, longitude and possibly | ||||||
2806 | # altitude extracted from $item. | ||||||
2807 | # | ||||||
2808 | # ($latitude, $longitude, $altitude) | ||||||
2809 | # ($latitude, $longitude) | ||||||
2810 | # () | ||||||
2811 | # | ||||||
2812 | # If no location then return an empty list. Some of the values returned | ||||||
2813 | # might be empty strings if say there's a |
||||||
2814 | # | ||||||
2815 | # Latitude is degrees North, or negative for South. Longitude is degrees | ||||||
2816 | # East, or negative for West. Both possibly with decimal places. | ||||||
2817 | # | ||||||
2818 | sub item_to_lat_long_alt_values { | ||||||
2819 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2820 | |||||||
2821 | # per-item |
||||||
2822 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
2823 | # |
||||||
2824 | # |
||||||
2825 | # |
||||||
2826 | # | ||||||
2827 | # or under geo:Point, maybe, eg. http://www.gdacs.org/xml/RSSTC.xml | ||||||
2828 | # |
||||||
2829 | # |
||||||
2830 | # |
||||||
2831 | # |
||||||
2832 | # | ||||||
2833 | 0 | foreach my $elt ($item, $item->children(qr/^geo:point$/i)) { | |||||
2834 | 0 | my $lat = $elt->first_child_trimmed_text('geo:lat'); | |||||
2835 | 0 | 0 | if (is_non_empty ($lat)) { | ||||
2836 | 0 | return ($lat, | |||||
2837 | $elt->first_child_trimmed_text('geo:long'), | ||||||
2838 | non_empty ($elt->first_child_trimmed_text('geo:alt'))); | ||||||
2839 | } | ||||||
2840 | } | ||||||
2841 | |||||||
2842 | # |
||||||
2843 | # |
||||||
2844 | # space separator per http://www.georss.org/Encodings | ||||||
2845 | { | ||||||
2846 | 0 | my $str = $item->first_child_trimmed_text ('georss:point'); | |||||
0 | |||||||
2847 | 0 | 0 | if (is_non_empty ($str)) { | ||||
2848 | 0 | return split(/\s+/, $str, 2); # no altitude | |||||
2849 | } | ||||||
2850 | } | ||||||
2851 | |||||||
2852 | # |
||||||
2853 | # | ||||||
2854 | # rdf:resource="http://sws.geonames.org/2638077/"> | ||||||
2855 | # | ||||||
2856 | 0 | 0 | if (my $elt = $item->first_child ('statusnet:origin')) { | ||||
2857 | 0 | 0 | if (defined (my $lat = $elt->att('geo:lat'))) { | ||||
2858 | 0 | my $long = $elt->att('geo:long'); | |||||
2859 | 0 | return ($lat, $long); | |||||
2860 | } | ||||||
2861 | } | ||||||
2862 | |||||||
2863 | 0 | return; # not found | |||||
2864 | } | ||||||
2865 | @known{qw(/channel/item/geo:lat | ||||||
2866 | /channel/item/geo:long | ||||||
2867 | /channel/item/geo:alt | ||||||
2868 | /channel/item/geo:Point | ||||||
2869 | /channel/item/geo:Point/geo:lat | ||||||
2870 | /channel/item/geo:Point/geo:long | ||||||
2871 | /channel/item/georss:point | ||||||
2872 | /channel/item/statusnet:origin | ||||||
2873 | )} = (); | ||||||
2874 | |||||||
2875 | |||||||
2876 | sub links_to_html { | ||||||
2877 | 0 | 0 | 0 | 0 | @_ or return ''; | ||
2878 | |||||||
2879 | # |
||||||
2880 | # up by a line-wrap, which can make it hard to cut and paste. can |
||||||
2881 | # prevent a line wrap, but it ends up treated as starting a paragraph, | ||||||
2882 | # separate from the 'name' part. | ||||||
2883 | # | ||||||
2884 | 0 | my $str = ''; | |||||
2885 | 0 | my $sep = "\n\n \n"; |
|||||
2886 | 0 | foreach my $l (@_) { | |||||
2887 | 0 | $str .= "$sep |
|||||
2888 | 0 | $sep = " \n"; |
|||||
2889 | |||||||
2890 | 0 | 0 | if (defined (my $uri = $l->{'uri'})) { | ||||
2891 | 0 | $str .= " | |||||
2892 | 0 | 0 | if (defined (my $hreflang = $l->{'hreflang'})) { | ||||
2893 | 0 | $str .= " hreflang=\"$Entitize{$hreflang}\""; | |||||
2894 | } | ||||||
2895 | 0 | 0 | if (defined (my $type = $l->{'type'})) { | ||||
2896 | 0 | $str .= " type=\"$Entitize{$type}\""; | |||||
2897 | } | ||||||
2898 | 0 | $uri = $Entitize{$uri}; | |||||
2899 | 0 | $str .= " href=\"$uri\">$uri"; | |||||
2900 | } | ||||||
2901 | 0 | $str .= "\n"; | |||||
2902 | } | ||||||
2903 | 0 | return "$str\n"; | |||||
2904 | } | ||||||
2905 | |||||||
2906 | sub links_to_text { | ||||||
2907 | 0 | 0 | 0 | return join ('', map { join_non_empty (' ', | |||
0 | |||||||
2908 | $_->{'name'}, | ||||||
2909 | $_->{'uri'}) . "\n" } @_); | ||||||
2910 | } | ||||||
2911 | |||||||
2912 | |||||||
2913 | #------------------------------------------------------------------------------ | ||||||
2914 | # "From:" and email addresses | ||||||
2915 | |||||||
2916 | 1 | 1 | 11072 | use constant DUMMY_EMAIL_ADDRESS => 'nobody@rss2leafnode.dummy'; | |||
1 | 10 | ||||||
1 | 8161 | ||||||
2917 | |||||||
2918 | { | ||||||
2919 | my %tag_to_link_name | ||||||
2920 | = (author => __('Author:'), | ||||||
2921 | creator => __('Creator:'), | ||||||
2922 | contributor => __('Contributor:'), | ||||||
2923 | managingEditor => __('Managing Editor:'), | ||||||
2924 | webMaster => __('Webmaster:'), | ||||||
2925 | publisher => __('Publisher:'), | ||||||
2926 | owner => __('Owner:'), | ||||||
2927 | username => __('User:'), | ||||||
2928 | ); | ||||||
2929 | |||||||
2930 | # Return ($from, $linkhash,$linkhash,...). | ||||||
2931 | # $from is a string like "foo@example.com". | ||||||
2932 | # Multiple authors are for example "foo@example.com, quux@example.com" as | ||||||
2933 | # per RFC5322 email, though currently no Sender: is picked out from among | ||||||
2934 | # them. | ||||||
2935 | # | ||||||
2936 | sub item_to_from { | ||||||
2937 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
2938 | ### item_to_from() ... | ||||||
2939 | 0 | my $channel = item_to_channel($item); | |||||
2940 | |||||||
2941 | # |
||||||
2942 | # looser. The RSS recommendation is |
||||||
2943 | # and |
||||||
2944 | # | ||||||
2945 | # |
||||||
2946 | # attribute. | ||||||
2947 | # | ||||||
2948 | # |
||||||
2949 | # | ||||||
2950 | # |
||||||
2951 | # to show just the primary author or authors. | ||||||
2952 | # | ||||||
2953 | # The first | ||||||
2954 | # | ||||||
2955 | 0 | my @from; | |||||
2956 | my @links; | ||||||
2957 | 0 | foreach my $try ([$item, 'author'], | |||||
2958 | [$item, 'jf:author'], | ||||||
2959 | [$item, 'slate:author'], | ||||||
2960 | [$item, 'dc:creator'], | ||||||
2961 | [$item, 'dc:contributor'], | ||||||
2962 | [$item, 'wiki:username'], | ||||||
2963 | [$item, 'itunes:author'], | ||||||
2964 | |||||||
2965 | [$channel, 'author'], | ||||||
2966 | [$channel, 'dc:creator'], | ||||||
2967 | [$channel, 'itunes:author'], | ||||||
2968 | [$channel, 'managingEditor'], | ||||||
2969 | [$channel, 'webMaster'], | ||||||
2970 | |||||||
2971 | [$item , 'dc:publisher'], | ||||||
2972 | [$channel, 'dc:publisher'], | ||||||
2973 | [$channel, 'itunes:owner'], | ||||||
2974 | ) { | ||||||
2975 | 0 | my ($where, $tag) = @$try; | |||||
2976 | ### $tag | ||||||
2977 | |||||||
2978 | 0 | 0 | if (my @elts = $item->children($tag)) { | ||||
2979 | 0 | foreach my $elt (@elts) { | |||||
2980 | ### elt for From: $elt->sprint | ||||||
2981 | 0 | push @from, $self->elt_to_email($elt); | |||||
2982 | |||||||
2983 | # author's home page etc as a link | ||||||
2984 | 0 | 0 | 0 | if (my $uri = | |||
0 | |||||||
2985 | (# Atom | ||||||
2986 | # |
||||||
2987 | # |
||||||
2988 | # |
||||||
2989 | # | ||||||
2990 | # | ||||||
2991 | non_empty ($elt->first_child_text('uri')) | ||||||
2992 | |||||||
2993 | # slate.com | ||||||
2994 | # |
||||||
2995 | // non_empty ($elt->att('url')) | ||||||
2996 | |||||||
2997 | # ModWiki dc:contributor example | ||||||
2998 | # |
||||||
2999 | # |
||||||
3000 | # | ||||||
3001 | # The text shows rss:link= and the example just link=. | ||||||
3002 | # | ||||||
3003 | // non_empty (do { | ||||||
3004 | 0 | 0 | 0 | my $child; ($child = $elt->first_child('rdf:Description')) | |||
0 | |||||||
3005 | && ($child->att('link') // $child->att('rss:link')) | ||||||
3006 | }))) { | ||||||
3007 | 0 | my $tag = $elt->tag; | |||||
3008 | 0 | $tag =~ s/.*?://; | |||||
3009 | 0 | 0 | push @links, { uri => URI->new($uri), | ||||
3010 | name => ($tag_to_link_name{$tag} // "\u$tag:"), | ||||||
3011 | download => 0, | ||||||
3012 | priority => -20 }; | ||||||
3013 | } | ||||||
3014 | } | ||||||
3015 | } | ||||||
3016 | 0 | 0 | last if @from; | ||||
3017 | } | ||||||
3018 | 0 | 0 | if (! @from) { | ||||
3019 | # Atom |
||||||
3020 | # Hope the channel title is different from the item title. | ||||||
3021 | 0 | @from = ($self->email_format (elt_to_rendered_line | |||||
3022 | ($channel->first_child('title')))); | ||||||
3023 | } | ||||||
3024 | 0 | 0 | if (! @from) { | ||||
3025 | 0 | @from = ('nobody@'.$self->uri_to_host); | |||||
3026 | } | ||||||
3027 | |||||||
3028 | ### @from | ||||||
3029 | 0 | return (join(', ',@from), | |||||
3030 | @links); | ||||||
3031 | } | ||||||
3032 | @known{qw(/channel/author | ||||||
3033 | /channel/author/name --atom | ||||||
3034 | /channel/author/uri --atom | ||||||
3035 | /channel/author/url --atom-typo-maybe | ||||||
3036 | /channel/author/email --atom | ||||||
3037 | /channel/managingEditor | ||||||
3038 | /channel/webMaster | ||||||
3039 | /channel/dc:publisher | ||||||
3040 | /channel/dc:creator | ||||||
3041 | /channel/itunes:author | ||||||
3042 | |||||||
3043 | /channel/item/author | ||||||
3044 | /channel/item/author/name --atom | ||||||
3045 | /channel/item/author/uri --atom | ||||||
3046 | /channel/item/author/url --atom-typo-maybe | ||||||
3047 | /channel/item/author/email --atom | ||||||
3048 | /channel/item/author/gd:extendedProperty --good-dinner | ||||||
3049 | /channel/item/dc:creator | ||||||
3050 | /channel/item/dc:publisher | ||||||
3051 | /channel/item/wiki:username | ||||||
3052 | /channel/item/itunes:author | ||||||
3053 | /channel/item/dc:contributor | ||||||
3054 | /channel/item/dc:contributor/rdf:Description | ||||||
3055 | /channel/item/dc:contributor/rdf:Description/rdf:value | ||||||
3056 | /channel/item/jf:author | ||||||
3057 | /channel/item/slate:author | ||||||
3058 | |||||||
3059 | /channel/item/contributor --atom | ||||||
3060 | /channel/item/contributor/name | ||||||
3061 | /channel/item/contributor/uri | ||||||
3062 | /channel/item/contributor/url --atom-typo-maybe | ||||||
3063 | /channel/item/contributor/email | ||||||
3064 | |||||||
3065 | /channel/item/activity:actor | ||||||
3066 | /channel/item/activity:verb --usually-post-or-something | ||||||
3067 | /channel/item/activity:object-type --is-this-anything | ||||||
3068 | )} = (); | ||||||
3069 | } | ||||||
3070 | |||||||
3071 | # $elt is an XML::Twig::Elt | ||||||
3072 | # Return an email address, either just the text part of $elt or Atom | ||||||
3073 | # sub-elements |
||||||
3074 | # If $elt is empty then return an empty list. | ||||||
3075 | # | ||||||
3076 | sub elt_to_email { | ||||||
3077 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
3078 | 0 | 0 | return unless defined $elt; | ||||
3079 | |||||||
3080 | # |
||||||
3081 | # |
||||||
3082 | 0 | my $email = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?email$/)); | |||||
3083 | |||||||
3084 | # |
||||||
3085 | # |
||||||
3086 | 0 | 0 | my $display = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?name$/)) | ||||
3087 | // ''; | ||||||
3088 | |||||||
3089 | my $maybe = join | ||||||
3090 | (' ', | ||||||
3091 | non_empty ($elt->text_only), | ||||||
3092 | 0 | non_empty (do { | |||||
3093 | # |
||||||
3094 | 0 | 0 | my $rdfdesc; ($rdfdesc = $elt->first_child('rdf:Description')) | ||||
0 | |||||||
3095 | && $rdfdesc->first_child_text('rdf:value') | ||||||
3096 | })); | ||||||
3097 | |||||||
3098 | 0 | return $self->email_format_maybe ($maybe, $display, $email); | |||||
3099 | } | ||||||
3100 | |||||||
3101 | # $mailbox_re is a mailbox with domain, like "foo@example.com" | ||||||
3102 | # Allows no dots like "foo@localhost". | ||||||
3103 | # Allows dashes like "www-something@example.com". | ||||||
3104 | # | ||||||
3105 | # $mailbox_with_comment_re allows an optional paren comment part like | ||||||
3106 | # "foo@example.com (Foo)" | ||||||
3107 | # | ||||||
3108 | # cf Email::Address $addr_spec, but its version 1.890 loosened to allow a | ||||||
3109 | # domain-less bare "foo", which is no good | ||||||
3110 | # | ||||||
3111 | my $words_with_dots_re = qr/[[:word:]-]+(\.[[:word:]-]+)*/; | ||||||
3112 | my $mailbox_re = qr/$words_with_dots_re\@$words_with_dots_re/o; | ||||||
3113 | my $mailbox_with_comment_re = qr/$mailbox_re(\s*\([^\)]*\))?/os; | ||||||
3114 | |||||||
3115 | # $maybe is some free-form author part possibly including a foo@example.com | ||||||
3116 | # $display is a display part for the email like "Foo", possibly empty "" | ||||||
3117 | # $email is a mailbox "foo@example.com", or undef | ||||||
3118 | # return an rfc822 "Foo |
||||||
3119 | # | ||||||
3120 | sub email_format_maybe { | ||||||
3121 | 0 | 0 | 0 | my ($self, $maybe, $display, $email) = @_; | |||
3122 | ### email_format_maybe() start | ||||||
3123 | ### $maybe | ||||||
3124 | ### $display | ||||||
3125 | |||||||
3126 | |||||||
3127 | # look also at $display the same in case Atom no |
||||||
3128 | # which is a mailbox and can be corrected, | ||||||
3129 | # eg. http://www.weather.gov/alerts-beta/hi.php?x=0 | ||||||
3130 | # | ||||||
3131 | # Or $maybe full email like | ||||||
3132 | 0 | 0 | if (is_empty($email)) { | ||||
3133 | 0 | foreach ($maybe, $display) { | |||||
3134 | |||||||
3135 | 0 | 0 | 0 | if (/^\s*(mailto:)?($mailbox_with_comment_re)\s*$/o) { | |||
0 | |||||||
3136 | ### maybe or display is a mailbox | ||||||
3137 | # "foo@example.com" | ||||||
3138 | # "mailto:foo@example.com" | ||||||
3139 | # "foo@example.com (Foo)" | ||||||
3140 | 0 | $email = $2; | |||||
3141 | 0 | undef $_; | |||||
3142 | 0 | last; | |||||
3143 | |||||||
3144 | } elsif (/(.*)\((mailto:)?($mailbox_re)\)\s*$/o | ||||||
3145 | || /(.*)<(mailto:)?($mailbox_re)>\s*$/o) { | ||||||
3146 | ### maybe or display part is display plus mailbox | ||||||
3147 | # "Foo (mailto:foo@example.com)" | ||||||
3148 | # "Foo (foo@example.com)" | ||||||
3149 | # "Foo |
||||||
3150 | # | ||||||
3151 | 0 | $_ = $1; | |||||
3152 | 0 | $email = $3; | |||||
3153 | 0 | last; | |||||
3154 | } | ||||||
3155 | } | ||||||
3156 | } | ||||||
3157 | |||||||
3158 | 0 | 0 | $display .= ' '.($maybe//''); | ||||
3159 | 0 | my $ret; | |||||
3160 | 0 | 0 | 0 | if (is_empty($email) && $display =~ /^$mailbox_re$/o) { | |||
3161 | # display or maybe is a "foo@example.com" or "foo@example.com (Foo)", | ||||||
3162 | # return it as-is, in particular leave it in "(Foo)" style comment | ||||||
3163 | 0 | $ret = $display; | |||||
3164 | } else { | ||||||
3165 | 0 | $ret = $self->email_format ($display, $email); | |||||
3166 | } | ||||||
3167 | |||||||
3168 | # Collapse whitespace against possible tabs and newlines in an |
||||||
3169 | # from googlegroups for instance. MIME::Entity seems to collapse | ||||||
3170 | # newlines, but not tabs. | ||||||
3171 | 0 | return non_empty (collapse_whitespace ($ret)); | |||||
3172 | } | ||||||
3173 | |||||||
3174 | # $display is a display part for the email "Foo", possibly empty "" | ||||||
3175 | # $email is a mailbox "foo@example.com", or undef or empty "" | ||||||
3176 | # return an rfc822 "Foo |
||||||
3177 | # | ||||||
3178 | sub email_format { | ||||||
3179 | 0 | 0 | 0 | my ($self, $display, $email) = @_; | |||
3180 | ### $display | ||||||
3181 | 0 | $display = Text::Trim::trim($display); | |||||
3182 | 0 | $email = Text::Trim::trim($email); | |||||
3183 | 0 | 0 | if (is_empty($display)) { | ||||
3184 | 0 | 0 | if (is_empty($email)) { | ||||
3185 | 0 | return; | |||||
3186 | } else { | ||||||
3187 | 0 | return $email; | |||||
3188 | } | ||||||
3189 | } | ||||||
3190 | 0 | 0 | if (is_empty($email)) { | ||||
3191 | # think can't have empty <> or omitted, otherwise the quoted part is | ||||||
3192 | # still parsed as an address, certainly it's not rfc822 compliant to | ||||||
3193 | # omit | ||||||
3194 | 0 | $email = 'nobody@'.$self->uri_to_host; | |||||
3195 | } else { | ||||||
3196 | 0 | $email = $email; | |||||
3197 | } | ||||||
3198 | 0 | return email_phrase_quote_maybe($display) . " <$email>"; | |||||
3199 | } | ||||||
3200 | |||||||
3201 | # return $str with quotes like "Foo Bar" if it needs them to go in an email | ||||||
3202 | # display part | ||||||
3203 | sub email_phrase_quote_maybe { | ||||||
3204 | 0 | 0 | 0 | my ($str) = @_; | |||
3205 | 0 | 0 | return if ! defined $str; | ||||
3206 | |||||||
3207 | # RFC2822 "atext" characters, with "-" last | ||||||
3208 | 0 | 0 | if ($str =~ m<[^[:alnum:][:space:]!#\$%&'*+/=?^_`{|}~-]>) { | ||||
3209 | # strange chars, need to quote | ||||||
3210 | 0 | return email_phrase_quote($str); | |||||
3211 | } else { | ||||||
3212 | # alphanumeric and whitespace, no quotes | ||||||
3213 | 0 | return $str; | |||||
3214 | } | ||||||
3215 | } | ||||||
3216 | sub email_phrase_quote { | ||||||
3217 | 0 | 0 | 0 | my ($str) = @_; | |||
3218 | 0 | 0 | return if ! defined $str; | ||||
3219 | 0 | $str =~ s/^"(.*)"$/$1/; # strip existing quotes | |||||
3220 | 0 | $str =~ s/(["\\])/\\$1/g; # escape internal quotes and backslashes | |||||
3221 | 0 | return "\"$str\""; | |||||
3222 | } | ||||||
3223 | |||||||
3224 | |||||||
3225 | #------------------------------------------------------------------------------ | ||||||
3226 | # rss_newest_only | ||||||
3227 | |||||||
3228 | { | ||||||
3229 | my %multiplier = (minute => 60, | ||||||
3230 | hour => 3600, | ||||||
3231 | day => 86400, | ||||||
3232 | week => 86400 * 7, | ||||||
3233 | month => 365.25 * 86400 / 12, | ||||||
3234 | year => 365.25 * 86400, | ||||||
3235 | ); | ||||||
3236 | # return a target time_t, or undef | ||||||
3237 | sub rss_newest_only_timet { | ||||||
3238 | 0 | 0 | 0 | my ($self) = @_; | |||
3239 | |||||||
3240 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3241 | 0 | 0 | if ($str =~ /^\s*(\d+)\s*(minute|hour|day|week|month|year)s?\s*$/) { | ||||
3242 | 0 | return time() - $1*$multiplier{$2}; | |||||
3243 | } | ||||||
3244 | } | ||||||
3245 | 0 | return undef; | |||||
3246 | } | ||||||
3247 | } | ||||||
3248 | |||||||
3249 | # return a number, or undef | ||||||
3250 | sub rss_newest_only_count { | ||||||
3251 | 0 | 0 | 0 | my ($self) = @_; | |||
3252 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3253 | 0 | 0 | if (Scalar::Util::looks_like_number($str)) { | ||||
3254 | ### rss_newest_only number: $str | ||||||
3255 | 0 | return $str; | |||||
3256 | } | ||||||
3257 | } | ||||||
3258 | 0 | return undef; | |||||
3259 | } | ||||||
3260 | |||||||
3261 | # return @items restricted or filtered by rss_newest_only | ||||||
3262 | sub rss_newest_only_items { | ||||||
3263 | 0 | 0 | 0 | my ($self, @items) = @_; | |||
3264 | |||||||
3265 | 0 | 0 | if (defined (my $count = $self->rss_newest_only_count)) { | ||||
3266 | 0 | 0 | if ($count == 0) { | ||||
3267 | # rss_newest_only=>0 means don't apply a newest | ||||||
3268 | 0 | return @items; | |||||
3269 | } | ||||||
3270 | 0 | my $before = scalar(@items); | |||||
3271 | 0 | require Sort::Key::Top; | |||||
3272 | 0 | 0 | @items = Sort::Key::Top::rnkeytop (sub { $self->item_to_timet($_) }, | ||||
3273 | 0 | $count, @items); | |||||
3274 | |||||||
3275 | 0 | my $after = scalar(@items); | |||||
3276 | 0 | 0 | if ($before != $after) { | ||||
3277 | 0 | $self->verbose (1, " rss_newest_only reduce by count from $before items to $after items"); | |||||
3278 | } | ||||||
3279 | 0 | return @items; | |||||
3280 | } | ||||||
3281 | |||||||
3282 | 0 | 0 | if (defined (my $target_timet = $self->rss_newest_only_timet)) { | ||||
3283 | 0 | my $before = scalar(@items); | |||||
3284 | 0 | @items = grep { my $got_timet = $self->item_to_timet($_); | |||||
0 | |||||||
3285 | 0 | 0 | ! defined $got_timet || $got_timet >= $target_timet } | ||||
3286 | @items; | ||||||
3287 | 0 | my $after = scalar(@items); | |||||
3288 | 0 | 0 | if ($before != $after) { | ||||
3289 | 0 | $self->verbose (1, " rss_newest_only reduce by age from $before to $after items"); | |||||
3290 | } | ||||||
3291 | 0 | return @items; | |||||
3292 | } | ||||||
3293 | |||||||
3294 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
3295 | 0 | die "rss2leafnode: unrecognised rss_newest_only: ",$str; | |||||
3296 | } | ||||||
3297 | 0 | return @items; | |||||
3298 | } | ||||||
3299 | |||||||
3300 | |||||||
3301 | #------------------------------------------------------------------------------ | ||||||
3302 | # fetch RSS | ||||||
3303 | |||||||
3304 | my $map_xmlns | ||||||
3305 | = { | ||||||
3306 | 'http://purl.org/rss/1.0/' => 'rss', | ||||||
3307 | 'http://www.w3.org/2005/Atom' => 'atom', | ||||||
3308 | 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf', | ||||||
3309 | 'http://purl.org/rss/1.0/modules/content/' => 'content', | ||||||
3310 | 'http://purl.org/rss/1.0/modules/slash/' => 'slash', | ||||||
3311 | 'http://purl.org/rss/1.0/modules/syndication/' => 'syn', | ||||||
3312 | 'http://purl.org/syndication/thread/1.0' => 'thr', | ||||||
3313 | 'http://wellformedweb.org/CommentAPI/' => 'wfw', | ||||||
3314 | 'http://www.w3.org/1999/xhtml' => 'xhtml', | ||||||
3315 | 'http://www.itunes.com/dtds/podcast-1.0.dtd' => 'itunes', | ||||||
3316 | 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner', | ||||||
3317 | |||||||
3318 | # http://www.rssboard.org/media-rss | ||||||
3319 | 'http://search.yahoo.com/mrss' => 'media', | ||||||
3320 | |||||||
3321 | 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo', | ||||||
3322 | 'http://www.georss.org/georss' => 'georss', | ||||||
3323 | 'http://www.pheedo.com/namespace/pheedo' => 'pheedo', | ||||||
3324 | 'http://api.twitter.com' => 'twitter', | ||||||
3325 | 'http://xmlns.com/foaf/0.1/' => 'foaf', | ||||||
3326 | 'http://status.net/ont/' => 'statusnet', | ||||||
3327 | 'http://rdfs.org/sioc/ns#' => 'sioc', | ||||||
3328 | 'http://www.slate.com' => 'slate', | ||||||
3329 | 'http://activitystrea.ms/spec/1.0/' => 'activity', | ||||||
3330 | 'http://ostatus.org/schema/1.0' => 'ostatus', | ||||||
3331 | |||||||
3332 | # http://tools.ietf.org/html/draft-snell-atompub-feed-index-10 | ||||||
3333 | 'http://purl.org/atompub/rank/1.0' => 're', | ||||||
3334 | |||||||
3335 | # per http://docs.jivesoftware.com/latest/documentation/rss.html#output | ||||||
3336 | 'http://www.jivesoftware.com/xmlns/jiveforums/rss' => 'jf', | ||||||
3337 | |||||||
3338 | # these two are different, but treat the same for now | ||||||
3339 | 'http://backend.userland.com/creativeCommonsRssModule'=>'creativeCommons', | ||||||
3340 | 'http://creativecommons.org/ns#' =>'creativeCommons', | ||||||
3341 | |||||||
3342 | # Common Alerts Protocol | ||||||
3343 | 'urn:oasis:names:tc:emergency:cap:1.1' => 'cap', | ||||||
3344 | |||||||
3345 | # central bank exchange rates format, | ||||||
3346 | # spec http://www.cbwiki.net/wiki/index.php/RSS-CBMain | ||||||
3347 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
3348 | 'http://www.cbwiki.net/wiki/index.php/Specification_1.1' => 'cb', | ||||||
3349 | |||||||
3350 | # earthquakes | ||||||
3351 | # eg. http://earthquake.usgs.gov/earthquakes/shakemap/rss.xml | ||||||
3352 | 'http://earthquake.usgs.gov/rss/1.0/' => 'eq', | ||||||
3353 | |||||||
3354 | 'http://purl.org/dc/elements/1.1/' => 'dc', | ||||||
3355 | 'http://purl.org/dc/terms/' => 'dcterms', | ||||||
3356 | |||||||
3357 | # purl.org might be supposed to be the home for wiki:, but it's a 404 | ||||||
3358 | # and usemod.com suggests its page instead | ||||||
3359 | # Spec at http://www.meatballwiki.org/wiki/ModWiki | ||||||
3360 | 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki', | ||||||
3361 | 'http://www.usemod.com/cgi-bin/mb.pl?ModWiki' => 'wiki', | ||||||
3362 | |||||||
3363 | # not sure if this is supposed to be necessary, but without it | ||||||
3364 | # "xml:lang" attributes are turned into "lang" | ||||||
3365 | 'http://www.w3.org/XML/1998/namespace' => 'xml', | ||||||
3366 | }; | ||||||
3367 | |||||||
3368 | sub twig_parse { | ||||||
3369 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
3370 | ### twig_parse() ... | ||||||
3371 | |||||||
3372 | # default "discard_spaces" chucks leading and trailing space on content, | ||||||
3373 | # which is usually a good thing | ||||||
3374 | # | ||||||
3375 | 0 | require XML::Twig; | |||||
3376 | 0 | XML::Twig->VERSION('3.34'); # for att_exists() | |||||
3377 | 0 | my $twig = XML::Twig->new (map_xmlns => $map_xmlns, | |||||
3378 | pretty_print => 'wrapped'); | ||||||
3379 | 0 | $twig->safe_parse ($xml); | |||||
3380 | 0 | my $err = $@; | |||||
3381 | ### $err | ||||||
3382 | |||||||
3383 | # Try to fix bad non-ascii chars by putting it through Encode::from_to(). | ||||||
3384 | # Encode::FB_DEFAULT substitutes U+FFFD when going to unicode, or question | ||||||
3385 | # mark "?" going to non-unicode. Mozilla does some sort of similar | ||||||
3386 | # liberal byte interpretation so as to at least display something from a | ||||||
3387 | # dodgy feed. | ||||||
3388 | # | ||||||
3389 | 0 | 0 | 0 | if ($err && $err =~ /not well-formed \(invalid token\) at (line \d+, column \d+, byte (\d+))/) { | |||
3390 | 0 | my $where = $1; | |||||
3391 | 0 | my $byte = ord(substr($xml,$2,1)); | |||||
3392 | 0 | 0 | if ($byte >= 128) { | ||||
3393 | 0 | 0 | my $charset = $twig->encoding // 'utf-8'; | ||||
3394 | 0 | $self->verbose (1, sprintf ("parse error, attempt re-code $charset for byte 0x%02X\n", $byte)); | |||||
3395 | 0 | require Encode; | |||||
3396 | 0 | my $recoded_xml = $xml; | |||||
3397 | 0 | Encode::from_to($recoded_xml, $charset, $charset, Encode::FB_DEFAULT()); | |||||
3398 | |||||||
3399 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3400 | 0 | 0 | if ($twig->safe_parse ($recoded_xml)) { | ||||
3401 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3402 | "Recoded bad bytes to charset $charset"); | ||||||
3403 | 0 | print __x("Feed {url}\n recoded {charset} to parse, expect substitutions for bad non-ascii\n ({where})\n", | |||||
3404 | url => $self->{'uri'}, | ||||||
3405 | charset => $charset, | ||||||
3406 | where => $where); | ||||||
3407 | 0 | undef $err; | |||||
3408 | } | ||||||
3409 | } | ||||||
3410 | } | ||||||
3411 | |||||||
3412 | # Or attempt to put it through XML::Liberal, if available. | ||||||
3413 | # | ||||||
3414 | 0 | 0 | if ($err) { | ||||
3415 | 0 | my $liberal_xml = $self->xml_liberal_correction($xml); | |||||
3416 | 0 | 0 | if (defined $liberal_xml) { | ||||
3417 | ### reparse xml liberal fixup with twig ... | ||||||
3418 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
3419 | 0 | 0 | if ($twig->safe_parse ($liberal_xml)) { | ||||
3420 | ### now ok ... | ||||||
3421 | 0 | $err = Text::Trim::trim($err); | |||||
3422 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
3423 | "XML::Liberal fixed: {error}", | ||||||
3424 | error => $err); | ||||||
3425 | 0 | print __x("Feed {url}\n parse error: {error}\n continuing with repairs by XML::Liberal\n", | |||||
3426 | url => $self->{'uri'}, | ||||||
3427 | error => $err); | ||||||
3428 | 0 | undef $err; | |||||
3429 | } | ||||||
3430 | } | ||||||
3431 | ### now err: $err | ||||||
3432 | } | ||||||
3433 | |||||||
3434 | 0 | 0 | if ($err) { | ||||
3435 | # XML::Parser seems to stick some spurious leading whitespace on the error | ||||||
3436 | 0 | $err = Text::Trim::trim($err); | |||||
3437 | |||||||
3438 | 0 | $self->verbose (1, __x("Parse error on URL {url}\n{error}", | |||||
3439 | url => $self->{'uri'}, | ||||||
3440 | error => $err)); | ||||||
3441 | 0 | return (undef, $err); | |||||
3442 | } | ||||||
3443 | |||||||
3444 | # Strip any explicit "rss:" or "atom:" namespace down to bare part. | ||||||
3445 | # Should be unambiguous and is easier than giving tag names both with and | ||||||
3446 | # without the namespace. Undocumented set_ns_as_default() might do this | ||||||
3447 | # ... or might not. | ||||||
3448 | # | ||||||
3449 | 0 | my $root = $twig->root; | |||||
3450 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'atom'); | |||||
3451 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'rss'); | |||||
3452 | |||||||
3453 | # somehow map_xmlns mangles default attributes like "decimals=...", prefer | ||||||
3454 | # to see them without rss: or atom: -- maybe | ||||||
3455 | # foreach my $child ($root->descendants_or_self) { | ||||||
3456 | # foreach my $attname ($child->att_names) { | ||||||
3457 | # if ($attname =~ /^(atom|rss):(.*)/) { | ||||||
3458 | # $child->change_att_name($attname, $2); | ||||||
3459 | # } | ||||||
3460 | # } | ||||||
3461 | # } | ||||||
3462 | |||||||
3463 | ### add xml base | ||||||
3464 | 0 | 0 | 0 | if (defined $self->{'uri'} && ! $root->att_exists('xml:base')) { | |||
3465 | 0 | $root->set_att ('xml:base', $self->{'uri'}); | |||||
3466 | } | ||||||
3467 | |||||||
3468 | ### success | ||||||
3469 | 0 | return ($twig, undef); | |||||
3470 | } | ||||||
3471 | |||||||
3472 | sub item_to_channel { | ||||||
3473 | 0 | 0 | 0 | my ($item) = @_; | |||
3474 | # parent for RSS or Atom, but sibling "channel" for RDF | ||||||
3475 | 0 | my $channel = $item->parent; | |||||
3476 | 0 | 0 | return ($channel->first_child('channel') | ||||
3477 | // $channel); | ||||||
3478 | } | ||||||
3479 | |||||||
3480 | # return a Message-ID string for this $item coming from $self->{'uri'} | ||||||
3481 | # | ||||||
3482 | sub item_to_msgid { | ||||||
3483 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3484 | |||||||
3485 | 0 | 0 | if (is_non_empty (my $id = $item->first_child_text('id'))) { | ||||
3486 | # Atom |
||||||
3487 | 0 | return $self->url_to_msgid ($id, $item->first_child_text('updated')); | |||||
3488 | } | ||||||
3489 | |||||||
3490 | 0 | my $guid; | |||||
3491 | 0 | my $isPermaLink = 0; | |||||
3492 | 0 | 0 | if (my $elt = $item->first_child('guid')) { | ||||
3493 | # ignore empty |
||||||
3494 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
3495 | 0 | 0 | if (is_non_empty (my $str = collapse_whitespace ($elt->text))) { | ||||
3496 | 0 | $guid = $str; | |||||
3497 | 0 | 0 | $isPermaLink = (lc($elt->att('isPermaLink') // 'true') eq 'true'); | ||||
3498 | } | ||||||
3499 | } | ||||||
3500 | |||||||
3501 | 0 | 0 | if ($isPermaLink) { # |
||||
3502 | 0 | return $self->url_to_msgid ($guid); | |||||
3503 | } | ||||||
3504 | 0 | 0 | if (my $link = item_yahoo_permalink ($item)) { | ||||
3505 | 0 | return $self->url_to_msgid ($link); | |||||
3506 | } | ||||||
3507 | 0 | 0 | if (defined $guid) { # |
||||
3508 | 0 | return $self->url_to_msgid ($self->{'uri'}, $guid); | |||||
3509 | } | ||||||
3510 | |||||||
3511 | # nothing in the item, use the feed url and MD5 of some fields which | ||||||
3512 | # will hopefully distinguish it from other items at this url | ||||||
3513 | 0 | $self->verbose (2, ' msgid from MD5'); | |||||
3514 | 0 | return $self->url_to_msgid | |||||
3515 | ($self->{'uri'}, | ||||||
3516 | md5_of_utf8 (join_non_empty ('', | ||||||
3517 | 0 | map {$item->first_child_text($_)} | |||||
3518 | qw(title | ||||||
3519 | author | ||||||
3520 | dc:creator | ||||||
3521 | description | ||||||
3522 | content | ||||||
3523 | link | ||||||
3524 | pubDate | ||||||
3525 | published | ||||||
3526 | updated | ||||||
3527 | )))); | ||||||
3528 | } | ||||||
3529 | # FIXME: is |
||||||
3530 | # |
||||||
3531 | @known{qw(/channel/item/guid | ||||||
3532 | /channel/item/id | ||||||
3533 | /channel/item/wordzilla:id | ||||||
3534 | /channel/item/slate:id | ||||||
3535 | )} = (); | ||||||
3536 | |||||||
3537 | # Return an "In-Reply-To:" value for $item, being a space-separated list of | ||||||
3538 | # Message-ID strings including angles <>, or undef if nothing. The message | ||||||
3539 | # ids match up to an Atom |
||||||
3540 | # | ||||||
3541 | # RFC 4685 has |
||||||
3542 | # reply to multiple originals. | ||||||
3543 | # | ||||||
3544 | # Eg. comment feeds under | ||||||
3545 | # http://wickedgooddinner.blogspot.com/feeds/posts/default | ||||||
3546 | # | ||||||
3547 | sub item_to_in_reply_to { | ||||||
3548 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3549 | |||||||
3550 | 0 | my @ids; | |||||
3551 | 0 | foreach my $elt ($item->children('thr:in-reply-to')) { | |||||
3552 | 0 | 0 | my $ref = ($elt->att('thr:ref') | ||||
0 | |||||||
0 | |||||||
3553 | // $elt->att('ref') | ||||||
3554 | // $elt->att('atom:ref') # comes out atom: under map_xmlns ... | ||||||
3555 | // next); | ||||||
3556 | # probably shouldn't be relative actually ... | ||||||
3557 | 0 | $ref = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $ref); | |||||
3558 | 0 | push @ids, $self->url_to_msgid ($ref); | |||||
3559 | } | ||||||
3560 | 0 | 0 | if (@ids) { | ||||
3561 | 0 | return join (' ', @ids); | |||||
3562 | } else { | ||||||
3563 | 0 | return undef; | |||||
3564 | } | ||||||
3565 | } | ||||||
3566 | @known{qw(/channel/item/thr:in-reply-to | ||||||
3567 | )} = (); | ||||||
3568 | |||||||
3569 | # Return a string of comma separated keywords per RFC1036 and RFC2822. | ||||||
3570 | # | ||||||
3571 | # RSS |
||||||
3572 | # that in as a bit of a fallback, being better than nothing for | ||||||
3573 | # classification. | ||||||
3574 | # | ||||||
3575 | # Atom |
||||||
3576 | # attribute being the displayable part. Have seen only the "term" attribute | ||||||
3577 | # though. | ||||||
3578 | # | ||||||
3579 | # |
||||||
3580 | # it in for more classification for now. Can have child |
||||||
3581 | # elements as sub-categories, but don't worry about them, haven't seen any | ||||||
3582 | # real ones, only the sample at | ||||||
3583 | # http://www.apple.com/itunes/podcasts/specs.html#example | ||||||
3584 | # | ||||||
3585 | # |
||||||
3586 | # |
||||||
3587 | # particularly informative. | ||||||
3588 | # | ||||||
3589 | # |
||||||
3590 | # should be in the keywords if it's also in the body text, but at least | ||||||
3591 | # offers a bit of classification in the headers. | ||||||
3592 | # | ||||||
3593 | # |
||||||
3594 | # want a bit of decoding. Not much used, but for instance | ||||||
3595 | # http://www.gdacs.org/xml/RSSTC.xml | ||||||
3596 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
3597 | # | ||||||
3598 | # How much value is there in the channel keywords? | ||||||
3599 | # | ||||||
3600 | { | ||||||
3601 | my $re = qr/^(category | ||||||
3602 | |itunes:category | ||||||
3603 | |cap:category | ||||||
3604 | |itunes:keywords | ||||||
3605 | |media:keywords | ||||||
3606 | |dc:subject | ||||||
3607 | |slash:section | ||||||
3608 | |slate:topic | ||||||
3609 | )$/x; | ||||||
3610 | sub item_to_keywords { | ||||||
3611 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3612 | 0 | my $channel = item_to_channel($item); | |||||
3613 | |||||||
3614 | 0 | return join_non_empty | |||||
3615 | (', ', | ||||||
3616 | List::MoreUtils::uniq | ||||||
3617 | 0 | (map { collapse_whitespace($_) } | |||||
3618 | 0 | 0 | map { split /,/ } | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
3619 | 0 | map { ($_->att('text') # itunes:category | |||||
3620 | // $_->att('itunes:text') # itunes:category | ||||||
3621 | // $_->att('atom:label') # atom |
||||||
3622 | // $_->att('label') # atom |
||||||
3623 | // $_->att('atom:term') # atom |
||||||
3624 | // $_->att('term') # atom |
||||||
3625 | // $_->text) } # other | ||||||
3626 | ($item->children($re), | ||||||
3627 | $channel->children($re), | ||||||
3628 | # |
||||||
3629 | 0 | map {$_->children('cb:keyword')} $item->children, | |||||
3630 | ))); | ||||||
3631 | } | ||||||
3632 | # maybe could show |
||||||
3633 | # too, for now just omit | ||||||
3634 | @known{qw(/channel/category | ||||||
3635 | /channel/itunes:category | ||||||
3636 | /channel/itunes:category/itunes:category | ||||||
3637 | |||||||
3638 | /channel/item/category | ||||||
3639 | /channel/item/itunes:keywords | ||||||
3640 | /channel/item/media:keywords | ||||||
3641 | /channel/item/slash:section | ||||||
3642 | /channel/item/slate:topic | ||||||
3643 | /channel/item/slate:section | ||||||
3644 | )} = (); | ||||||
3645 | } | ||||||
3646 | |||||||
3647 | { | ||||||
3648 | # Feturn a string for the "Importance:" header of RFC 1911, RFC 2156 | ||||||
3649 | # voice and X.400 messaging. Possible values 'high', 'normal', 'low'. | ||||||
3650 | # 'normal' is the header default, return undef in that case in the | ||||||
3651 | # interests of not junking up headers with defaults | ||||||
3652 | # | ||||||
3653 | my %cap_severity_high = (extreme => 1, | ||||||
3654 | severe => 1); | ||||||
3655 | my %cap_severity_normal = (moderate => 1); | ||||||
3656 | my %cap_severity_low = (minor => 1); | ||||||
3657 | |||||||
3658 | sub item_to_importance { | ||||||
3659 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3660 | |||||||
3661 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3662 | // ''); | ||||||
3663 | 0 | 0 | my $wiki_importance = ($item->first_child_trimmed_text('wiki:importance') | ||||
3664 | // ''); | ||||||
3665 | 0 | 0 | if ($cap_severity) { | ||||
3666 | 0 | $self->verbose (2, " CAP severity: ",$cap_severity); | |||||
3667 | 0 | $self->verbose (2, " Wiki importance: ",$wiki_importance); | |||||
3668 | } | ||||||
3669 | |||||||
3670 | 0 | 0 | if ($cap_severity_high{$cap_severity}) { | ||||
3671 | 0 | return 'high'; | |||||
3672 | } | ||||||
3673 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3674 | 0 | return undef; # default "normal" | |||||
3675 | } | ||||||
3676 | 0 | 0 | 0 | if ($cap_severity_low{$cap_severity} | |||
3677 | || $wiki_importance eq 'minor') { | ||||||
3678 | 0 | return 'low'; | |||||
3679 | } | ||||||
3680 | 0 | return undef; # unknown | |||||
3681 | } | ||||||
3682 | @known{qw(/channel/item/wiki:importance | ||||||
3683 | )} = (); | ||||||
3684 | } | ||||||
3685 | { | ||||||
3686 | # Return a string for the "Priority:" header of RFC 1327, RFC 2156. | ||||||
3687 | # Possible values 'urgent', 'normal', 'non-urgent'. | ||||||
3688 | # 'normal' is the header default, return undef in that case in the | ||||||
3689 | # interests of not junking up headers with defaults | ||||||
3690 | # | ||||||
3691 | # |
||||||
3692 | # for when response action should be taken. Is the |
||||||
3693 | # better indicator of transmission priority? | ||||||
3694 | # | ||||||
3695 | my %cap_severity_urgent = (extreme => 1, | ||||||
3696 | severe => 1); | ||||||
3697 | my %cap_severity_normal = (moderate => 1); | ||||||
3698 | |||||||
3699 | sub item_to_priority { | ||||||
3700 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3701 | |||||||
3702 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
3703 | // ''); | ||||||
3704 | |||||||
3705 | 0 | 0 | if ($cap_severity_urgent{$cap_severity}) { | ||||
3706 | 0 | return 'urgent'; | |||||
3707 | } | ||||||
3708 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
3709 | 0 | return undef; # default "normal" | |||||
3710 | } | ||||||
3711 | 0 | if (0) { # nothing for this yet | |||||
3712 | return 'non-urgent'; | ||||||
3713 | } | ||||||
3714 | 0 | return undef; # unknown | |||||
3715 | } | ||||||
3716 | } | ||||||
3717 | |||||||
3718 | # return a string for the slightly unofficial "Precedence:" header | ||||||
3719 | # might be able to identify lists gatewayed to RSS and give "list" for them | ||||||
3720 | # maybe "bulk" would suit low priority stuff | ||||||
3721 | # for now nothing | ||||||
3722 | # | ||||||
3723 | # sub item_to_precedence { | ||||||
3724 | # my ($self, $item) = @_; | ||||||
3725 | # return undef; # nothing | ||||||
3726 | # } | ||||||
3727 | |||||||
3728 | # return the host part of $self->{'uri'}, or "localhost" if none | ||||||
3729 | sub uri_to_host { | ||||||
3730 | 0 | 0 | 0 | my ($self) = @_; | |||
3731 | 0 | my $uri = $self->{'uri'}; | |||||
3732 | ### uri_to_host(): $uri | ||||||
3733 | 0 | 0 | return (non_empty ($uri && $uri->can('host') && $uri->host) | ||||
0 | |||||||
3734 | // 'localhost'); | ||||||
3735 | } | ||||||
3736 | |||||||
3737 | sub item_to_subject { | ||||||
3738 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3739 | |||||||
3740 | # Atom |
||||||
3741 | return | ||||||
3742 | 0 | 0 | (elt_to_rendered_line ($item->first_child('title')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
3743 | |||||||
3744 | # |
||||||
3745 | # present without a plain |
||||||
3746 | # | ||||||
3747 | // elt_to_rendered_line ($item->first_child('dc:title')) | ||||||
3748 | |||||||
3749 | # eg. https://archive.org/services/collection-rss.php has |
||||||
3750 | # in addition to plain |
||||||
3751 | # |
||||||
3752 | # | ||||||
3753 | // elt_to_rendered_line ($item->first_child('media:title')) | ||||||
3754 | |||||||
3755 | # |
||||||
3756 | # better than nothing. Not sure have ever actually seen |
||||||
3757 | # without |
||||||
3758 | # | ||||||
3759 | // elt_to_rendered_line ($item->first_child('dc:subject')) | ||||||
3760 | |||||||
3761 | // __('no subject')); | ||||||
3762 | } | ||||||
3763 | @known{qw(/channel/title | ||||||
3764 | /channel/dc:subject | ||||||
3765 | /channel/subtitle | ||||||
3766 | /channel/itunes:subtitle | ||||||
3767 | |||||||
3768 | /channel/item/dc:subject | ||||||
3769 | /channel/item/title | ||||||
3770 | /channel/item/media:title | ||||||
3771 | /channel/item/dc:title | ||||||
3772 | /channel/item/itunes:title | ||||||
3773 | /channel/item/itunes:subtitle --not-using-this-as-yet | ||||||
3774 | /channel/item/slate:menuline --copy-of-subject-it-seems | ||||||
3775 | /channel/item/slate:rubric --blog-title | ||||||
3776 | /channel/item/slate:blog --blog-title | ||||||
3777 | /channel/item/slate:legacy_url --same-as-link-it-seems | ||||||
3778 | )} = (); | ||||||
3779 | |||||||
3780 | |||||||
3781 | # return language code string for Content-Language, or undef | ||||||
3782 | # return is per RFC 1766, RFC 3066, RFC 4646 | ||||||
3783 | # | ||||||
3784 | # xml:lang is defined to be per RFC 4646, no mangling needed | ||||||
3785 | # RSS |
||||||
3786 | # |
||||||
3787 | # cf. I18N::LangTags if mangling might be needed one day | ||||||
3788 | # | ||||||
3789 | sub item_to_language { | ||||||
3790 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3791 | 0 | my $lang; | |||||
3792 | |||||||
3793 | 0 | 0 | if (my $elt = $item->first_child('content')) { | ||||
3794 | 0 | $lang = non_empty ($elt->att('xml:lang')); | |||||
3795 | } | ||||||
3796 | # Either |
||||||
3797 | # xml:lang="" tag, in the item itself or in channel, and maybe xml:lang in | ||||||
3798 | # toplevel |
||||||
3799 | # xml:lang, not a |
||||||
3800 | 0 | for ( ; $item; $item = $item->parent) { | |||||
3801 | 0 | 0 | $lang //= (non_empty ($item->first_child_trimmed_text | ||||
0 | |||||||
0 | |||||||
3802 | (qr/^((dc:)?language|twitter:lang)$/)) | ||||||
3803 | // non_empty ($item->att('xml:lang')) | ||||||
3804 | // next); | ||||||
3805 | } | ||||||
3806 | 0 | 0 | return ($lang // $self->{'resp'}->content_language); | ||||
3807 | } | ||||||
3808 | @known{qw(/channel/language | ||||||
3809 | /channel/dc:language | ||||||
3810 | /channel/twitter:lang | ||||||
3811 | /channel/item/language | ||||||
3812 | /channel/item/dc:language | ||||||
3813 | /channel/item/twitter:lang | ||||||
3814 | )} = (); | ||||||
3815 | |||||||
3816 | # return arrayref of copyright strings | ||||||
3817 | # Keep all of multiple rights/license/etc in the interests of preserving all | ||||||
3818 | # statements. | ||||||
3819 | sub item_to_copyright { | ||||||
3820 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3821 | 0 | my $channel = item_to_channel($item); | |||||
3822 | |||||||
3823 | # |
||||||
3824 | # suppress the latter in the presence of the former (dcterms: collapsed to | ||||||
3825 | # dc: by the map_xmlns). | ||||||
3826 | # | ||||||
3827 | # Atom |
||||||
3828 | # always plain text | ||||||
3829 | # | ||||||
3830 | 0 | my $re = qr/^(rights # Atom | |||||
3831 | |copyright # RSS, don't think entity-encoded html allowed there | ||||||
3832 | |dcterms:license | ||||||
3833 | |dc:rights | ||||||
3834 | |creativeCommons:licen[cs]e | ||||||
3835 | )$/x; | ||||||
3836 | # Atom sub-elem | ||||||
3837 | 0 | my @parents = ($item, $channel, $item->children('source')); | |||||
3838 | |||||||
3839 | 0 | my @strings; | |||||
3840 | 0 | foreach my $elt (map {$_->children($re)} @parents) { | |||||
0 | |||||||
3841 | 0 | push @strings, | |||||
3842 | join_non_empty(' ', | ||||||
3843 | elt_to_rendered_line($elt), | ||||||
3844 | # eg. |
||||||
3845 | $elt->att('rdf:resource')); | ||||||
3846 | } | ||||||
3847 | |||||||
3848 | # | ||||||
3849 | 0 | foreach my $link (map {$_->children('link')} @parents) { | |||||
0 | |||||||
3850 | ### link for copyright: $link->sprint | ||||||
3851 | 0 | 0 | 0 | if (($link->att('atom:rel')//$link->att('rel')//'') eq 'license') { | |||
0 | |||||||
3852 | 0 | 0 | push @strings, $link->att('atom:href')//$link->att('href'); | ||||
3853 | } | ||||||
3854 | } | ||||||
3855 | ### @strings | ||||||
3856 | 0 | return [ List::MoreUtils::uniq(grep {defined} @strings) ]; | |||||
0 | |||||||
3857 | } | ||||||
3858 | @known{qw(/channel/copyright | ||||||
3859 | /channel/rights | ||||||
3860 | /channel/dc:rights | ||||||
3861 | /channel/dc:license | ||||||
3862 | /channel/creativeCommons:licence | ||||||
3863 | /channel/creativeCommons:license | ||||||
3864 | /channel/item/dc:rights | ||||||
3865 | /channel/item/dc:license | ||||||
3866 | /channel/item/creativeCommons:licence | ||||||
3867 | /channel/item/creativeCommons:license | ||||||
3868 | )} = (); | ||||||
3869 | # /channel/item/media:credit --nothing-much-in-this-one | ||||||
3870 | |||||||
3871 | |||||||
3872 | # return string or undef | ||||||
3873 | sub item_to_generator { | ||||||
3874 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3875 | 0 | my $channel = item_to_channel($item); | |||||
3876 | 0 | my @strings; | |||||
3877 | |||||||
3878 | # both RSS and Atom use |
||||||
3879 | # Atom can include version="" and uri="" | ||||||
3880 | 0 | 0 | if (my $generator = $channel->first_child('generator')) { | ||||
3881 | 0 | push @strings, join_non_empty (' ', | |||||
3882 | $generator->text, | ||||||
3883 | $generator->att('atom:version'), | ||||||
3884 | $generator->att('version'), | ||||||
3885 | $generator->att('atom:uri'), | ||||||
3886 | $generator->att('uri')); | ||||||
3887 | } | ||||||
3888 | |||||||
3889 | # FIXME: is this bit right? | ||||||
3890 | # | ||||||
3891 | # source="<a href="http://nongnu.org/identica-mode/" rel="nofollow">Emacs Identica-mode</a>" | ||||||
3892 | # source_link="http://nongnu.org/identica-mode/"> | ||||||
3893 | # | ||||||
3894 | 0 | 0 | if (my $notice = $item->first_child('statusnet:notice_info')) { | ||||
3895 | 0 | 0 | if (defined (my $html = $notice->att('atom:source'))) { | ||||
3896 | 0 | push @strings, join_non_empty (' ', | |||||
3897 | html_to_rendered_line($html), | ||||||
3898 | $notice->att('atom:source_link')); | ||||||
3899 | } | ||||||
3900 | } | ||||||
3901 | |||||||
3902 | 0 | return collapse_whitespace (join_non_empty (', ', @strings)); | |||||
3903 | } | ||||||
3904 | @known{qw(/channel/item/statusnet:notice_info | ||||||
3905 | )} = (); | ||||||
3906 | |||||||
3907 | # return URL string or undef/empty | ||||||
3908 | sub item_to_feedburner { | ||||||
3909 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
3910 | 0 | my $channel = item_to_channel($item); | |||||
3911 | 0 | 0 | my $elt = $channel->first_child('feedburner:info') || return; | ||||
3912 | 0 | 0 | my $uri = $elt->att('uri') // return; | ||||
3913 | 0 | return URI->new_abs ($uri, 'http://feeds.feedburner.com/')->as_string; | |||||
3914 | } | ||||||
3915 | |||||||
3916 | # $elt is an Atom |
||||||
3917 | sub atom_content_flavour { | ||||||
3918 | 0 | 0 | 0 | my ($elt) = @_; | |||
3919 | 0 | 0 | if (! defined $elt) { return ''; } | ||||
0 | |||||||
3920 | 0 | 0 | my $type = ($elt->att('atom:type') // $elt->att('type')); | ||||
3921 | 0 | 0 | 0 | if ($elt->att('atom:src') || $elt->att('src')) { | |||
3922 | # |
||||||
3923 | 0 | return 'link'; | |||||
3924 | } | ||||||
3925 | 0 | 0 | 0 | if (! defined $type | |||
0 | |||||||
0 | |||||||
0 | |||||||
3926 | || $type eq 'html' | ||||||
3927 | || $type eq 'xhtml' | ||||||
3928 | || $type eq 'application/xhtml+xml' | ||||||
3929 | || $type =~ m{^text/}) { | ||||||
3930 | 0 | return 'body'; | |||||
3931 | } | ||||||
3932 | 0 | return 'attach'; | |||||
3933 | } | ||||||
3934 | |||||||
3935 | sub html_wrap_fragment { | ||||||
3936 | 0 | 0 | 0 | my ($item, $fragment, $language) = @_; | |||
3937 | 0 | 0 | my $charset = (is_ascii($fragment) ? 'us-ascii' : 'utf-8'); | ||||
3938 | 0 | my $base_uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base($item); | |||||
3939 | 0 | 0 | my $base_header = (defined $base_uri | ||||
3940 | ? " |
||||||
3941 | : ''); | ||||||
3942 | 0 | 0 | if (is_non_empty ($language)) { | ||||
3943 | 0 | $language = " lang=\"$Entitize{$language}\""; | |||||
3944 | } else { | ||||||
3945 | 0 | $language = ''; | |||||
3946 | } | ||||||
3947 | 0 | return (<<"HERE", $charset); | |||||
3948 | |||||||
3949 | |||||||
3950 | |||||||
3951 | |||||||
3952 | $base_header | ||||||
3953 | |||||||
3954 | $fragment | ||||||
3955 | |||||||
3956 | HERE | ||||||
3957 | } | ||||||
3958 | |||||||
3959 | # $self->{'rss_charset_override'}, if set, means the bytes are actually in | ||||||
3960 | # that charset. Enforce this by replacing the " | ||||||
3961 | # bytes. Do a decode() and re-encode() to cope with non-ascii like say | ||||||
3962 | # utf-16. | ||||||
3963 | # | ||||||
3964 | # XML::RSS::LibXML has an "encoding" option on its new(), but that's for | ||||||
3965 | # feed creation or something, a parse() still follows the tag. | ||||||
3966 | # | ||||||
3967 | sub enforce_rss_charset_override { | ||||||
3968 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
3969 | 0 | 0 | if (my $charset = $self->{'rss_charset_override'}) { | ||||
3970 | 0 | $xml = Encode::decode ($charset, $xml); | |||||
3971 | 0 | 0 | if ($xml =~ s/(<\?xml[^>]*encoding="?)([^">]+)/$1$charset/i) { | ||||
0 | |||||||
3972 | 0 | $self->verbose (2, "replace encoding=$2 tag with encoding=$charset"); | |||||
3973 | } elsif ($xml =~ s/(<\?xml[^?>]*)/$1 encoding="$charset"/i) { | ||||||
3974 | 0 | $self->verbose (2, "insert encoding=\"$charset\""); | |||||
3975 | } else { | ||||||
3976 | 0 | my $str = "\n"; | |||||
3977 | 0 | $self->verbose (2, "insert $str"); | |||||
3978 | 0 | $xml = $str . $xml; | |||||
3979 | } | ||||||
3980 | 0 | $self->verbose (3, "xml now:\n$xml\n"); | |||||
3981 | 0 | $xml = Encode::encode ($charset, $xml); | |||||
3982 | } | ||||||
3983 | 0 | return $xml; | |||||
3984 | } | ||||||
3985 | |||||||
3986 | # slightly experimental extract of "cap" fields as from | ||||||
3987 | # http://www.nws.noaa.gov/alerts-beta/ | ||||||
3988 | # http://www.weather.gov/alerts-beta/ca.php?x=0 | ||||||
3989 | sub item_common_alert_protocol { | ||||||
3990 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
3991 | 0 | my @fields; | |||||
3992 | 0 | foreach my $elt ($item->children(qr/^cap:/)) { | |||||
3993 | 0 | (my $field = $elt->name) =~ s/^cap://; | |||||
3994 | 0 | 0 | 0 | if ($field eq 'geocode' || $field eq 'parameter') { | |||
3995 | # dunno how to show these yet ... | ||||||
3996 | 0 | next; | |||||
3997 | } | ||||||
3998 | 0 | $known{'/channel/item/'.$elt->name} = undef; | |||||
3999 | |||||||
4000 | 0 | my $value = elt_to_rendered_line ($elt); | |||||
4001 | 0 | $value = Text::Trim::trim ($value); | |||||
4002 | 0 | 0 | if (is_non_empty ($value)) { | ||||
4003 | 0 | push @fields, [ "\u$field: ", $value ]; | |||||
4004 | } | ||||||
4005 | } | ||||||
4006 | 0 | 0 | if (! @fields) { | ||||
4007 | 0 | return ''; | |||||
4008 | } | ||||||
4009 | # FIXME: This $width padding doesn't come out in html, only in text. The | ||||||
4010 | # NOAA is Atom plain text, so that one is ok at least. | ||||||
4011 | 0 | my $width = max(map {length $_->[0]} @fields); | |||||
0 | |||||||
4012 | 0 | @fields = map { my $field = $_->[0]; | |||||
0 | |||||||
4013 | 0 | my $value = $_->[1]; | |||||
4014 | 0 | $field = sprintf ('%-*s', $width, $field); | |||||
4015 | 0 | $self->text_wrap ($value, $field) | |||||
4016 | } @fields; | ||||||
4017 | 0 | 0 | if ($want_html) { | ||||
4018 | 0 | return " \n" |
|||||
4019 | 0 | . join(" \n", map {$Entitize{$_}} @fields) |
|||||
4020 | . "\n\n"; | ||||||
4021 | } else { | ||||||
4022 | 0 | return "\n" | |||||
4023 | . join("\n", @fields) | ||||||
4024 | . "\n"; | ||||||
4025 | } | ||||||
4026 | } | ||||||
4027 | |||||||
4028 | sub item_unknowns { | ||||||
4029 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
4030 | ### item_unknowns() ... | ||||||
4031 | |||||||
4032 | 0 | my $xml = ''; | |||||
4033 | 0 | 0 | foreach my $elt (map {$_->tag eq 'media:group' # descend into media:group | ||||
0 | |||||||
4034 | ? $_->children : $_} | ||||||
4035 | $item->children) { | ||||||
4036 | 0 | 0 | next if $elt->tag =~ /^#/; # text | ||||
4037 | 0 | 0 | next if App::RSS2Leafnode::XML::Twig::Other::elt_is_empty($elt); | ||||
4038 | 0 | my $path = $elt->path; | |||||
4039 | 0 | $path =~ s{^/(rss|channel)/channel}{/channel}; | |||||
4040 | 0 | $path =~ s{^/(feed|rdf:RDF)}{/channel}; | |||||
4041 | 0 | $path =~ s{^/channel/entry}{/channel/item}; | |||||
4042 | 0 | 0 | next if $path =~ m{/xhtml}; | ||||
4043 | 0 | 0 | next if $path =~ m{^/channel/item/(description|content:encoded)/}; | ||||
4044 | 0 | 0 | next if exists $known{$path}; | ||||
4045 | ### unknown path: $path | ||||||
4046 | |||||||
4047 | 0 | require Text::Wrap; | |||||
4048 | 0 | my $part = do { | |||||
4049 | 0 | local $Text::Wrap::columns = $self->{'render_width'} + 1 + 4; | |||||
4050 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long words | |||||
4051 | 0 | local $Text::Wrap::unexpand = 0; # no tabs in output | |||||
4052 | 0 | $elt->sprint | |||||
4053 | }; | ||||||
4054 | 0 | $part =~ s/^ //mg; # indentation from element depth | |||||
4055 | 0 | $part =~ s/^\n+//; # leading blank lines | |||||
4056 | 0 | $xml .= $part; | |||||
4057 | } | ||||||
4058 | 0 | 0 | if ($xml eq '') { | ||||
4059 | 0 | return ''; | |||||
4060 | } | ||||||
4061 | ### $xml | ||||||
4062 | |||||||
4063 | 0 | 0 | if ($want_html) { | ||||
4064 | 0 | return "\n \n" . __('Further feed XML:') . " |
|||||
4065 | . "$Entitize{$xml}\n\n"; |
||||||
4066 | } else { | ||||||
4067 | 0 | return "\n" . __('Further feed XML:') . "\n" . $xml; | |||||
4068 | } | ||||||
4069 | } | ||||||
4070 | |||||||
4071 | @known{qw(/channel/item/media:group/media:title | ||||||
4072 | /channel/item/media:group/media:description | ||||||
4073 | /channel/item/media:group/media:credit | ||||||
4074 | /channel/item/media:group/media:player | ||||||
4075 | /channel/item/media:group/media:thumbnail | ||||||
4076 | /channel/item/media:group/media:content | ||||||
4077 | /channel/item/media:group/media:copyright | ||||||
4078 | |||||||
4079 | --ENHANCE-ME--nothing-for-these-yet | ||||||
4080 | /channel/item/media:group/media:category | ||||||
4081 | /channel/item/media:group/media:rating | ||||||
4082 | )} = (); # hash slice | ||||||
4083 | |||||||
4084 | sub media_group_to_html { | ||||||
4085 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
4086 | ### media_group_to_html(): "$group" | ||||||
4087 | |||||||
4088 | 0 | my $ret = " \n"; |
|||||
4089 | 0 | my @lines; | |||||
4090 | |||||||
4091 | 0 | foreach my $elt ($group->children('media:title'), | |||||
4092 | $group->children('media:description')) { | ||||||
4093 | 0 | push @lines, elt_to_html($elt); | |||||
4094 | } | ||||||
4095 | |||||||
4096 | 0 | foreach my $elt ($group->children('media:credit')) { | |||||
4097 | 0 | my $html = elt_to_html($elt); | |||||
4098 | 0 | 0 | if (defined (my $role = non_empty($elt->att('role')))) { | ||||
4099 | 0 | $html .= " ($Entitize{$role})"; | |||||
4100 | } | ||||||
4101 | 0 | push @lines, $html; | |||||
4102 | } | ||||||
4103 | 0 | foreach my $elt ($group->children('media:player'), | |||||
4104 | $group->children('media:thumbnail'), | ||||||
4105 | $group->children('media:content')) { | ||||||
4106 | 0 | 0 | my $url = $elt->att('url') // next; | ||||
4107 | 0 | my $abs_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri | |||||
4108 | ($group, $url); | ||||||
4109 | |||||||
4110 | 0 | my $html = " | |||||
4111 | 0 | 0 | if (defined (my $type = non_empty($elt->att('type')))) { | ||||
4112 | 0 | $html .= " type=\"$Entitize{$type}\""; | |||||
4113 | } | ||||||
4114 | 0 | 0 | if (defined (my $lang = non_empty($elt->att('lang')))) { | ||||
4115 | 0 | $html .= " hreflang=\"$Entitize{$lang}\""; | |||||
4116 | } | ||||||
4117 | 0 | $html .= ">$Entitize{$url}$url"; | |||||
4118 | { | ||||||
4119 | 0 | my @paren; | |||||
0 | |||||||
4120 | 0 | 0 | if (defined (my $size = non_empty($elt->att('fileSize')))) { | ||||
4121 | 0 | push @paren, $self->format_size_in_bytes($size); | |||||
4122 | } | ||||||
4123 | 0 | 0 | if (defined (my $duration = non_empty($elt->att('duration')))) { | ||||
4124 | 0 | 0 | if ($duration !~ /:/) { | ||||
4125 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
4126 | duration => $duration); | ||||||
4127 | } | ||||||
4128 | 0 | push @paren, $duration; | |||||
4129 | } | ||||||
4130 | 0 | 0 | if (@paren) { | ||||
4131 | 0 | $html .= $Entitize{' (' . join(', ',@paren). ')'}; | |||||
4132 | } | ||||||
4133 | } | ||||||
4134 | 0 | $html .= "\n"; | |||||
4135 | 0 | push @lines, $html; | |||||
4136 | } | ||||||
4137 | |||||||
4138 | 0 | foreach my $elt ($group->children('media:copyright')) { | |||||
4139 | 0 | push @lines, "Copyright: ".elt_to_html($elt); | |||||
4140 | } | ||||||
4141 | |||||||
4142 | ### total lines: scalar(@lines) | ||||||
4143 | 0 | return " \n" . join(" |
|||||
4144 | } | ||||||
4145 | |||||||
4146 | sub elt_to_html { | ||||||
4147 | 0 | 0 | 0 | my ($elt) = @_; | |||
4148 | 0 | 0 | defined $elt or return; | ||||
4149 | |||||||
4150 | 0 | my $type = elt_content_type ($elt); | |||||
4151 | 0 | 0 | if ($type eq 'xhtml') { | ||||
4152 | 0 | return elt_xhtml_to_html($elt); | |||||
4153 | } | ||||||
4154 | 0 | my $str = elt_subtext($elt); | |||||
4155 | 0 | 0 | if ($type eq 'html') { | ||||
4156 | 0 | return $str; | |||||
4157 | } else { | ||||||
4158 | 0 | return $Entitize{$str}; | |||||
4159 | } | ||||||
4160 | } | ||||||
4161 | |||||||
4162 | # $body construction below | ||||||
4163 | @known{qw(/channel/item/description | ||||||
4164 | /channel/item/dc:description | ||||||
4165 | /channel/item/itunes:summary | ||||||
4166 | /channel/item/content:encoded | ||||||
4167 | /channel/item/summary | ||||||
4168 | )} = (); | ||||||
4169 | |||||||
4170 | # $item is an XML::Twig::Elt | ||||||
4171 | # | ||||||
4172 | sub fetch_rss_process_one_item { | ||||||
4173 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4174 | 0 | my $subject = $self->item_to_subject ($item); | |||||
4175 | 0 | $self->verbose (1, ' ', __x('item: {subject}', subject => $subject)); | |||||
4176 | |||||||
4177 | 0 | my $msgid = $self->item_to_msgid ($item); | |||||
4178 | 0 | my $new = 0; | |||||
4179 | |||||||
4180 | 0 | 0 | if (! $self->nntp_message_id_exists ($msgid)) { | ||||
4181 | 0 | my $channel = item_to_channel($item); | |||||
4182 | 0 | my ($from, @from_links) = $self->item_to_from($item); | |||||
4183 | 0 | my @links = ($self->item_to_links ($item), | |||||
4184 | @from_links); | ||||||
4185 | |||||||
4186 | # For comments feeds show "Re: Foo" as the subject. Haven't seen a | ||||||
4187 | # comments feed with anything useful in the |
||||||
4188 | # including it at the start of the message body if it was any good. | ||||||
4189 | # | ||||||
4190 | # | ||||||
4191 | # http://www.netzpolitik.org/feed/ has |
||||||
4192 | # just "Von: Foo" where Foo is the poster's name. | ||||||
4193 | # | ||||||
4194 | # my $dummy = $self->DUMMY_EMAIL_ADDRESS; | ||||||
4195 | # if ($from =~ /(.*) <\Q$dummy\E>$/ | ||||||
4196 | # && $subject eq "Von: $1") { | ||||||
4197 | # $subject = $self->{'getting_rss_comments'}; | ||||||
4198 | # } | ||||||
4199 | # | ||||||
4200 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4201 | 0 | $subject = $self->{'getting_rss_comments'}; | |||||
4202 | } | ||||||
4203 | |||||||
4204 | 0 | my $list_post = googlegroups_link_email(@links); | |||||
4205 | 0 | 0 | my $precedence = (defined $list_post ? 'list' : undef); | ||||
4206 | 0 | my $language = $self->item_to_language($item); | |||||
4207 | |||||||
4208 | # RSS |
||||||
4209 | # http://www.w3.org/TR/REC-PICS-labels | ||||||
4210 | # ENHANCE-ME: Maybe transform |
||||||
4211 | # PICS too maybe, unless it only applies to the enclosure as such. Maybe | ||||||
4212 | # |
||||||
4213 | 0 | my $pics_label = collapse_whitespace ($channel->first_child_text('rating')); | |||||
4214 | |||||||
4215 | # Crib: an undef value for a header means omit that header, which is good | ||||||
4216 | # for say the merely optional "Content-Language" | ||||||
4217 | # | ||||||
4218 | # there can be multiple "feed" links from Atom ... | ||||||
4219 | # 'X-RSS-Feed-Link:' => $channel->{'link'}, | ||||||
4220 | # | ||||||
4221 | 0 | my %headers | |||||
4222 | = ('Path:' => scalar ($self->uri_to_host), | ||||||
4223 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
4224 | From => $from, | ||||||
4225 | Subject => $subject, | ||||||
4226 | Keywords => scalar ($self->item_to_keywords($item)), | ||||||
4227 | Date => scalar ($self->item_to_date($item)), | ||||||
4228 | 'In-Reply-To:' => scalar ($self->item_to_in_reply_to($item)), | ||||||
4229 | References => $self->{'References:'}, | ||||||
4230 | 'Message-ID' => $msgid, | ||||||
4231 | 'Content-Language:' => $language, | ||||||
4232 | 'Importance:' => scalar ($self->item_to_importance($item)), | ||||||
4233 | 'Priority:' => scalar ($self->item_to_priority($item)), | ||||||
4234 | 'Face:' => scalar ($self->item_to_face($item)), | ||||||
4235 | 'List-Post:' => $list_post, | ||||||
4236 | 'Precedence:' => $precedence, | ||||||
4237 | 'PICS-Label:' => $pics_label, | ||||||
4238 | 'X-Copyright:' => scalar ($self->item_to_copyright($item)), | ||||||
4239 | 'X-RSS-URL:' => scalar ($self->{'uri'}->as_string), | ||||||
4240 | 'X-RSS-Feedburner:' => scalar ($self->item_to_feedburner($item)), | ||||||
4241 | 'X-RSS-Generator:' => scalar ($self->item_to_generator($item)), | ||||||
4242 | ); | ||||||
4243 | |||||||
4244 | 0 | my $attach_elt; | |||||
4245 | |||||||
4246 | # |
||||||
4247 | # a copy of |
||||||
4248 | # | ||||||
4249 | # ENHANCE-ME: |
||||||
4250 | # as well as |
||||||
4251 | # | ||||||
4252 | my $body = ( | ||||||
4253 | # |
||||||
4254 | # |
||||||
4255 | $item->first_child('content:encoded') | ||||||
4256 | || $item->first_child('description') | ||||||
4257 | || $item->first_child('dc:description') | ||||||
4258 | || $item->first_child('itunes:summary') | ||||||
4259 | 0 | 0 | || do { | ||||
4260 | # Atom spec is for no more than one |
||||||
4261 | # Exclude "link", and leave "attach" to code below. | ||||||
4262 | my $elt = $item->first_child('content'); | ||||||
4263 | my $flavour = atom_content_flavour($elt); | ||||||
4264 | ($flavour eq 'link' ? undef | ||||||
4265 | : $flavour eq 'attach' ? do { $attach_elt = $elt; undef } | ||||||
4266 | : $elt) | ||||||
4267 | } | ||||||
4268 | || $item->first_child('summary')); # Atom | ||||||
4269 | |||||||
4270 | 0 | my $body_type = elt_content_type ($body); | |||||
4271 | 0 | $self->verbose (3, ' body_type from elt: ', $body_type); | |||||
4272 | 0 | my $body_charset = 'utf-8'; | |||||
4273 | 0 | my $body_base_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base ($body); | |||||
4274 | 0 | 0 | if (! defined $body_type) { # no $body element at all | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4275 | 0 | $body = ''; | |||||
4276 | 0 | $body_type = 'text/plain'; | |||||
4277 | |||||||
4278 | } elsif ($body_type eq 'xhtml') { # Atom | ||||||
4279 | 0 | $body = elt_xhtml_to_html ($body); | |||||
4280 | 0 | $body_type = 'html'; | |||||
4281 | |||||||
4282 | } elsif ($body_type eq 'html') { # RSS or Atom | ||||||
4283 | 0 | $body = elt_subtext($body); | |||||
4284 | |||||||
4285 | } elsif ($body_type eq 'text') { # Atom 'text' to be flowed | ||||||
4286 | # should be text-only, no sub-elements, but extract sub-elements to | ||||||
4287 | # cope with dodgy feeds with improperly escaped html etc | ||||||
4288 | 0 | $body = $self->text_wrap (elt_subtext ($body)); | |||||
4289 | 0 | $body_type = 'text/plain'; | |||||
4290 | } elsif ($body_type =~ m{^text/}) { # Atom mime text type | ||||||
4291 | 0 | $body = elt_subtext ($body); | |||||
4292 | |||||||
4293 | } else { # Atom base64 something | ||||||
4294 | 0 | $body = MIME::Base64::decode ($body->text); | |||||
4295 | 0 | $body_charset = undef; | |||||
4296 | } | ||||||
4297 | 0 | 0 | $self->verbose (3, " body: $body_type charset=", | ||||
4298 | $body_charset//'undef', "\n", | ||||||
4299 | "$body\n"); | ||||||
4300 | |||||||
4301 | 0 | 0 | my $body_is_html = ($body_type eq 'html'|| $body_type eq 'text/html'); | ||||
4302 | 0 | 0 | my $links_want_html = ($body_is_html && ! $self->{'render'}); | ||||
4303 | 0 | 0 | $self->verbose (3, " links_want_html: ", | ||||
4304 | ($links_want_html ? "yes" : "no")); | ||||||
4305 | |||||||
4306 | # sort downloadables to the start, then by "priority" | ||||||
4307 | 1 | 1 | 1208 | use sort 'stable'; | |||
1 | 636 | ||||||
1 | 6 | ||||||
4308 | 0 | 0 | 0 | @links = sort {($b->{'download'}||0) <=> ($a->{'download'}||0) | |||
0 | 0 | ||||||
0 | |||||||
0 | |||||||
4309 | || ($b->{'priority'}||0) <=> ($a->{'priority'}||0)} | ||||||
4310 | @links; | ||||||
4311 | 0 | 0 | my $links_str = ($links_want_html | ||||
4312 | ? links_to_html(@links) | ||||||
4313 | : links_to_text(@links)); | ||||||
4314 | 0 | $links_str .= $self->item_common_alert_protocol($item, $links_want_html); | |||||
4315 | 0 | my @parts; | |||||
4316 | |||||||
4317 | # |
||||||
4318 | { | ||||||
4319 | 0 | my $content = join ("\n", | |||||
0 | |||||||
4320 | 0 | map {$self->media_group_to_html($_)} | |||||
4321 | $item->children('media:group')); | ||||||
4322 | 0 | 0 | if (is_non_empty($content)) { | ||||
4323 | 0 | ($content, my $charset) = html_wrap_fragment ($item, $content); | |||||
4324 | 0 | my $content_type = 'text/html'; | |||||
4325 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
4326 | = $self->render_maybe ($content, $content_type, $charset, | ||||||
4327 | $body_base_url); | ||||||
4328 | ### media group content: $content | ||||||
4329 | 0 | 0 | if ($content_type eq 'text/plain') { | ||||
4330 | 0 | $links_str .= $content; | |||||
4331 | } else { | ||||||
4332 | 0 | $content = Encode::encode ($charset, $content); | |||||
4333 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4334 | Type => $content_type, | ||||||
4335 | Charset => $charset, | ||||||
4336 | Data => $content); | ||||||
4337 | } | ||||||
4338 | } | ||||||
4339 | } | ||||||
4340 | |||||||
4341 | 0 | 0 | if (is_non_empty(my $content | ||||
4342 | = $self->item_unknowns($item, $links_want_html))) { | ||||||
4343 | 0 | 0 | my $content_type = ($links_want_html ? 'text/html' : 'text/plain'); | ||||
4344 | 0 | 0 | if (@parts) { | ||||
4345 | 0 | 0 | my $charset = (is_ascii($content) ? 'us-ascii' : 'utf-8'); | ||||
4346 | 0 | $content = Encode::encode ($charset, $content); | |||||
4347 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
4348 | Type => $content_type, | ||||||
4349 | Charset => $charset, | ||||||
4350 | Data => $content); | ||||||
4351 | } else { | ||||||
4352 | 0 | $links_str .= $content; | |||||
4353 | } | ||||||
4354 | } | ||||||
4355 | |||||||
4356 | 0 | 0 | if ($self->{'rss_get_links'}) { | ||||
4357 | 0 | foreach my $l (@links) { | |||||
4358 | 0 | 0 | next if ! $l->{'download'}; | ||||
4359 | 0 | my $url = $l->{'uri'}; | |||||
4360 | 0 | $self->verbose (1, ' ', __x('link: "{name}" {url}', | |||||
4361 | name => $l->{'name'}, | ||||||
4362 | url => $url)); | ||||||
4363 | 0 | require HTTP::Request; | |||||
4364 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4365 | 0 | my $resp = $self->ua->request($req); | |||||
4366 | 0 | $resp = $self->aireview_follow ($url, $resp); | |||||
4367 | |||||||
4368 | 0 | 0 | if (! $resp->is_success) { | ||||
4369 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
4370 | url => $l->{'uri'}, | ||||||
4371 | status => $resp->status_line); | ||||||
4372 | 0 | my $msg = __x("Cannot download link {url}\n {status}", | |||||
4373 | url => $l->{'uri'}, | ||||||
4374 | status => $resp->status_line); | ||||||
4375 | 0 | 0 | if ($links_want_html) { | ||||
4376 | 0 | $msg = $Entitize{$msg}; | |||||
4377 | 0 | $msg =~ s/\n/ /; |
|||||
4378 | 0 | $links_str .= " $msg\n \n"; |
|||||
4379 | } else { | ||||||
4380 | 0 | $links_str .= "\n$msg\n"; | |||||
4381 | } | ||||||
4382 | 0 | next; | |||||
4383 | } | ||||||
4384 | |||||||
4385 | # suspect little value in a description when inlined | ||||||
4386 | # 'Content-Description:' => mimewords_non_ascii($l->{'title'}) | ||||||
4387 | # favicon used for Face if nothing in the item | ||||||
4388 | # | ||||||
4389 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
4390 | 0 | 0 | $headers{'Face:'} ||= $self->http_resp_to_face($resp); | ||||
4391 | 0 | $self->http_resp_extract_main($resp); | |||||
4392 | 0 | push @parts, $self->mime_part_from_response($resp); | |||||
4393 | } | ||||||
4394 | } | ||||||
4395 | 0 | 0 | 0 | if ($links_want_html && $body_type eq 'html') { | |||
4396 | # append to html fragment | ||||||
4397 | 0 | $body .= $links_str; | |||||
4398 | 0 | undef $links_str; | |||||
4399 | } | ||||||
4400 | |||||||
4401 | 0 | 0 | if ($body_type eq 'html') { | ||||
4402 | 0 | ($body, $body_charset) = html_wrap_fragment ($item, $body, $language); | |||||
4403 | 0 | $body_type = 'text/html'; | |||||
4404 | } | ||||||
4405 | 0 | 0 | if (defined $body_charset) { | ||||
4406 | 0 | $body = Encode::encode ($body_charset, $body); | |||||
4407 | } | ||||||
4408 | |||||||
4409 | 0 | ($body, $body_type, $body_charset) | |||||
4410 | = $self->render_maybe ($body, $body_type, $body_charset, $body_base_url); | ||||||
4411 | |||||||
4412 | 0 | 0 | if ($body_type eq 'text/plain') { | ||||
4413 | # remove trailing whitespace from any text | ||||||
4414 | 0 | $body =~ s/\s+$//; | |||||
4415 | 0 | $body .= "\n"; | |||||
4416 | |||||||
4417 | 0 | 0 | if (! $links_want_html) { | ||||
4418 | # append to text/plain, either atom type=text or rendered html | ||||||
4419 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4420 | 0 | $links_str = Encode::encode ($body_charset, $links_str); | |||||
4421 | 0 | $body .= "\n$links_str\n"; | |||||
4422 | } | ||||||
4423 | 0 | undef $links_str; | |||||
4424 | } | ||||||
4425 | } | ||||||
4426 | |||||||
4427 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
4428 | 0 | my $links_type; | |||||
4429 | my $links_charset; | ||||||
4430 | 0 | 0 | if ($links_want_html) { | ||||
4431 | 0 | $links_type = 'text/html'; | |||||
4432 | 0 | ($links_str, $links_charset) = html_wrap_fragment ($item, $links_str); | |||||
4433 | } else { | ||||||
4434 | 0 | $links_type = 'text/plain'; | |||||
4435 | 0 | 0 | $links_charset = (is_ascii($links_str) ? 'us-ascii' : 'utf-8'); | ||||
4436 | } | ||||||
4437 | 0 | $links_str = Encode::encode ($links_charset, $links_str); | |||||
4438 | 0 | unshift @parts, $self->mime_build ({}, | |||||
4439 | Type => $links_type, | ||||||
4440 | Encoding => $links_charset, | ||||||
4441 | Data => $links_str); | ||||||
4442 | } | ||||||
4443 | |||||||
4444 | |||||||
4445 | 0 | my $top = $self->mime_build (\%headers, | |||||
4446 | Top => 1, | ||||||
4447 | Type => $body_type, | ||||||
4448 | Charset => $body_charset, | ||||||
4449 | Data => $body); | ||||||
4450 | |||||||
4451 | # Atom |
||||||
4452 | 0 | 0 | if ($attach_elt) { | ||||
4453 | # ENHANCE-ME: this decodes base64 from the xml and then re-encodes for | ||||||
4454 | # the mime, is it possible to pass straight in? | ||||||
4455 | 0 | 0 | unshift @parts, $self->mime_build | ||||
4456 | ({ 'Content-Location:' => $self->{'uri'}->as_string }, | ||||||
4457 | Type => scalar ($attach_elt->att('atom:type') | ||||||
4458 | // $attach_elt->att('type')), | ||||||
4459 | Encoding => 'base64', | ||||||
4460 | Data => MIME::Base64::decode($attach_elt->text)); | ||||||
4461 | } | ||||||
4462 | |||||||
4463 | 0 | $self->verbose (2, 'parts count: ',scalar(@parts)); | |||||
4464 | 0 | foreach my $part (@parts) { | |||||
4465 | 0 | $top->make_multipart; | |||||
4466 | 0 | $top->add_part ($part); | |||||
4467 | } | ||||||
4468 | |||||||
4469 | 0 | mime_entity_lines($top); | |||||
4470 | 0 | 0 | $self->nntp_post($top) || return 0; | ||||
4471 | 0 | $self->verbose (1, ' ', __('posted')); | |||||
4472 | 0 | $new++; | |||||
4473 | } | ||||||
4474 | |||||||
4475 | # ENHANCE-ME: check the replies count to see if more to fetch | ||||||
4476 | 0 | 0 | if ($self->{'rss_get_comments'}) { | ||||
4477 | 0 | my ($comments_rss_url, $comments_count) | |||||
4478 | = $self->item_to_comments_rss($item); | ||||||
4479 | ### rss_get_comments: $comments_rss_url, $comments_count | ||||||
4480 | 0 | 0 | if (defined $comments_rss_url) { | ||||
4481 | |||||||
4482 | # ENHANCE-ME: There's also a thr:updated in RFC 4685, but haven't seen | ||||||
4483 | # that ever actually used. | ||||||
4484 | 0 | my $status = $self->status_geturl ($comments_rss_url); | |||||
4485 | 0 | 0 | 0 | if (defined $status->{'comments_count'} | |||
0 | |||||||
4486 | && defined $comments_count | ||||||
4487 | && $status->{'comments_count'} == $comments_count) { | ||||||
4488 | 0 | $self->verbose (1, ' ', __x('comments count unchanged: {count}', | |||||
4489 | count => $comments_count)); | ||||||
4490 | |||||||
4491 | } else { | ||||||
4492 | 0 | local $self->{'rss_get_links'} = 0; | |||||
4493 | 0 | local $self->{'rss_get_comments'} = 0; | |||||
4494 | 0 | local $self->{'comments_count'} = $comments_count; | |||||
4495 | # "Re:" is not translated, variants of that are very annoying | ||||||
4496 | 0 | local $self->{'getting_rss_comments'} = "Re: $subject"; | |||||
4497 | 0 | local $self->{'References:'} = $msgid; | |||||
4498 | 0 | $new += fetch_rss ($self, $self->{'nntp_group'}, $comments_rss_url); | |||||
4499 | } | ||||||
4500 | } | ||||||
4501 | } | ||||||
4502 | 0 | return $new; | |||||
4503 | } | ||||||
4504 | |||||||
4505 | sub item_to_comments_rss { | ||||||
4506 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
4507 | 0 | my ($url, $url_elt); | |||||
4508 | |||||||
4509 | # Atom | ||||||
4510 | # href='http:/...' /> | ||||||
4511 | 0 | foreach my $elt ($item->children('link')) { | |||||
4512 | 0 | 0 | my $rel = ($elt->att('rel') | ||||
0 | |||||||
4513 | // $elt->att('atom:rel') | ||||||
4514 | // next); | ||||||
4515 | 0 | 0 | $rel eq 'replies' or next; | ||||
4516 | 0 | 0 | $self->atom_link_is_rss($elt) or next; | ||||
4517 | 0 | 0 | my $href = ($elt->att('href') | ||||
4518 | // $elt->att('atom:href')); | ||||||
4519 | 0 | 0 | if (is_non_empty ($href)) { | ||||
4520 | 0 | $url = $href; | |||||
4521 | 0 | $elt = $url_elt; | |||||
4522 | } | ||||||
4523 | } | ||||||
4524 | |||||||
4525 | # |
||||||
4526 | # it appeared in the spec page as wfw:commentRSS, so ignore case | ||||||
4527 | 0 | 0 | if (! defined $url) { | ||||
4528 | 0 | my $u = $item->first_child_trimmed_text (qr/^wfw:commentRss$/i); | |||||
4529 | 0 | 0 | if (is_non_empty ($u)) { | ||||
4530 | 0 | $url = $u; | |||||
4531 | } | ||||||
4532 | } | ||||||
4533 | |||||||
4534 | 0 | 0 | return ($url, | ||||
4535 | (defined($url) && $self->item_elt_comments_count($item,$url_elt))); | ||||||
4536 | } | ||||||
4537 | |||||||
4538 | # |
||||||
4539 | # are which are replies, there's no comments link as such for it to refer | ||||||
4540 | # to, it seems | ||||||
4541 | sub item_elt_comments_count { | ||||||
4542 | 0 | 0 | 0 | my ($self, $item, $elt) = @_; | |||
4543 | 0 | 0 | return (($elt && $elt->att('thr:count')) | ||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
4544 | // ($elt && $elt->att('count')) | ||||||
4545 | // ($elt && $elt->att('atom:count')) | ||||||
4546 | // non_empty ($item->first_child_trimmed_text('thr:total')) | ||||||
4547 | // non_empty ($item->first_child_trimmed_text('slash:comments'))); | ||||||
4548 | } | ||||||
4549 | @known{qw(/channel/item/jf:replyCount | ||||||
4550 | )} = (); | ||||||
4551 | |||||||
4552 | # $group is a string, the name of a local newsgroup | ||||||
4553 | # $url is a string, an RSS feed to be read | ||||||
4554 | # | ||||||
4555 | sub fetch_rss { | ||||||
4556 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
4557 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
0 | |||||||
4558 | 0 | $self->verbose (2, "fetch_rss: $group $url"); | |||||
4559 | |||||||
4560 | 0 | my $group_uri = URI->new($group,'news'); | |||||
4561 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
4562 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
4563 | 0 | 0 | $self->nntp_group_check($group) or return 0; | ||||
4564 | |||||||
4565 | # an in-memory cookie jar, used only per-RSS feed and then discarded, | ||||||
4566 | # which means only kept for fetching for $self->{'rss_get_links'} from a | ||||||
4567 | # feed | ||||||
4568 | 0 | $self->ua->cookie_jar({}); | |||||
4569 | |||||||
4570 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
4571 | 0 | $self->verbose (1, ' ', __x('rss comments: {url}', url => $url)); | |||||
4572 | } else { | ||||||
4573 | 0 | $self->verbose (1, __x('feed: {url}', url => $url)); | |||||
4574 | } | ||||||
4575 | 0 | require HTTP::Request; | |||||
4576 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
4577 | 0 | 0 | $self->status_etagmod_req($req,1) || return 0; | ||||
4578 | |||||||
4579 | # $req->uri can be a URI object or a string | ||||||
4580 | 0 | local $self->{'uri'} = URI->new ($req->uri); | |||||
4581 | |||||||
4582 | 0 | my $resp = $self->ua->request($req); | |||||
4583 | 0 | 0 | if ($resp->code == 304) { | ||||
4584 | 0 | $self->status_unchanged ($url); | |||||
4585 | 0 | return 0; | |||||
4586 | } | ||||||
4587 | 0 | 0 | if (! $resp->is_success) { | ||||
4588 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
4589 | url => $url, | ||||||
4590 | status => $resp->status_line); | ||||||
4591 | 0 | return 0; | |||||
4592 | } | ||||||
4593 | 0 | local $self->{'resp'} = $resp; | |||||
4594 | |||||||
4595 | 0 | $self->verbose (3, "response:", $resp->dump, "\n"); # extra newline | |||||
4596 | 0 | 0 | $resp->decode | ||||
4597 | or die "Oops, cannot decode Content-Encoding: ", | ||||||
4598 | $self->header("Content-Encoding"); | ||||||
4599 | |||||||
4600 | 0 | my $xml = $resp->content; # raw bytes | |||||
4601 | 0 | $xml = $self->enforce_rss_charset_override ($xml); | |||||
4602 | |||||||
4603 | 0 | my ($twig, $err) = $self->twig_parse($xml); | |||||
4604 | 0 | 0 | if (defined $err) { | ||||
4605 | 0 | my $message = __x("XML::Twig parse error on\n\n {url}\n\n", | |||||
4606 | url => $url); | ||||||
4607 | 0 | 0 | if ($resp->request->uri ne $url) { | ||||
4608 | 0 | $message .= __x("which redirected to\n\n {url}\n\n", | |||||
4609 | url => $resp->request->uri); | ||||||
4610 | } | ||||||
4611 | 0 | $message .= $err . "\n\n" . __("Raw XML below.\n") . "\n"; | |||||
4612 | 0 | $self->error_message | |||||
4613 | (__x("Error parsing {url}", url => $url), | ||||||
4614 | $message, $xml); | ||||||
4615 | # after successful error message to news | ||||||
4616 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
4617 | 0 | return 0; | |||||
4618 | } | ||||||
4619 | 0 | 0 | if ($self->{'verbose'} >= 3) { | ||||
4620 | 0 | require Data::Dumper; | |||||
4621 | 0 | $self->verbose (3, | |||||
4622 | Data::Dumper->new([$twig->root],['root']) | ||||||
4623 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
4624 | } | ||||||
4625 | |||||||
4626 | # "item" for RSS/RDF, "entry" for Atom | ||||||
4627 | 0 | my @items = $twig->descendants(qr/^(item|entry)$/); | |||||
4628 | |||||||
4629 | 0 | @items = $self->rss_newest_only_items(@items); | |||||
4630 | |||||||
4631 | 0 | my $new = 0; | |||||
4632 | 0 | foreach my $item (@items) { | |||||
4633 | 0 | $new += $self->fetch_rss_process_one_item ($item); | |||||
4634 | } | ||||||
4635 | |||||||
4636 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
4637 | 0 | my $jar = $self->ua->cookie_jar; | |||||
4638 | 0 | 0 | 0 | if ($jar && (my $str = $jar->as_string ne '')) { | |||
4639 | 0 | $self->verbose (2, "accumulated cookies from this feed:\n", $str); | |||||
4640 | } else { | ||||||
4641 | 0 | $self->verbose (2, 'no cookies from this feed'); | |||||
4642 | } | ||||||
4643 | } | ||||||
4644 | 0 | $self->ua->cookie_jar (undef); | |||||
4645 | |||||||
4646 | 0 | $self->status_etagmod_resp ($url, $resp, $twig); | |||||
4647 | 0 | say __xn('{group}: {count} new article', | |||||
4648 | '{group}: {count} new articles', | ||||||
4649 | $new, | ||||||
4650 | group => $group, | ||||||
4651 | count => $new); | ||||||
4652 | |||||||
4653 | 0 | return $new; | |||||
4654 | } | ||||||
4655 | |||||||
4656 | 1; | ||||||
4657 | __END__ |