| blib/lib/App/RSS2Leafnode.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % | 
| statement | 75 | 1586 | 4.7 | 
| branch | 0 | 718 | 0.0 | 
| condition | 0 | 532 | 0.0 | 
| subroutine | 26 | 162 | 16.0 | 
| pod | 4 | 123 | 3.2 | 
| total | 105 | 3121 | 3.3 | 
| line | stmt | bran | cond | sub | pod | time | code | 
|---|---|---|---|---|---|---|---|
| 1 | # Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Kevin Ryde | ||||||
| 2 | # | ||||||
| 3 | # This file is part of RSS2Leafnode. | ||||||
| 4 | # | ||||||
| 5 | # RSS2Leafnode is free software; you can redistribute it and/or modify it | ||||||
| 6 | # under the terms of the GNU General Public License as published by the Free | ||||||
| 7 | # Software Foundation; either version 3, or (at your option) any later | ||||||
| 8 | # version. | ||||||
| 9 | # | ||||||
| 10 | # RSS2Leafnode is distributed in the hope that it will be useful, but | ||||||
| 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | ||||||
| 12 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||||||
| 13 | # for more details. | ||||||
| 14 | # | ||||||
| 15 | # You should have received a copy of the GNU General Public License along | ||||||
| 16 |  # with RSS2Leafnode.  If not, see  | 
||||||
| 17 | |||||||
| 18 | |||||||
| 19 | # maybe: | ||||||
| 20 | # location links | ||||||
| 21 | # http://maps.google.com/maps?ll=-35.066667,148.1 | ||||||
| 22 | # http://maps.google.com/maps?ll=-35.066667,148.1&spn=0.01,0.01&t=m | ||||||
| 23 | # | ||||||
| 24 | # | ||||||
| 25 | # when site has mutliple names for a page, relative or absolute | ||||||
| 26 | |||||||
| 27 | |||||||
| 28 | |||||||
| 29 | package App::RSS2Leafnode; | ||||||
| 30 | 1 | 1 | 1476 | use 5.010; | |||
| 1 | 4 | ||||||
| 1 | 37 | ||||||
| 31 | 1 | 1 | 4 | use strict; | |||
| 1 | 2 | ||||||
| 1 | 25 | ||||||
| 32 | 1 | 1 | 5 | use warnings; | |||
| 1 | 1 | ||||||
| 1 | 20 | ||||||
| 33 | 1 | 1 | 4 | use Carp; | |||
| 1 | 1 | ||||||
| 1 | 110 | ||||||
| 34 | 1 | 1 | 1173 | use Encode; | |||
| 1 | 12210 | ||||||
| 1 | 88 | ||||||
| 35 | 1 | 1 | 6753 | use Hash::Util::FieldHash; | |||
| 1 | 1138 | ||||||
| 1 | 46 | ||||||
| 36 | 1 | 1 | 7 | use List::Util 'min', 'max'; | |||
| 1 | 1 | ||||||
| 1 | 94 | ||||||
| 37 | 1 | 1 | 886 | use List::MoreUtils; | |||
| 1 | 1222 | ||||||
| 1 | 43 | ||||||
| 38 | 1 | 1 | 1036 | use POSIX (); # ENOENT, etc | |||
| 1 | 6409 | ||||||
| 1 | 26 | ||||||
| 39 | 1 | 1 | 17 | use Scalar::Util; | |||
| 1 | 2 | ||||||
| 1 | 48 | ||||||
| 40 | 1 | 1 | 861 | use Text::Trim 1.02; # version 1.02 for undef support | |||
| 1 | 588 | ||||||
| 1 | 68 | ||||||
| 41 | 1 | 1 | 798 | use URI; | |||
| 1 | 8036 | ||||||
| 1 | 28 | ||||||
| 42 | 1 | 1 | 791 | use HTML::Entities::Interpolate; | |||
| 1 | 7548 | ||||||
| 1 | 5 | ||||||
| 43 | |||||||
| 44 | 1 | 1 | 545 | use App::RSS2Leafnode::XML::Twig::Other; | |||
| 1 | 2 | ||||||
| 1 | 43 | ||||||
| 45 | |||||||
| 46 | # version 1.17 for __p(), and version 1.16 for turn_utf_8_on() | ||||||
| 47 | 1 | 1 | 943 | use Locale::TextDomain 1.17; | |||
| 1 | 9373 | ||||||
| 1 | 7 | ||||||
| 48 | 1 | 1 | 9041 | use Locale::TextDomain ('App-RSS2Leafnode'); | |||
| 1 | 3 | ||||||
| 1 | 5 | ||||||
| 49 | BEGIN { | ||||||
| 50 | 1 | 1 | 25 | use Locale::Messages; | |||
| 1 | 2 | ||||||
| 1 | 59 | ||||||
| 51 | 1 | 1 | 5 | Locale::Messages::bind_textdomain_codeset ('App-RSS2Leafnode','UTF-8'); | |||
| 52 | 1 | 19 | Locale::Messages::bind_textdomain_filter ('App-RSS2Leafnode', | ||||
| 53 | \&Locale::Messages::turn_utf_8_on); | ||||||
| 54 | } | ||||||
| 55 | |||||||
| 56 | # uncomment this to run the ### lines | ||||||
| 57 | # use Smart::Comments; | ||||||
| 58 | |||||||
| 59 | our $VERSION; | ||||||
| 60 | BEGIN { | ||||||
| 61 | 1 | 1 | 750 | $VERSION = 77; | |||
| 62 | } | ||||||
| 63 | |||||||
| 64 | ## no critic (ProhibitFixedStringMatches) | ||||||
| 65 | |||||||
| 66 | |||||||
| 67 | # Cribs: | ||||||
| 68 | # | ||||||
| 69 | # RSS | ||||||
| 70 | # http://my.netscape.com/publish/help/ | ||||||
| 71 | # RSS 0.9 spec. | ||||||
| 72 | # http://my.netscape.com/publish/help/mnn20/quickstart.html | ||||||
| 73 | # RSS 0.91 spec. | ||||||
| 74 | # http://purl.org/rss/1.0/ | ||||||
| 75 | # RSS 1.0 spec. | ||||||
| 76 | # http://www.rssboard.org/rss-specification | ||||||
| 77 | # http://www.rssboard.org/files/rss-2.0-sample.xml | ||||||
| 78 | # RSS 2.0 spec and sample. | ||||||
| 79 | # | ||||||
| 80 | # http://www.rssboard.org/rss-profile | ||||||
| 81 | # "Best practices." | ||||||
| 82 | # | ||||||
| 83 | # Dublin Core | ||||||
| 84 | # RFC 5013 -- summary | ||||||
| 85 | # http://dublincore.org/documents/dcmi-terms/ -- dc/terms | ||||||
| 86 | # | ||||||
| 87 | # Atom | ||||||
| 88 | # RFC 4287 -- Atom spec | ||||||
| 89 | # RFC 3339 -- ISO timestamps as used in Atom | ||||||
| 90 | # RFC 4685 -- "thr" threading extensions | ||||||
| 91 | # RFC 4946 -- | ||||||
| 92 | # RFC 5005 -- etc paging and archiving | ||||||
| 93 | # http://diveintomark.org/archives/2004/05/28/howto-atom-id | ||||||
| 94 |  #      Making an  | 
||||||
| 95 | # http://www.iana.org/assignments/link-relations/link-relations.xhtml | ||||||
| 96 | # assigned values | ||||||
| 97 | # | ||||||
| 98 | # RSS Modules: | ||||||
| 99 | # http://www.meatballwiki.org/wiki/ModWiki -- wiki | ||||||
| 100 | # http://web.resource.org/rss/1.0/modules/slash/ | ||||||
| 101 | # http://code.google.com/apis/feedburner/feedburner_namespace_reference.html | ||||||
| 102 | # http://backend.userland.com/creativeCommonsRSSModule | ||||||
| 103 | # | ||||||
| 104 | # http://web.resource.org/rss/1.0/modules/content/ | ||||||
| 105 | # http://www.rssboard.org/rss-profile#namespace-elements-content | ||||||
| 106 | # http://validator.w3.org/feed/docs/warning/NeedDescriptionBeforeContent.html | ||||||
| 107 |  #        | 
||||||
| 108 | # | ||||||
| 109 | # http://www.apple.com/itunes/podcasts/specs.html | ||||||
| 110 | # http://www.feedforall.com/itunes.htm | ||||||
| 111 |  #   http://www.w3.org/2003/01/geo/wgs84_pos --  | 
||||||
| 112 | # http://www.georss.org/ | ||||||
| 113 | # http://www.georss.org/Encodings | ||||||
| 114 | # http://www.georss.org/atom | ||||||
| 115 | # http://www.georss.org/rdf_rss1 | ||||||
| 116 | # | ||||||
| 117 | # http://activitystrea.ms/specs/atom/1.0/ | ||||||
| 118 | # activity: | ||||||
| 119 | # http://prismstandard.org/namespaces/basic/2.0/ | ||||||
| 120 | # http://www.prismstandard.org/specifications/2.0/PRISM_prism_namespace_2.0.pdf | ||||||
| 121 | # Prism | ||||||
| 122 | # | ||||||
| 123 | # URIs | ||||||
| 124 | # RFC 1738, RFC 2396, RFC 3986 -- URI formats (news/nntp in 1738) | ||||||
| 125 | # draft-ellermann-news-nntp-uri-11.txt -- news/nntp update | ||||||
| 126 | # RFC 2732 -- ipv6 "[]" hostnames | ||||||
| 127 | # RFC 2141 -- urn: | ||||||
| 128 | # RFC 4122 -- uuid format (as under urn:uuid:) | ||||||
| 129 | # RFC 4151 -- tag: | ||||||
| 130 | # RFC 1034, RFC 1123 -- domain names | ||||||
| 131 | # RFC 2606 -- reserved domain names ".invalid" | ||||||
| 132 | # | ||||||
| 133 | # XML | ||||||
| 134 | # http://www.w3.org/TR/xmlbase/ -- xml:base | ||||||
| 135 | # RFC 3023 text/xml etc media types | ||||||
| 136 | # | ||||||
| 137 | # Mail Messages | ||||||
| 138 | # RFC 850, RFC 1036 | ||||||
| 139 | # -- News message format, inc headers and rnews format | ||||||
| 140 | # RFC 2822, RFC 5322, RFC 5536 | ||||||
| 141 | # -- Email message format. | ||||||
| 142 | # RFC 2076, RFC 4021 -- headers summary. | ||||||
| 143 | # RFC 2557 -- MHTML Content-Location | ||||||
| 144 | # RFC 1864 -- Content-MD5 header | ||||||
| 145 | # RFC 2369 -- List-Post header and friends | ||||||
| 146 | # http://www.ietf.org/proceedings/98dec/I-D/draft-ietf-drums-mail-followup-to-00.txt | ||||||
| 147 | # Draft "Mail-Followup-To" header. | ||||||
| 148 | # | ||||||
| 149 | # RFC 1327 -- X.400 to RFC822 introducing Language header | ||||||
| 150 | # RFC 3282 -- Content-Language header | ||||||
| 151 | # RFC 1766, RFC 3066, RFC 4646 -- language tag form | ||||||
| 152 | # | ||||||
| 153 | # | ||||||
| 154 | # NNTP | ||||||
| 155 | # RFC 977 -- NNTP | ||||||
| 156 | # RFC 2616 -- HTTP/1.1 Accept-Encoding header | ||||||
| 157 | # RFC 2980 -- NNTP extensions | ||||||
| 158 | # | ||||||
| 159 | # RFC 4642 -- NNTP with SSL | ||||||
| 160 | # | ||||||
| 161 | # For XML in Perl there's several ways to do it! | ||||||
| 162 | # - XML::Parser looks likely for stream/event processing, but its builtin | ||||||
| 163 | # tree mode is very basic. | ||||||
| 164 | # - XML::Twig extends XML::Parser to a good tree, though the docs are | ||||||
| 165 | # slightly light on. It only does a subset of "XPath" but the | ||||||
| 166 | # functions/regexps are more perl-like for matching and there's various | ||||||
| 167 | # handy shortcuts for common operations. | ||||||
| 168 | # - XML::LibXML is the full blown libxml and is rather a lot to learn. | ||||||
| 169 | # Because it's mainly C it's not easy to find where or how you're going | ||||||
| 170 | # wrong when your code doesn't work. libxml also seems stricter about | ||||||
| 171 | # namespace matters than XML::Parser/XML::Twig. | ||||||
| 172 | # - XML::RSS uses XML::Parser to build its own style tree of RSS, | ||||||
| 173 | # including unifying differences among RSS/RDF 0.91, 1.0 and 2.0. | ||||||
| 174 | # Nested elements seem to need specific handling in its code, which can | ||||||
| 175 | # make it tricky for sub-element oddities. A fair amount of it is about | ||||||
| 176 | # writing RSS too. | ||||||
| 177 | # - XML::RSS::LibXML uses libxml for XML::RSS compatible reading and | ||||||
| 178 | # writing. It seems to do better on unrecognised sub-elements. | ||||||
| 179 | # - XML::Atom offers the basic Atom elements but doesn't seem to give | ||||||
| 180 | # access to extra stuff that might be in a feed. | ||||||
| 181 | # - XML::Feed tries to unify XML::RSS and XML::Atom but again doesn't seem | ||||||
| 182 | # to go much beyond the basics. It too is geared towards writing as | ||||||
| 183 | # well as reading. | ||||||
| 184 | # - XML::TreePP pure perl parser to a hash tree. | ||||||
| 185 | # | ||||||
| 186 | # The choice of XML::Twig is based on wanting both RSS and Atom, but | ||||||
| 187 | # XML::Feed not going far enough. Tree processing is easier than stream, | ||||||
| 188 | # and an RSS isn't meant to be huge. A tree may help if channel fields | ||||||
| 189 | # follow items or something equally unnatural, but will probably assume that | ||||||
| 190 | # doesn't happen and look at the twig partial-tree mode. Between the tree | ||||||
| 191 | # styles XML::LibXML is harder to get into than Twig. | ||||||
| 192 | # | ||||||
| 193 | |||||||
| 194 | #------------------------------------------------------------------------------ | ||||||
| 195 | # mostly generic | ||||||
| 196 | |||||||
| 197 | # return $str with a newline at the end, if it doesn't already have one | ||||||
| 198 | sub str_ensure_newline { | ||||||
| 199 | 0 | 0 | 0 | my ($str) = @_; | |||
| 200 | 0 | 0 | if ($str !~ /\n$/) { $str .= "\n" } | ||||
| 0 | |||||||
| 201 | 0 | return $str; | |||||
| 202 | } | ||||||
| 203 | |||||||
| 204 | sub md5_of_utf8 { | ||||||
| 205 | 0 | 0 | 0 | my ($str) = @_; | |||
| 206 | 0 | require Digest::MD5; | |||||
| 207 | 0 | return Digest::MD5::md5_base64 (Encode::encode_utf8 ($str)); | |||||
| 208 | } | ||||||
| 209 | |||||||
| 210 | sub is_empty { | ||||||
| 211 | 0 | 0 | 0 | my ($str) = @_; | |||
| 212 | 0 | 0 | return (! defined $str || $str =~ /^\s*$/); | ||||
| 213 | } | ||||||
| 214 | sub is_non_empty { | ||||||
| 215 | 0 | 0 | 0 | my ($str) = @_; | |||
| 216 | 0 | return ! is_empty($str); | |||||
| 217 | } | ||||||
| 218 | sub non_empty { | ||||||
| 219 | 0 | 0 | 0 | my ($str) = @_; | |||
| 220 | 0 | 0 | return (is_non_empty($str) ? $str : ()); | ||||
| 221 | } | ||||||
| 222 | |||||||
| 223 | sub join_non_empty { | ||||||
| 224 | 0 | 0 | 0 | my $sep = shift; | |||
| 225 | 0 | return non_empty (join($sep, map {non_empty($_)} @_)); | |||||
| 0 | |||||||
| 226 | } | ||||||
| 227 | |||||||
| 228 | sub collapse_whitespace { | ||||||
| 229 | 0 | 0 | 0 | my ($str) = @_; | |||
| 230 | 0 | 0 | defined $str or return undef; | ||||
| 231 | 0 | 0 | $str =~ s/(\s+)/($1 eq ' ' ? $1 : ' ')/ge; | ||||
| 0 | |||||||
| 232 | 0 | return Text::Trim::trim($str); | |||||
| 233 | } | ||||||
| 234 | |||||||
| 235 | # return true if $str is entirely ascii chars 0 to 127 | ||||||
| 236 | sub is_ascii { | ||||||
| 237 | 0 | 0 | 0 | my ($str) = @_; | |||
| 238 | 0 | return ($str !~ /[^[:ascii:]]/); | |||||
| 239 | } | ||||||
| 240 | |||||||
| 241 | # Return the number of lines in $str. | ||||||
| 242 | # If $str ends with a newline then that counts as the last line, so "xyz\n" | ||||||
| 243 | # is one line. If $str doesn't end with a newline then the final chars are | ||||||
| 244 | # a line, so "abc\ndef" is two lines. | ||||||
| 245 | sub str_count_lines { | ||||||
| 246 | 0 | 0 | 0 | my ($str) = @_; | |||
| 247 | 0 | 0 | return scalar($str =~ tr/\n//) + (length($str) && substr($str,-1) ne "\n"); | ||||
| 248 | } | ||||||
| 249 | |||||||
| 250 | sub File_Temp_DEBUG_saver { | ||||||
| 251 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
| 252 | 0 | require Scope::Guard; | |||||
| 253 | 0 | require File::Temp; | |||||
| 254 | 0 | my $oldval = $File::Temp::DEBUG; | |||||
| 255 | 0 | 0 | my $ret = Scope::Guard->new (sub { $File::Temp::DEBUG = $oldval }); | ||||
| 0 | |||||||
| 256 | 0 | $File::Temp::DEBUG = $newval; | |||||
| 257 | 0 | return $ret; | |||||
| 258 | } | ||||||
| 259 | sub MIME_Tools_debugging { | ||||||
| 260 | 0 | 0 | 0 | my ($self, $newval) = @_; | |||
| 261 | 0 | require Scope::Guard; | |||||
| 262 | 0 | require MIME::Tools; | |||||
| 263 | 0 | my $oldval = MIME::Tools->debugging; | |||||
| 264 | 0 | 0 | my $ret = Scope::Guard->new (sub { MIME::Tools->debugging($oldval) }); | ||||
| 0 | |||||||
| 265 | 0 | MIME::Tools->debugging ($newval); | |||||
| 266 | 0 | return $ret; | |||||
| 267 | } | ||||||
| 268 | |||||||
| 269 | sub homedir { | ||||||
| 270 | # my ($self) = @_; | ||||||
| 271 | 0 | 0 | 0 | require File::HomeDir; | |||
| 272 | # call each time just in case playing tricks with $ENV{HOME} in conf file | ||||||
| 273 | 0 | 0 | return File::HomeDir->my_home | ||||
| 274 | // croak 'File::HomeDir says you have no home directory'; | ||||||
| 275 | } | ||||||
| 276 | |||||||
| 277 | #------------------------------------------------------------------------------ | ||||||
| 278 | # Number::Format for sizes in bytes | ||||||
| 279 | |||||||
| 280 | use constant::defer NUMBER_FORMAT => sub { | ||||||
| 281 | 0 | 0 | require Number::Format; | ||||
| 282 | 0 | 0 | Number::Format->VERSION(1.5); # for format_bytes() options params | ||||
| 283 | 0 | 0 | return Number::Format->new | ||||
| 284 | (-kilo_suffix => __p('number-format-kilobytes','K'), | ||||||
| 285 | -mega_suffix => __p('number-format-megabytes','M'), | ||||||
| 286 | -giga_suffix => __p('number-format-gigabytes','G')); | ||||||
| 287 | 1 | 1 | 820 | }; | |||
| 1 | 689 | ||||||
| 1 | 9 | ||||||
| 288 | |||||||
| 289 | sub format_size_in_bytes { | ||||||
| 290 | 0 | 0 | 0 | my ($self, $length) = @_; | |||
| 291 | 0 | 0 | if ($length >= 2000) { | ||||
| 292 | 0 | return $self->NUMBER_FORMAT()->format_bytes ($length, precision => 1); | |||||
| 293 | } else { | ||||||
| 294 | 0 | return __x('{size} bytes', size => $length); | |||||
| 295 | } | ||||||
| 296 | } | ||||||
| 297 | |||||||
| 298 | #------------------------------------------------------------------------------ | ||||||
| 299 | |||||||
| 300 | sub new { | ||||||
| 301 | 0 | 0 | 1 | my $class = shift; | |||
| 302 | 0 | return bless { | |||||
| 303 | # config variables | ||||||
| 304 | verbose => 0, | ||||||
| 305 | render => 0, | ||||||
| 306 | render_width => 60, | ||||||
| 307 | rss_get_links => 0, | ||||||
| 308 | rss_get_comments => 0, | ||||||
| 309 | rss_newest_only => 0, | ||||||
| 310 | get_icon => 0, | ||||||
| 311 | html_charset_from_content => 0, | ||||||
| 312 | |||||||
| 313 | # secret extra | ||||||
| 314 | msgidextra => '', | ||||||
| 315 | |||||||
| 316 | @_, | ||||||
| 317 | }, $class; | ||||||
| 318 | } | ||||||
| 319 | |||||||
| 320 | sub command_line { | ||||||
| 321 | 0 | 0 | 1 | my ($self) = @_; | |||
| 322 | |||||||
| 323 | 0 | my $done_version; | |||||
| 324 | 0 | require Getopt::Long; | |||||
| 325 | 0 | Getopt::Long::Configure ('no_ignore_case'); | |||||
| 326 | Getopt::Long::GetOptions | ||||||
| 327 | ('config=s' => \$self->{'config_filename'}, | ||||||
| 328 | 'verbose:1' => \$self->{'verbose'}, | ||||||
| 329 | 'version' => sub { | ||||||
| 330 | 0 | 0 | say __x("RSS2Leafnode version {version}", version => $VERSION); | ||||
| 331 | 0 | $done_version = 1; | |||||
| 332 | }, | ||||||
| 333 | 'bareversion' => sub { | ||||||
| 334 | 0 | 0 | say $VERSION; | ||||
| 335 | 0 | $done_version = 1; | |||||
| 336 | }, | ||||||
| 337 | 'msgid=s' => \$self->{'msgidextra'}, | ||||||
| 338 | 'help|?' => sub { | ||||||
| 339 | 0 | 0 | say __x("rss2leafnode [--options]"); | ||||
| 340 | 0 | say __x(" --config=filename configuration file (default ~/.rss2leafnode.conf)"); | |||||
| 341 | 0 | say __x(" --help print this help"); | |||||
| 342 | 0 | say __x(" --verbose describe what's done"); | |||||
| 343 | 0 | say __x(" --verbose=2 show technical details of what's done"); | |||||
| 344 | 0 | say __x(" --version print program version number"); | |||||
| 345 | 0 | exit 0; | |||||
| 346 | 0 | 0 | }) or return 1; | ||||
| 347 | 0 | 0 | if (! $done_version) { | ||||
| 348 | 0 | $self->do_config_file; | |||||
| 349 | 0 | $self->nntp_close; | |||||
| 350 | } | ||||||
| 351 | 0 | return 0; | |||||
| 352 | } | ||||||
| 353 | |||||||
| 354 | sub verbose { | ||||||
| 355 | 0 | 0 | 0 | my $self = shift; | |||
| 356 | 0 | my $count = shift; | |||||
| 357 | 0 | 0 | if ($self->{'verbose'} >= $count) { | ||||
| 358 | 0 | say @_; | |||||
| 359 | } | ||||||
| 360 | } | ||||||
| 361 | |||||||
| 362 | sub config_filename { | ||||||
| 363 | 0 | 0 | 0 | my ($self) = @_; | |||
| 364 | 0 | 0 | return $self->{'config_filename'} // do { | ||||
| 365 | 0 | require File::Spec; | |||||
| 366 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.conf'); | |||||
| 367 | }; | ||||||
| 368 | } | ||||||
| 369 | sub status_filename { | ||||||
| 370 | 0 | 0 | 0 | my ($self) = @_; | |||
| 371 | 0 | 0 | return $self->{'status_filename'} // do { | ||||
| 372 | 0 | require File::Spec; | |||||
| 373 | 0 | File::Spec->catfile ($self->homedir, '.rss2leafnode.status'); | |||||
| 374 | }; | ||||||
| 375 | } | ||||||
| 376 | |||||||
| 377 | sub do_config_file { | ||||||
| 378 | 0 | 0 | 0 | my ($self) = @_; | |||
| 379 | 0 | my @guards; | |||||
| 380 | |||||||
| 381 | 0 | 0 | open STDERR, '>&STDOUT' or die "Oops, can't join STDERR to STDOUT"; | ||||
| 382 | |||||||
| 383 | # File::Temp::DEBUG for possible temp files used by HTML::FormatExternal | ||||||
| 384 | # these debugs turned on only for the duration of running the config file | ||||||
| 385 | # and the downloading etc in it | ||||||
| 386 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
| 387 | 0 | push @guards, $self->File_Temp_DEBUG_saver(1); | |||||
| 388 | 0 | push @guards, $self->MIME_Tools_debugging(1); | |||||
| 389 | } | ||||||
| 390 | |||||||
| 391 | 0 | my $config_filename = $self->config_filename; | |||||
| 392 | 0 | $self->verbose (1, "config: ", $config_filename); | |||||
| 393 | |||||||
| 394 | 0 | require App::RSS2Leafnode::Conf; | |||||
| 395 | 0 | local $App::RSS2Leafnode::Conf::r2l = $self; | |||||
| 396 | 0 | 0 | if (! defined (do { package App::RSS2Leafnode::Conf; | ||||
| 397 | 0 | do $config_filename; | |||||
| 398 | })) { | ||||||
| 399 | 0 | 0 | if (! -e $config_filename) { | ||||
| 400 | 0 | croak "rss2leafnode: config file $config_filename doesn't exist\n"; | |||||
| 401 | } else { | ||||||
| 402 | 0 | croak $@; | |||||
| 403 | } | ||||||
| 404 | } | ||||||
| 405 | } | ||||||
| 406 | |||||||
| 407 | #------------------------------------------------------------------------------ | ||||||
| 408 | # LWP stuff | ||||||
| 409 | |||||||
| 410 | sub user_agent { | ||||||
| 411 | 0 | 0 | 0 | my ($self) = @_; | |||
| 412 | 0 | 0 | if (defined $self->{'user_agent'}) { | ||||
| 413 | 0 | return $self->{'user_agent'}; | |||||
| 414 | } else { | ||||||
| 415 | 0 | return 'RSS2leafnode/' . $self->VERSION . ' '; | |||||
| 416 | } | ||||||
| 417 | } | ||||||
| 418 | |||||||
| 419 | sub ua { | ||||||
| 420 | 0 | 0 | 0 | my ($self) = @_; | |||
| 421 | 0 | 0 | return ($self->{'ua'} ||= do { | ||||
| 422 | 0 | require LWP::UserAgent; | |||||
| 423 | 0 | LWP::UserAgent->VERSION(5.832); # 5.832 for content_charset() | |||||
| 424 | |||||||
| 425 | # one connection kept alive | ||||||
| 426 | 0 | my $ua = LWP::UserAgent->new (keep_alive => 1); | |||||
| 427 | 0 | Scalar::Util::weaken ($ua->{(__PACKAGE__)} = $self); | |||||
| 428 | 0 | $ua->agent ($self->user_agent); | |||||
| 429 | |||||||
| 430 | 0 | Scalar::Util::weaken (my $weak_self = $self); | |||||
| 431 | 0 | $ua->add_handler (request_send => \&lwp_request_send__verbose); | |||||
| 432 | $ua->add_handler (response_done => sub { | ||||||
| 433 | 0 | 0 | lwp_response_done__check_md5 ($weak_self, @_); | ||||
| 434 | 0 | }); | |||||
| 435 | |||||||
| 436 | # ask for everything $resp->decode() / $resp->decoded_content() can cope | ||||||
| 437 | # with, in particular "gzip" and "deflate" compression if Compress::Zlib | ||||||
| 438 | # or whatever is available | ||||||
| 439 | # | ||||||
| 440 | 0 | require HTTP::Message; | |||||
| 441 | 0 | my $decodable = HTTP::Message::decodable(); | |||||
| 442 | 0 | $self->verbose (2, "HTTP decodable: ", $decodable); | |||||
| 443 | 0 | $ua->default_header ('Accept-Encoding' => $decodable); | |||||
| 444 | |||||||
| 445 | 0 | $ua | |||||
| 446 | }); | ||||||
| 447 | } | ||||||
| 448 | |||||||
| 449 | sub lwp_request_send__verbose { | ||||||
| 450 | 0 | 0 | 0 | my ($req, $ua, $h) = @_; | |||
| 451 | 0 | my $self = $ua->{(__PACKAGE__)}; | |||||
| 452 | 0 | $self->verbose (2, "request_send:", $req->dump, "\n"); # extra newline | |||||
| 453 | 0 | return; # continue processing | |||||
| 454 | } | ||||||
| 455 | |||||||
| 456 | sub lwp_response_done__check_md5 { | ||||||
| 457 | 0 | 0 | 0 | my ($self, $resp, $ua, $h) = @_; | |||
| 458 | 0 | 0 | $self || return; | ||||
| 459 | ### lwp_response_done__check_md5() ... | ||||||
| 460 | 0 | 0 | my $want = $resp->header('Content-MD5') // do { | ||||
| 461 | 0 | $self->verbose (2, 'no Content-MD5 header'); | |||||
| 462 | 0 | return; | |||||
| 463 | }; | ||||||
| 464 | 0 | $resp->decode; | |||||
| 465 | 0 | my $cref = $resp->content_ref; | |||||
| 466 | 0 | require Digest::MD5; | |||||
| 467 | 0 | my $got = Digest::MD5::md5_hex($$cref); | |||||
| 468 | 0 | 0 | if ($got ne $want) { | ||||
| 469 | 0 | print __x("Warning, MD5 checksum mismatch on download {url}\n", | |||||
| 470 | url => $resp->request->uri); | ||||||
| 471 | } else { | ||||||
| 472 | 0 | $self->verbose(2, 'Content-MD5 ok'); | |||||
| 473 | } | ||||||
| 474 | } | ||||||
| 475 | |||||||
| 476 | # $resp is a HTTP::Response object. Modify its headers to apply our | ||||||
| 477 | # $html_charset_from_content option, which means if it's set then prefer the | ||||||
| 478 | # document's Content-Type over what the server says. | ||||||
| 479 | # | ||||||
| 480 | # The LWP::UserAgent parse_head option appends the document bits to | ||||||
| 481 | # the message headers. If the server and the document both offer a | ||||||
| 482 | # Content-Type then there's two, with the document one last, so all we have | ||||||
| 483 | # to do is change to make the last one the only one. | ||||||
| 484 | # | ||||||
| 485 | sub enforce_html_charset_from_content { | ||||||
| 486 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 487 | 0 | 0 | 0 | if ($self->{'html_charset_from_content'} | |||
| 488 | && $resp->headers->content_is_html) { | ||||||
| 489 | 0 | my $old = $resp->header('Content-Type'); | |||||
| 490 | 0 | $resp->header('Content-Type' => $resp->headers->content_type); | |||||
| 491 | |||||||
| 492 | 0 | $self->verbose (2, 'html_charset_from_content mangled Content-Type from'); | |||||
| 493 | 0 | $self->verbose (2, " from ", $old); | |||||
| 494 | 0 | $self->verbose (2, " to ", $resp->header('Content-Type')); | |||||
| 495 | 0 | $self->verbose (2, " giving charset ", $resp->content_charset); | |||||
| 496 | } | ||||||
| 497 | } | ||||||
| 498 | |||||||
| 499 | |||||||
| 500 | #------------------------------------------------------------------------------ | ||||||
| 501 | my %known; | ||||||
| 502 | |||||||
| 503 |  #  | 
||||||
| 504 | # ENHANCE-ME: is this something to work into the skipdays? or a message expiry? | ||||||
| 505 | # | ||||||
| 506 | $known{'/channel/item/dcterms:valid'} = undef; | ||||||
| 507 | |||||||
| 508 |  #  | 
||||||
| 509 | $known{'/channel/item/dcterms:audience'} = undef; | ||||||
| 510 | |||||||
| 511 |  #  | 
||||||
| 512 |  #  | 
||||||
| 513 | @known{qw(/channel/item/eq:depth | ||||||
| 514 | /channel/item/eq:seconds)} = (); | ||||||
| 515 | |||||||
| 516 | # rdf structure stuff | ||||||
| 517 | @known{qw(/channel/items | ||||||
| 518 | /channel/items/rdf:Seq | ||||||
| 519 | /channel/items/rdf:Seq/rdf:li)} = (); | ||||||
| 520 | |||||||
| 521 | @known{('/channel/cloud', | ||||||
| 522 | '/channel/link', | ||||||
| 523 | '/channel/docs', | ||||||
| 524 | '/channel/generator', | ||||||
| 525 | '/channel/rating', | ||||||
| 526 | '/channel/id', | ||||||
| 527 | '/channel/description', | ||||||
| 528 | '/channel/tagline', | ||||||
| 529 | '/channel/info', # atom something freeform | ||||||
| 530 | '/channel/itunes:summary', | ||||||
| 531 | '/channel/feedburner:info', | ||||||
| 532 | |||||||
| 533 | # nothing much in these as yet eg. rssboard | ||||||
| 534 | '/channel/item/sitemap:priority', | ||||||
| 535 | '/channel/item/sitemap:changefreq', | ||||||
| 536 | |||||||
| 537 | # feedburner junk | ||||||
| 538 | '/channel/feedburner:feedFlare', | ||||||
| 539 | |||||||
| 540 | # images | ||||||
| 541 | '/channel/itunes:owner', | ||||||
| 542 | '/channel/itunes:owner/itunes:name', | ||||||
| 543 | '/channel/itunes:owner/itunes:email', | ||||||
| 544 | |||||||
| 545 | '/channel/textInput', | ||||||
| 546 | '/channel/textInput/description', | ||||||
| 547 | '/channel/textInput/link', | ||||||
| 548 | '/channel/textInput/name', | ||||||
| 549 | '/channel/textInput/title', | ||||||
| 550 | '/channel/textinput', | ||||||
| 551 | '/channel/textinput/title', | ||||||
| 552 | '/channel/textinput/description', | ||||||
| 553 | '/channel/textinput/name', | ||||||
| 554 | '/channel/textinput/link', | ||||||
| 555 | |||||||
| 556 | '/channel/openSearch:totalResults', | ||||||
| 557 | '/channel/openSearch:startIndex', | ||||||
| 558 | '/channel/openSearch:itemsPerPage', | ||||||
| 559 | |||||||
| 560 | '/channel/item', | ||||||
| 561 | '/channel/item/source', | ||||||
| 562 | |||||||
| 563 | '/channel/item/twitter:source', | ||||||
| 564 | |||||||
| 565 | # something from radio free france | ||||||
| 566 | # eg. http://radiofrance-podcast.net/podcast09/rss_10193.xml | ||||||
| 567 | '/channel/item/podcastRF:businessReference', | ||||||
| 568 | |||||||
| 569 | # google documents stuff | ||||||
| 570 | '/channel/item/gd:extendedProperty', | ||||||
| 571 | |||||||
| 572 |          #  | 
||||||
| 573 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
| 574 | 'channel/item/cb:statistics', | ||||||
| 575 | |||||||
| 576 |          #  | 
||||||
| 577 | # eg. Fed Reserve http://www.federalreserve.gov/feeds/press_taf.xml | ||||||
| 578 | '/channel/item/cb:news', | ||||||
| 579 | |||||||
| 580 |          # FIXME:  | 
||||||
| 581 | # Fed eg. http://www.federalreserve.gov/feeds/speeches.xml | ||||||
| 582 | '/channel/item/cb:speech', | ||||||
| 583 | |||||||
| 584 |          #  | 
||||||
| 585 |          #  | 
||||||
| 586 | # Fed eg. http://www.federalreserve.gov/feeds/ifdp.xml | ||||||
| 587 |          # FIXME: except  | 
||||||
| 588 | '/channel/item/cb:paper', | ||||||
| 589 | '/channel/item/cb:event', | ||||||
| 590 | |||||||
| 591 |          #  | 
||||||
| 592 | '/channel/item/media:hash', | ||||||
| 593 | |||||||
| 594 | # not sure what these are, but don't seem very interesting | ||||||
| 595 |          '/channel/item/slate:slate_plus', #  | 
||||||
| 596 |          '/channel/item/slate:paywall',    #  | 
||||||
| 597 |          '/channel/item/slate:sponsored',  #  | 
||||||
| 598 | )} = (); | ||||||
| 599 | |||||||
| 600 | # weather | ||||||
| 601 | # '/channel/item/w:current', | ||||||
| 602 | # '/channel/item/w:forecast', | ||||||
| 603 | # '/channel/yweather:location', | ||||||
| 604 | # '/channel/yweather:units', | ||||||
| 605 | # '/channel/yweather:wind', | ||||||
| 606 | # '/channel/yweather:atmosphere', | ||||||
| 607 | # '/channel/yweather:astronomy', | ||||||
| 608 | # '/channel/item/yweather:condition', | ||||||
| 609 | # '/channel/item/yweather:forecast', | ||||||
| 610 | |||||||
| 611 | # --central-bank | ||||||
| 612 | # /channel/item/cb:statistics | ||||||
| 613 | # /channel/item/cb:statistics/cb:country | ||||||
| 614 | # /channel/item/cb:statistics/cb:institutionAbbrev | ||||||
| 615 | # /channel/item/cb:statistics/cb:exchangeRate | ||||||
| 616 | # /channel/item/cb:statistics/cb:exchangeRate/cb:value | ||||||
| 617 | # /channel/item/cb:statistics/cb:exchangeRate/cb:baseCurrency | ||||||
| 618 | # /channel/item/cb:statistics/cb:exchangeRate/cb:targetCurrency | ||||||
| 619 | # /channel/item/cb:statistics/cb:exchangeRate/cb:rateType | ||||||
| 620 | # /channel/item/cb:statistics/cb:exchangeRate/cb:observationPeriod | ||||||
| 621 | # /channel/item/cb:speech | ||||||
| 622 | # /channel/item/cb:speech/cb:simpleTitle | ||||||
| 623 | # /channel/item/cb:speech/cb:occurrenceDate | ||||||
| 624 | # /channel/item/cb:speech/cb:person | ||||||
| 625 | # /channel/item/cb:speech/cb:person/cb:givenName | ||||||
| 626 | # /channel/item/cb:speech/cb:person/cb:surname | ||||||
| 627 | # /channel/item/cb:speech/cb:person/cb:personalTitle | ||||||
| 628 | # /channel/item/cb:speech/cb:person/cb:nameAsWritten | ||||||
| 629 | # /channel/item/cb:speech/cb:person/cb:role | ||||||
| 630 | # /channel/item/cb:speech/cb:person/cb:role/cb:jobTitle | ||||||
| 631 | # /channel/item/cb:speech/cb:person/cb:role/cb:affiliation | ||||||
| 632 | # /channel/item/cb:speech/cb:venue | ||||||
| 633 | |||||||
| 634 | |||||||
| 635 | #------------------------------------------------------------------------------ | ||||||
| 636 | # dates | ||||||
| 637 | |||||||
| 638 | 1 | 1 | 1486 | use constant RFC822_STRFTIME_FORMAT => '%a, %d %b %Y %H:%M:%S %z'; | |||
| 1 | 3 | ||||||
| 1 | 2122 | ||||||
| 639 | |||||||
| 640 | # return a string which is current time in RFC 822 format | ||||||
| 641 | sub rfc822_time_now { | ||||||
| 642 | 0 | 0 | 0 | return POSIX::strftime (RFC822_STRFTIME_FORMAT, localtime(time())); | |||
| 643 | } | ||||||
| 644 | |||||||
| 645 | sub isodate_to_rfc822 { | ||||||
| 646 | 0 | 0 | 0 | my ($isodate) = @_; | |||
| 647 | 0 | 0 | if (! defined $isodate) { return undef; } | ||||
| 0 | |||||||
| 648 | 0 | my $date = $isodate; # the original goes through if unrecognised | |||||
| 649 | |||||||
| 650 | 0 | 0 | 0 | if ($isodate =~ /\dT\d/ || $isodate =~ /^\d{4}-\d{2}-\d{2}$/) { | |||
| 651 | # eg. "2000-01-01T12:00+00:00" | ||||||
| 652 | # "2000-01-01T12:00:00Z" | ||||||
| 653 | # "2000-01-01" | ||||||
| 654 | 0 | 0 | my $zonestr = ($isodate =~ s/([+-][0-9][0-9]):([0-9][0-9])$// ? " $1$2" | ||||
| 0 | |||||||
| 655 | : $isodate =~ s/Z$// ? ' +0000' | ||||||
| 656 | : ''); | ||||||
| 657 | 0 | require Date::Parse; | |||||
| 658 | 0 | my $time_t = Date::Parse::str2time($isodate); | |||||
| 659 | 0 | 0 | if (defined $time_t) { | ||||
| 660 | 0 | $date = POSIX::strftime ("%a, %d %b %Y %H:%M:%S$zonestr", | |||||
| 661 | localtime ($time_t)); | ||||||
| 662 | } | ||||||
| 663 | } | ||||||
| 664 | 0 | return $date; | |||||
| 665 | } | ||||||
| 666 | |||||||
| 667 | # Return an RFC822 date string, or undef if nothing known. | ||||||
| 668 | # This gets a sensible sort-by-date in the newsreader. | ||||||
| 669 |  #  | 
||||||
| 670 | # redundant. | ||||||
| 671 | # | ||||||
| 672 | sub item_to_date { | ||||||
| 673 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 674 | 0 | my $date; | |||||
| 675 | 0 | foreach my $elt ($item, item_to_channel($item)) { | |||||
| 676 | 0 | 0 | $date = (non_empty ($elt->first_child_trimmed_text('pubDate')) | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 677 | // non_empty ($elt->first_child_trimmed_text('dc:date')) | ||||||
| 678 | // non_empty ($elt->first_child_trimmed_text('jf:creationDate')) | ||||||
| 679 | # Atom | ||||||
| 680 | // non_empty ($elt->first_child_trimmed_text('modified')) | ||||||
| 681 | // non_empty ($elt->first_child_trimmed_text('updated')) | ||||||
| 682 | // non_empty ($elt->first_child_trimmed_text('issued')) | ||||||
| 683 | // non_empty ($elt->first_child_trimmed_text('dcterms:issued')) | ||||||
| 684 | // non_empty ($elt->first_child_trimmed_text('created')) | ||||||
| 685 | # channel | ||||||
| 686 | // non_empty ($elt->first_child_trimmed_text('lastBuildDate')) | ||||||
| 687 | # Atom | ||||||
| 688 | // non_empty ($elt->first_child_trimmed_text('published')) | ||||||
| 689 | ); | ||||||
| 690 | 0 | 0 | last if defined $date; | ||||
| 691 | } | ||||||
| 692 | 0 | return isodate_to_rfc822($date); | |||||
| 693 | } | ||||||
| 694 | @known{qw(/channel/dc:date | ||||||
| 695 | /channel/lastBuildDate | ||||||
| 696 | /channel/pubDate | ||||||
| 697 | /channel/updated | ||||||
| 698 | /channel/modified | ||||||
| 699 | |||||||
| 700 | /channel/item/dc:date | ||||||
| 701 | /channel/item/pubDate | ||||||
| 702 | /channel/item/updated | ||||||
| 703 | /channel/item/published | ||||||
| 704 | /channel/item/modified | ||||||
| 705 | /channel/item/created | ||||||
| 706 | /channel/item/issued | ||||||
| 707 | /channel/item/dcterms:issued | ||||||
| 708 | |||||||
| 709 | /channel/item/jf:creationDate --java-locale-human-readable | ||||||
| 710 | /channel/item/jf:modificationDate | ||||||
| 711 | /channel/item/jf:date --free-form | ||||||
| 712 | )} = (); | ||||||
| 713 | |||||||
| 714 | |||||||
| 715 | sub item_to_timet { | ||||||
| 716 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 717 | ### item_to_timet() ... | ||||||
| 718 | 0 | 0 | my $str = $self->item_to_date($item) | ||||
| 719 | // return - POSIX::DBL_MAX(); # no date fields | ||||||
| 720 | |||||||
| 721 | 0 | require Date::Parse; | |||||
| 722 | ### $str | ||||||
| 723 | # print Date::Parse::str2time($str)," $str\n"; | ||||||
| 724 | return (Date::Parse::str2time($str) | ||||||
| 725 | 0 | 0 | // do { | ||||
| 726 | 0 | say __x('Unrecognised date "{date}" from {url}', | |||||
| 727 | date => $str, | ||||||
| 728 | url => $self->{'uri'}); | ||||||
| 729 | 0 | - POSIX::DBL_MAX(); | |||||
| 730 | }); | ||||||
| 731 | } | ||||||
| 732 | |||||||
| 733 | #----------------------------------------------------------------------------- | ||||||
| 734 | # Message-ID | ||||||
| 735 | |||||||
| 736 | # Return a message ID for something at $uri, optionally uniquified by $str. | ||||||
| 737 | # $uri is either a URI object or a url string. | ||||||
| 738 | # Weird chars in $uri or $str are escaped as necessary. | ||||||
| 739 | # Secret $self->{'msgidextra'} can make different message ids for the same | ||||||
| 740 | # content when testing. | ||||||
| 741 | # | ||||||
| 742 | # The path from $uri is incorporated in the result. fetch_html() needs this | ||||||
| 743 | # since the ETag identifier is only per-url, not globally unique. Suspect | ||||||
| 744 | # fetch_rss() needs it for a guid too (a non-permaLink one), as think the | ||||||
| 745 | # guid is only unique within the particular $uri feed, not globally and not | ||||||
| 746 | # even across multiple feeds on the same server. | ||||||
| 747 | # | ||||||
| 748 | sub url_to_msgid { | ||||||
| 749 | 0 | 0 | 0 | my ($self, $url, $str) = @_; | |||
| 750 | |||||||
| 751 | 0 | my $host; | |||||
| 752 | 0 | my $pathbit = $url; | |||||
| 753 | |||||||
| 754 | 0 | 0 | if (my $uri = eval { URI->new($url) }) { | ||||
| 0 | |||||||
| 755 | 0 | $uri = $uri->canonical; | |||||
| 756 | 0 | 0 | if ($uri->can('host')) { | ||||
| 0 | |||||||
| 757 | 0 | $host = $uri->host; | |||||
| 758 | 0 | $uri->host(''); | |||||
| 759 | 0 | $pathbit = $uri->as_string; | |||||
| 760 | |||||||
| 761 | # If the $uri schema has a host part but it's empty or "localhost" | ||||||
| 762 | # then try expanding that to hostname(). | ||||||
| 763 | # | ||||||
| 764 |        # $uri schemas without a host part, like "urn:" in an Atom  | 
||||||
| 765 | # get hostname(), since want the generated msgid to come out the same | ||||||
| 766 | # if such a urn: appears from different downloaded locations. | ||||||
| 767 | # | ||||||
| 768 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
| 769 | 0 | require Sys::Hostname; | |||||
| 770 | 0 | eval { $host = Sys::Hostname::hostname() }; | |||||
| 0 | |||||||
| 771 | } | ||||||
| 772 | |||||||
| 773 | } elsif ($uri->can('authority')) { | ||||||
| 774 | # the "authority" part of a "tag:" schema | ||||||
| 775 | 0 | $host = $uri->authority; | |||||
| 776 | 0 | $uri->authority(''); | |||||
| 777 | 0 | $pathbit = $uri->as_string; | |||||
| 778 | } | ||||||
| 779 | } | ||||||
| 780 | |||||||
| 781 | # $host can be empty if running from a file:/// | ||||||
| 782 | # "localhost" is a bit bogus and in particular leafnode won't accept it. | ||||||
| 783 | # ".invalid" as per RFC 2606 | ||||||
| 784 | 0 | 0 | 0 | if (is_empty($host) || $host eq 'localhost') { | |||
| 785 | 0 | $host = 'rss2leafnode.invalid'; | |||||
| 786 | } | ||||||
| 787 | |||||||
| 788 | # ipv6 dotted hostname "[1234:5678::0000]" -> "1234.5678.0000..ipv6", | ||||||
| 789 | # because [ and : are not allowed (RFC 2822 "Atom" atext) | ||||||
| 790 | # $uri->canonical above lower cases any hex, for consistency | ||||||
| 791 | 0 | 0 | if (($host =~ s/^\[|\]$//g) | ($host =~ tr/:/./)) { | ||||
| 792 | 0 | $host .= '.ipv6'; | |||||
| 793 | } | ||||||
| 794 | |||||||
| 795 | # leafnode 2.0.0.alpha20070602a seems to insist on a "." in the host name | ||||||
| 796 | 0 | 0 | unless ($host =~ /\./) { | ||||
| 797 | 0 | $host .= '.withadot'; | |||||
| 798 | } | ||||||
| 799 | |||||||
| 800 | 0 | return ('<' | |||||
| 801 | . msgid_chars(join_non_empty('.', | ||||||
| 802 | "rss2leafnode" . $self->{'msgidextra'}, | ||||||
| 803 | $pathbit, | ||||||
| 804 | $str)) | ||||||
| 805 | . '@' | ||||||
| 806 | . msgid_chars($host) | ||||||
| 807 | . '>'); | ||||||
| 808 | } | ||||||
| 809 | # msgid_chars($str) returns $str with invalid Message-ID characters munged. | ||||||
| 810 | # Per RFC850 must be printing ascii and not < > or whitespace, but for | ||||||
| 811 | # safety reduce that a bit, in particular excluding ' and ". | ||||||
| 812 | sub msgid_chars { | ||||||
| 813 | 0 | 0 | 0 | my ($str) = @_; | |||
| 814 | 0 | require URI::Escape; | |||||
| 815 | 0 | return URI::Escape::uri_escape_utf8 ($str, "^A-Za-z0-9\\-_.!~*/:"); | |||||
| 816 | } | ||||||
| 817 | |||||||
| 818 | #------------------------------------------------------------------------------ | ||||||
| 819 | # news posting | ||||||
| 820 | # | ||||||
| 821 | # This used to run the "rnews" program, which in leafnode 2 does some direct | ||||||
| 822 | # writing to the spool. But that requires user "news" perms, and as of the | ||||||
| 823 | # June 2007 leafnode beta it tends to be a good deal slower because it reads | ||||||
| 824 | # the whole groupinfo file. It has the advantage of not being picky about | ||||||
| 825 | # message ID hostnames, and allowing read-only groups to be filled. But | ||||||
| 826 | # apart from that plain POST seems much easier for being "server neutral". | ||||||
| 827 | # | ||||||
| 828 | # IHAVE instead of POST would be a possibility, when available, though POST | ||||||
| 829 | # is probably more accurate in the sense it's a new article coming into the | ||||||
| 830 | # news system. | ||||||
| 831 | # | ||||||
| 832 | # Net::NNTP looks at $ENV{NNTPSERVER}, $ENV{NEWSHOST} and Net::Config | ||||||
| 833 | # nntp_hosts list for the news server. Maybe could have that here too, | ||||||
| 834 | # instead of always defaulting to localhost (in $self->{'nntp_host'}). | ||||||
| 835 | # Would want to find out the name chosen to show in diagnostics though. | ||||||
| 836 | |||||||
| 837 | # return a string "host:port", suitable for the Host arg to Net::NNTP->new | ||||||
| 838 | sub uri_to_nntp_host { | ||||||
| 839 | 0 | 0 | 0 | my ($uri) = @_; | |||
| 840 | 0 | 0 | return (non_empty($uri->host) // 'localhost') . ':' . $uri->port; | ||||
| 841 | } | ||||||
| 842 | |||||||
| 843 | sub nntp { | ||||||
| 844 | 0 | 0 | 0 | my ($self) = @_; | |||
| 845 | # reopen if different 'nntp_host' | ||||||
| 846 | 0 | 0 | 0 | if (! $self->{'nntp'} | |||
| 847 | || $self->{'nntp'}->host ne $self->{'nntp_host'}) { | ||||||
| 848 | 0 | my $host = $self->{'nntp_host'}; | |||||
| 849 | 0 | $self->verbose (1, __x("nntp: {host}", host => $host)); | |||||
| 850 | 0 | require Net::NNTP; | |||||
| 851 | 0 | 0 | my $nntp = $self->{'nntp'} | ||||
| 852 | = Net::NNTP->new ($host, ($self->{'verbose'} >= 2 | ||||||
| 853 | ? (Debug => 1) | ||||||
| 854 | : ())); | ||||||
| 855 | 0 | 0 | if (! $nntp) { | ||||
| 856 | 0 | croak __x("Cannot connect to NNTP on \"{host}\"\n", host => $host); | |||||
| 857 | } | ||||||
| 858 | 0 | 0 | if (! $nntp->postok) { | ||||
| 859 | 0 | $self->verbose (1, "Hmm, ", $nntp->host, " doesn't say \"posting ok\" ..."); | |||||
| 860 | } | ||||||
| 861 | } | ||||||
| 862 | 0 | return $self->{'nntp'}; | |||||
| 863 | } | ||||||
| 864 | |||||||
| 865 | sub nntp_close { | ||||||
| 866 | 0 | 0 | 0 | my ($self) = @_; | |||
| 867 | 0 | 0 | if (my $nntp = delete $self->{'nntp'}) { | ||||
| 868 | 0 | 0 | if (! $nntp->quit) { | ||||
| 869 | 0 | say "Error closing nntp: ",$nntp->message; | |||||
| 870 | } | ||||||
| 871 | } | ||||||
| 872 | } | ||||||
| 873 | |||||||
| 874 | # check that $group exists in the NNTP, return 1 if so, or 0 if not | ||||||
| 875 | sub nntp_group_check { | ||||||
| 876 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
| 877 | 0 | my $nntp = $self->nntp; | |||||
| 878 | 0 | 0 | if (! $nntp->group($group)) { | ||||
| 879 | 0 | print __x("rss2leafnode: no group \"{group}\" on host \"{host}\" | |||||
| 880 | (See the rss2leafnode man page for notes on creating groups.) | ||||||
| 881 | ", | ||||||
| 882 | host => $nntp->host, | ||||||
| 883 | group => $group); | ||||||
| 884 | 0 | return 0; | |||||
| 885 | } | ||||||
| 886 | |||||||
| 887 | 0 | return 1; | |||||
| 888 | } | ||||||
| 889 | |||||||
| 890 | sub nntp_message_id_exists { | ||||||
| 891 | 0 | 0 | 0 | my ($self, $msgid) = @_; | |||
| 892 | 0 | my $ret = $self->nntp->nntpstat($msgid); | |||||
| 893 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
| 0 | |||||||
| 894 | 0 | 0 | $self->verbose (2, "'$msgid' ", ($ret ? 'exists already' : 'new')); | ||||
| 895 | } elsif ($self->{'verbose'} >= 1) { | ||||||
| 896 | 0 | 0 | if ($ret) { | ||||
| 897 | 0 | $self->verbose (1, ' ', __('exists already')); | |||||
| 898 | } | ||||||
| 899 | } | ||||||
| 900 | 0 | return $ret; | |||||
| 901 | } | ||||||
| 902 | |||||||
| 903 | # post $msg to NNTP, return true if successful | ||||||
| 904 | sub nntp_post { | ||||||
| 905 | 0 | 0 | 0 | my ($self, $msg) = @_; | |||
| 906 | 0 | my $nntp = $self->nntp; | |||||
| 907 | 0 | 0 | if (! $nntp->post ($msg->as_string)) { | ||||
| 908 | 0 | say __x('Cannot post: {message}', | |||||
| 909 | message => scalar($nntp->message)); | ||||||
| 910 | 0 | return 0; | |||||
| 911 | } | ||||||
| 912 | 0 | return 1; | |||||
| 913 | } | ||||||
| 914 | |||||||
| 915 | |||||||
| 916 | #------------------------------------------------------------------------------ | ||||||
| 917 | # HTML title | ||||||
| 918 | |||||||
| 919 | # extra data associated against a HTTP::Response object | ||||||
| 920 | Hash::Util::FieldHash::fieldhash (my %resp_exiftool_info); | ||||||
| 921 | |||||||
| 922 | # return hashref { Title => $str, ... }, or empty {} if no exiftool etc | ||||||
| 923 | sub resp_exiftool_info { | ||||||
| 924 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 925 | 0 | 0 | defined $resp or return {}; | ||||
| 926 | 0 | 0 | if (! exists $resp_exiftool_info{$resp}) { | ||||
| 927 | 0 | $resp_exiftool_info{$resp} = _resp_exiftool_info($resp); | |||||
| 928 | ### exiftool info: $resp_exiftool_info{$resp} | ||||||
| 929 | } | ||||||
| 930 | 0 | return $resp_exiftool_info{$resp}; | |||||
| 931 | } | ||||||
| 932 | sub _resp_exiftool_info { | ||||||
| 933 | 0 | 0 | my ($resp) = @_; | ||||
| 934 | |||||||
| 935 | # Want ExifTool 8.22 to have PNG tEXt returned as utf8, but don't bother | ||||||
| 936 | # to enforce that. | ||||||
| 937 | # | ||||||
| 938 | # The returned fields from image formats with a defined charset are | ||||||
| 939 | # converted to the exiftool default "Charset" of utf8, and from other | ||||||
| 940 | # image formats the fields are bytes of something unknown. Might slightly | ||||||
| 941 | # like to know which is the case, and show raw bytes different from "bytes | ||||||
| 942 | # which ought to be utf8", but for now just Encode::decode_utf8() and let | ||||||
| 943 | # its Encode::FB_DEFAULT() put substitution chars for non-ascii non-utf8. | ||||||
| 944 | # | ||||||
| 945 | 0 | 0 | eval { require Image::ExifTool; 1 } || return {}; | ||||
| 0 | |||||||
| 0 | |||||||
| 946 | 0 | $resp->decode; | |||||
| 947 | 0 | my $cref = $resp->content_ref; | |||||
| 948 | 0 | return Image::ExifTool::ImageInfo | |||||
| 949 | ($cref, | ||||||
| 950 | ['Title','Author','Copyright','ImageSize'], # just these tags | ||||||
| 951 | {List => 0}); # get list values as comma separated | ||||||
| 952 | } | ||||||
| 953 | |||||||
| 954 | # $resp is a HTTP::Response, return title | ||||||
| 955 | sub html_title { | ||||||
| 956 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 957 | |||||||
| 958 | return (# for images prefer filename+size over URI::Title just filename | ||||||
| 959 | 0 | 0 | non_empty (html_title_exiftool_image($resp)) | ||||
| 0 | |||||||
| 0 | |||||||
| 960 | |||||||
| 961 | // non_empty (html_title_urititle($resp)) | ||||||
| 962 | // non_empty (html_title_exiftool($resp)) | ||||||
| 963 | // $resp->title); | ||||||
| 964 | } | ||||||
| 965 | sub html_title_urititle { | ||||||
| 966 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 967 | 0 | 0 | eval { require URI::Title } or return undef; | ||||
| 0 | |||||||
| 968 | |||||||
| 969 | # suppress some dodginess in URI::Title 1.82 | ||||||
| 970 | local $SIG{'__WARN__'} = sub { | ||||||
| 971 | 0 | 0 | my ($msg) = @_; | ||||
| 972 | 0 | 0 | $msg =~ /Use of uninitialized value/ or warn @_; | ||||
| 973 | 0 | }; | |||||
| 974 | 0 | $resp->decode; | |||||
| 975 | 0 | 0 | return URI::Title::title | ||||
| 976 | ({ url => ($resp->request->uri // ''), | ||||||
| 977 | data => $resp->content}); | ||||||
| 978 | } | ||||||
| 979 | sub html_title_exiftool_image { | ||||||
| 980 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 981 | 0 | 0 | $resp->content_type =~ m{^image/} or return; | ||||
| 982 | 0 | 0 | if (defined (my $title = html_title_exiftool($resp))) { | ||||
| 983 | 0 | return $title; | |||||
| 984 | } | ||||||
| 985 | 0 | 0 | my $info = resp_exiftool_info($resp) // return; | ||||
| 986 | ### html_title_exiftool_image() on: $info | ||||||
| 987 | 0 | 0 | defined $info->{'ImageSize'} or return; | ||||
| 988 | 0 | return $resp->filename.' '.$info->{'ImageSize'}; | |||||
| 989 | } | ||||||
| 990 | sub html_title_exiftool { | ||||||
| 991 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 992 | 0 | 0 | my $title = resp_exiftool_info($resp)->{'Title'} // return; | ||||
| 993 | 0 | return Encode::decode_utf8 ($title); | |||||
| 994 | } | ||||||
| 995 | |||||||
| 996 | |||||||
| 997 | #------------------------------------------------------------------------------ | ||||||
| 998 | # mime | ||||||
| 999 | |||||||
| 1000 | # prepended to "X-Mailer" header | ||||||
| 1001 | 1 | 1 | 6 | use constant mime_mailer_extra => "RSS2Leafnode $VERSION"; | |||
| 1 | 2 | ||||||
| 1 | 1777 | ||||||
| 1002 | |||||||
| 1003 | # $body is a MIME::Body object, append $str to it | ||||||
| 1004 | sub mime_body_append { | ||||||
| 1005 | 0 | 0 | 0 | my ($body, $str) = @_; | |||
| 1006 | 0 | $str = $body->as_string . "\n" . str_ensure_newline ($str); | |||||
| 1007 | 0 | 0 | my $IO = $body->open('w') | ||||
| 1008 | or die "rss2leafnode: body I/O open: $!"; | ||||||
| 1009 | 0 | $IO->print ($str); | |||||
| 1010 | 0 | 0 | $IO->close | ||||
| 1011 | or die "rss2leafnode: body I/O close: $!"; | ||||||
| 1012 | } | ||||||
| 1013 | |||||||
| 1014 | # if $str is not ascii then apply encode_mimewords() | ||||||
| 1015 | sub mimewords_non_ascii { | ||||||
| 1016 | 0 | 0 | 0 | my ($str) = @_; | |||
| 1017 | 0 | 0 | 0 | if (defined $str && ! is_ascii($str)) { | |||
| 1018 | 0 | require MIME::Words; | |||||
| 1019 | 0 | $str = MIME::Words::encode_mimewords (Encode::encode_utf8($str), | |||||
| 1020 | Charset => 'UTF-8'); | ||||||
| 1021 | } | ||||||
| 1022 | 0 | return $str; | |||||
| 1023 | } | ||||||
| 1024 | |||||||
| 1025 | sub mime_build { | ||||||
| 1026 | 0 | 0 | 0 | my ($self, $headers, @args) = @_; | |||
| 1027 | |||||||
| 1028 | # Headers in utf-8, the same as other text. The docs of | ||||||
| 1029 | # encode_mimewords() isn't clear, but seems to expect bytes of the | ||||||
| 1030 | # specified charset. | ||||||
| 1031 | 0 | foreach my $key (sort keys %$headers) { | |||||
| 1032 | 0 | $headers->{$key} | |||||
| 1033 | = mimewords_non_ascii(Text::Trim::trim($headers->{$key})); | ||||||
| 1034 | } | ||||||
| 1035 | |||||||
| 1036 | 0 | %$headers = (%$headers, @args); | |||||
| 1037 | 0 | 0 | $headers->{'Top'} //= 0; # default to a part not a toplevel | ||||
| 1038 | 0 | 0 | $headers->{'Encoding'} //= '-SUGGEST'; | ||||
| 1039 | |||||||
| 1040 | 0 | 0 | if ($headers->{'Top'}) { | ||||
| 1041 | 0 | my $now822 = rfc822_time_now(); | |||||
| 1042 | 0 | 0 | $headers->{'Date'} //= $now822; | ||||
| 1043 | 0 | $headers->{'Date-Received:'} = $now822; | |||||
| 1044 | } | ||||||
| 1045 | |||||||
| 1046 | 0 | 0 | if (utf8::is_utf8($headers->{'Data'})) { | ||||
| 1047 | 0 | warn 'Oops, mime_build() data should be bytes'; | |||||
| 1048 | } | ||||||
| 1049 | |||||||
| 1050 | # downgrade utf-8 to us-ascii if possible | ||||||
| 1051 | 0 | 0 | 0 | if ($headers->{'Type'} eq 'text/plain' | |||
| 0 | |||||||
| 0 | |||||||
| 1052 | && lc($headers->{'Charset'}||0) eq 'utf-8' | ||||||
| 1053 | && is_ascii ($headers->{'Data'})) { | ||||||
| 1054 | 0 | $headers->{'Charset'} = 'us-ascii'; | |||||
| 1055 | |||||||
| 1056 | # not sure mangling text/html body content is a good idea -- would only | ||||||
| 1057 | # want it on generated html, not downloaded | ||||||
| 1058 | # | ||||||
| 1059 | # if ($headers->{'Type'} eq 'text/html') { | ||||||
| 1060 | # $headers->{'Data'} =~ s{( | ||||||
| 1061 | # } | ||||||
| 1062 | } | ||||||
| 1063 | |||||||
| 1064 | 0 | @args = map {$_,$headers->{$_}} sort keys %$headers; | |||||
| 0 | |||||||
| 1065 | 0 | 0 | if ($self->{'verbose'} >= 4) { | ||||
| 1066 | 0 | require Data::Dumper; | |||||
| 1067 | 0 | $self->verbose (4, Data::Dumper->new([\@args],['mime headers'])->Dump); | |||||
| 1068 | } | ||||||
| 1069 | |||||||
| 1070 | 0 | require MIME::Entity; | |||||
| 1071 | 0 | my $top = MIME::Entity->build (Disposition => 'inline', @args); | |||||
| 1072 | |||||||
| 1073 | 0 | 0 | 0 | if ($headers->{'Top'} && ! defined $headers->{'X-Mailer:'}) { | |||
| 1074 | 0 | my $head = $top->head; | |||||
| 1075 | 0 | $head->set('X-Mailer', join_non_empty (', ', | |||||
| 1076 | $self->mime_mailer_extra, | ||||||
| 1077 | $head->get('X-Mailer'))); | ||||||
| 1078 | } | ||||||
| 1079 | |||||||
| 1080 | 0 | return $top; | |||||
| 1081 | } | ||||||
| 1082 | |||||||
| 1083 | # $resp is a HTTP::Response | ||||||
| 1084 | # Return a MIME::Entity which contains the response, and any further @headers. | ||||||
| 1085 | # If $self->{'render'} is true then render HTML to plain text. | ||||||
| 1086 | # | ||||||
| 1087 | sub mime_part_from_response { | ||||||
| 1088 | 0 | 0 | 0 | my ($self, $resp, @headers) = @_; | |||
| 1089 | |||||||
| 1090 | 0 | my $content_type = $resp->content_type; | |||||
| 1091 | 0 | $self->verbose (2, ' content-type: ',$content_type); | |||||
| 1092 | 0 | $resp->decode; | |||||
| 1093 | 0 | my $content = $resp->content; # the bytes | |||||
| 1094 | 0 | my $charset = $resp->content_charset; # and their charset | |||||
| 1095 | 0 | my $url = $resp->request->uri->as_string; | |||||
| 1096 | 0 | my $content_md5 = $resp->header('Content-MD5'); | |||||
| 1097 | |||||||
| 1098 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
| 1099 | = $self->render_maybe ($content, $content_type, $charset, $url); | ||||||
| 1100 | 0 | 0 | if ($rendered) { | ||||
| 1101 | 0 | undef $content_md5; | |||||
| 1102 | } | ||||||
| 1103 | |||||||
| 1104 | 0 | return $self->mime_build | |||||
| 1105 | ({ 'Content-Language:' => scalar($resp->header('Content-Language')), | ||||||
| 1106 | 'Content-Location:' => $url, | ||||||
| 1107 | 'Content-MD5:' => $content_md5, | ||||||
| 1108 | @headers, | ||||||
| 1109 | }, | ||||||
| 1110 | Type => $content_type, | ||||||
| 1111 | Charset => $charset, | ||||||
| 1112 | Data => $content, | ||||||
| 1113 | Filename => $resp->filename); | ||||||
| 1114 | } | ||||||
| 1115 | |||||||
| 1116 | |||||||
| 1117 | # set "Lines:" header per RFC 1036 | ||||||
| 1118 | # MIME::Entity 5.428 doesn't seem to have anything for this itself | ||||||
| 1119 | # this is after qp or base64, is that right? the actual message lines | ||||||
| 1120 | sub mime_entity_lines { | ||||||
| 1121 | 0 | 0 | 0 | my ($top) = @_; | |||
| 1122 | 0 | $top->head->set('Lines', str_count_lines ($top->stringify_body)); | |||||
| 1123 | } | ||||||
| 1124 | |||||||
| 1125 | #------------------------------------------------------------------------------ | ||||||
| 1126 | # XML::Twig stuff | ||||||
| 1127 | |||||||
| 1128 | # Return the text of $elt and treat child elements as improperly escaped | ||||||
| 1129 | # parts of the text too. | ||||||
| 1130 | # | ||||||
| 1131 |  # This is good for elements which are supposed to be HTML with  etc  | 
||||||
| 1132 | # escaped as <p>, but copes with feeds that don't have the necessary | ||||||
| 1133 | # escapes and thus come out with xml child elements under $elt. | ||||||
| 1134 | # | ||||||
| 1135 | # For elements which are supposed to be plain text with no markup and no | ||||||
| 1136 | # sub-elements this will at least make improper child text visible, though | ||||||
| 1137 | # it might not look very good. | ||||||
| 1138 | # | ||||||
| 1139 | # As of June 2010 http://www.drweil.com/drw/ecs/rss.xml is an example of | ||||||
| 1140 | # improperly escaped html. | ||||||
| 1141 | # | ||||||
| 1142 |  # FIXME: Any need to watch out for  | 
||||||
| 1143 | # | ||||||
| 1144 | sub elt_subtext { | ||||||
| 1145 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 1146 | 0 | 0 | defined $elt or return undef; | ||||
| 1147 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
| 0 | |||||||
| 1148 | 0 | return join ('', map {_elt_subtext_with_tags($_)} $elt->children); | |||||
| 0 | |||||||
| 1149 | } | ||||||
| 1150 | sub _elt_subtext_with_tags { | ||||||
| 1151 | 0 | 0 | my ($elt) = @_; | ||||
| 1152 | 0 | 0 | defined $elt or return undef; | ||||
| 1153 | 0 | 0 | if ($elt->is_text) { return $elt->text; } | ||||
| 0 | |||||||
| 1154 | 0 | return ($elt->start_tag | |||||
| 1155 | 0 | . join ('', map {_elt_subtext_with_tags($_)} $elt->children) | |||||
| 1156 | . $elt->end_tag); | ||||||
| 1157 | } | ||||||
| 1158 | |||||||
| 1159 |  # $elt contains xhtml   etc sub-elements.  Return a plain html string.   | 
||||||
| 1160 |  # Prefixes like  | 
||||||
| 1161 | # This relies on the map_xmlns mapping to give prefix "xhtml:" | ||||||
| 1162 | # | ||||||
| 1163 | sub elt_xhtml_to_html { | ||||||
| 1164 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 1165 | |||||||
| 1166 | # could probably do it destructively, but just in case | ||||||
| 1167 | 0 | $elt = $elt->copy; | |||||
| 1168 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($elt, 'xhtml'); | |||||
| 1169 | |||||||
| 1170 | # lose xmlns:xhtml="http://www.w3.org/1999/xhtml" | ||||||
| 1171 | 0 | $elt->strip_att('xmlns:xhtml'); | |||||
| 1172 | |||||||
| 1173 | # something fishy turns "href" to "xhtml:href", drop any "xhtml:" | ||||||
| 1174 | # bare "href" also gets turned into atom:href as the default namespace, | ||||||
| 1175 | # drop any "atom:" | ||||||
| 1176 | 0 | foreach my $child ($elt->descendants) { | |||||
| 1177 | 0 | foreach my $attname ($child->att_names) { | |||||
| 1178 | 0 | 0 | if ($attname =~ /^(xhtml|atom):(.*)/) { | ||||
| 1179 | 0 | $child->change_att_name($attname, $2); | |||||
| 1180 | } | ||||||
| 1181 | } | ||||||
| 1182 | } | ||||||
| 1183 | |||||||
| 1184 | 0 | my $old_pretty = $elt->set_pretty_print ('none'); | |||||
| 1185 | ### $old_pretty | ||||||
| 1186 | 0 | my $ret = $elt->xml_string; | |||||
| 1187 | 0 | $elt->set_pretty_print ($old_pretty); | |||||
| 1188 | 0 | return $ret; | |||||
| 1189 | |||||||
| 1190 | } | ||||||
| 1191 | |||||||
| 1192 | # elt_content_type() returns 'text', 'html', 'xhtml' or a mime type. | ||||||
| 1193 | # If no type="" attribute the default is 'text', except for RSS | ||||||
| 1194 |  #  | 
||||||
| 1195 | # | ||||||
| 1196 | # RSS http://www.debian.org/News/weekly/dwn.en.rdf circa Feb 2010 had some | ||||||
| 1197 |  # html in its  | 
||||||
| 1198 |  # plain text) and that RSS is all plain text outside  | 
||||||
| 1199 | # | ||||||
| 1200 |  #  | 
||||||
| 1201 | # not the formatting as html vs text. | ||||||
| 1202 | # | ||||||
| 1203 | @known{'/channel/item/dc:type'} = undef; | ||||||
| 1204 | # | ||||||
| 1205 | sub elt_content_type { | ||||||
| 1206 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 1207 | 0 | 0 | if (! defined $elt) { return undef; } | ||||
| 0 | |||||||
| 1208 | |||||||
| 1209 | 0 | 0 | 0 | if (defined (my $type = ($elt->att('atom:type') // $elt->att('type')))) { | |||
| 1210 | # type="application/xhtml+xml" at http://xmltwig.com/blog/index.atom, | ||||||
| 1211 | # dunno if it should be just "xhtml", but recognise it anyway | ||||||
| 1212 | 0 | 0 | if ($type eq 'application/xhtml+xml') { return 'xhtml'; } | ||||
| 0 | |||||||
| 1213 | 0 | return $type; | |||||
| 1214 | } | ||||||
| 1215 | 0 | 0 | if ($elt->root->tag eq 'feed') { | ||||
| 1216 | 0 |      return 'text';  # Atom  | 
|||||
| 1217 | } | ||||||
| 1218 | 0 | my $tag = $elt->tag; | |||||
| 1219 | 0 | 0 | if ($tag =~ /^itunes:/) { | ||||
| 1220 | # itunes spec is for text-only, no html markup | ||||||
| 1221 | 0 | return 'text'; | |||||
| 1222 | } | ||||||
| 1223 | 0 | 0 | 0 |    if ($tag eq 'description'           # RSS  | 
|||
| 1224 | || $tag eq 'content:encoded') { # same in content:encoded | ||||||
| 1225 | 0 | return 'html'; | |||||
| 1226 | } | ||||||
| 1227 | # other RSS is text | ||||||
| 1228 | 0 | return 'text'; | |||||
| 1229 | } | ||||||
| 1230 | |||||||
| 1231 | # $elt is an XML::Twig::Elt of an RSS or Atom text element. | ||||||
| 1232 | # Atom has a type="" attribute, RSS is html. Html or xhtml are rendered to | ||||||
| 1233 | # a single long line of plain text. | ||||||
| 1234 | # | ||||||
| 1235 | sub elt_to_rendered_line { | ||||||
| 1236 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 1237 | 0 | 0 | defined $elt or return; | ||||
| 1238 | |||||||
| 1239 | 0 | my $str; | |||||
| 1240 | 0 | my $type = elt_content_type ($elt); | |||||
| 1241 | 0 | 0 | if ($type eq 'xhtml') { | ||||
| 1242 | 0 | $str = elt_xhtml_to_html ($elt); | |||||
| 1243 | 0 | $type = 'html'; | |||||
| 1244 | } else { | ||||||
| 1245 | 0 | $str = elt_subtext($elt); | |||||
| 1246 | } | ||||||
| 1247 | 0 | 0 | if ($type eq 'html') { | ||||
| 1248 | 0 | $str = html_to_rendered_line($str); | |||||
| 1249 | } | ||||||
| 1250 | # plain 'text' or anything unrecognised collapsed too | ||||||
| 1251 | 0 | return non_empty(collapse_whitespace($str)); | |||||
| 1252 | } | ||||||
| 1253 | |||||||
| 1254 | sub html_to_rendered_line { | ||||||
| 1255 | 0 | 0 | 0 | my ($html) = @_; | |||
| 1256 | 0 | require HTML::FormatText; | |||||
| 1257 | 0 | return collapse_whitespace | |||||
| 1258 | (HTML::FormatText->format_string ($html, | ||||||
| 1259 | leftmargin => 0, | ||||||
| 1260 | rightmargin => 999)); | ||||||
| 1261 | } | ||||||
| 1262 | |||||||
| 1263 | |||||||
| 1264 | #------------------------------------------------------------------------------ | ||||||
| 1265 | # XML::RSS::Timing | ||||||
| 1266 | |||||||
| 1267 | sub twig_to_timingfields { | ||||||
| 1268 | 0 | 0 | 0 | my ($self, $twig) = @_; | |||
| 1269 | 0 | 0 | return if ! defined $twig; | ||||
| 1270 | 0 | my $root = $twig->root; | |||||
| 1271 | 0 | my %timingfields; | |||||
| 1272 | |||||||
| 1273 | 0 | 0 | if (my $ttl = $root->first_descendant('ttl')) { | ||||
| 1274 | 0 | $timingfields{'ttl'} = $ttl->trimmed_text; | |||||
| 1275 | } | ||||||
| 1276 | 0 | 0 | if (my $skipHours = $root->first_descendant('skipHours')) { | ||||
| 1277 | 0 | $timingfields{'skipHours'} = [map {$_->trimmed_text} $skipHours->children('hour')]; | |||||
| 0 | |||||||
| 1278 | } | ||||||
| 1279 | 0 | 0 | if (my $skipDays = $root->first_descendant('skipDays')) { | ||||
| 1280 | 0 | $timingfields{'skipDays'} = [map {$_->trimmed_text} $skipDays->children('day')]; | |||||
| 0 | |||||||
| 1281 | } | ||||||
| 1282 | |||||||
| 1283 | # "syn:updatePeriod" etc | ||||||
| 1284 | 0 | foreach my $key (qw(updatePeriod updateFrequency updateBase)) { | |||||
| 1285 | 0 | 0 | if (my $update = $root->first_descendant("syn:$key")) { | ||||
| 1286 | 0 | $timingfields{$key} = $update->trimmed_text; | |||||
| 1287 | } | ||||||
| 1288 | } | ||||||
| 1289 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
| 1290 | 0 | require Data::Dumper; | |||||
| 1291 | 0 | $self->verbose (2, | |||||
| 1292 | Data::Dumper->new([\%timingfields],['timingfields']) | ||||||
| 1293 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
| 1294 | } | ||||||
| 1295 | 0 | 0 | if (! %timingfields) { | ||||
| 1296 | 0 | return; # no info | |||||
| 1297 | } | ||||||
| 1298 | |||||||
| 1299 | # if XML::RSS::Timing doesn't like the values then don't record them | ||||||
| 1300 | 0 | 0 | return unless $self->timingfields_to_timing(\%timingfields); | ||||
| 1301 | |||||||
| 1302 | 0 | return \%timingfields; | |||||
| 1303 | } | ||||||
| 1304 | @known{qw(/channel/skipDays | ||||||
| 1305 | /channel/skipDays/day | ||||||
| 1306 | /channel/skipHours | ||||||
| 1307 | /channel/skipHours/hour | ||||||
| 1308 | /channel/ttl | ||||||
| 1309 | /channel/syn:updateBase | ||||||
| 1310 | /channel/syn:updatePeriod | ||||||
| 1311 | /channel/syn:updateFrequency)} = (); | ||||||
| 1312 | |||||||
| 1313 | # return an XML::RSS::Timing object, or undef | ||||||
| 1314 | sub timingfields_to_timing { | ||||||
| 1315 | 0 | 0 | 0 | my ($self, $timingfields) = @_; | |||
| 1316 | 0 | 0 | $timingfields // return undef; | ||||
| 1317 | |||||||
| 1318 | 0 | 0 | eval { require XML::RSS::Timing } || return undef; | ||||
| 0 | |||||||
| 1319 | 0 | my $timing = XML::RSS::Timing->new; | |||||
| 1320 | 0 | $timing->use_exceptions(0); | |||||
| 1321 | 0 | while (my ($key, $value) = each %$timingfields) { | |||||
| 1322 | 0 | 0 | if (ref $value) { | ||||
| 1323 | 0 | $timing->$key (@$value); | |||||
| 1324 | } else { | ||||||
| 1325 | 0 | $timing->$key ($value); | |||||
| 1326 | } | ||||||
| 1327 | } | ||||||
| 1328 | 0 | 0 | if (my @complaints = $timing->complaints) { | ||||
| 1329 | 0 | say __x('XML::RSS::Timing complains about {url}', | |||||
| 1330 | url => $self->{'uri'}); | ||||||
| 1331 | 0 | foreach my $complaint (@complaints) { | |||||
| 1332 | 0 | say " $complaint"; | |||||
| 1333 | } | ||||||
| 1334 | 0 | return undef; | |||||
| 1335 | } | ||||||
| 1336 | 0 | return $timing; | |||||
| 1337 | } | ||||||
| 1338 | |||||||
| 1339 | |||||||
| 1340 | #------------------------------------------------------------------------------ | ||||||
| 1341 | # rss2leafnode.status file | ||||||
| 1342 | |||||||
| 1343 | # $self->{'global_status'} is a hashref containing entries URL => STATUS, | ||||||
| 1344 | # where URL is a string and STATUS is a sub-hashref of information | ||||||
| 1345 | |||||||
| 1346 | 1 | 1 | 6 | use constant STATUS_EXPIRE_DAYS => 45; | |||
| 1 | 1 | ||||||
| 1 | 3407 | ||||||
| 1347 | |||||||
| 1348 | # read $status_filename into $self->{'global_status'} | ||||||
| 1349 | sub status_read { | ||||||
| 1350 | 0 | 0 | 0 | my ($self) = @_; | |||
| 1351 | 0 | $self->{'global_status'} = {}; | |||||
| 1352 | 0 | my $status_filename = $self->status_filename; | |||||
| 1353 | 0 | $self->verbose (2, 'read status: ', $status_filename); | |||||
| 1354 | |||||||
| 1355 | 0 | $! = 0; | |||||
| 1356 | 0 | my $global_status = do $status_filename; | |||||
| 1357 | 0 | 0 | if (! defined $global_status) { | ||||
| 1358 | 0 | 0 | if ($! == POSIX::ENOENT()) { | ||||
| 1359 | 0 | $self->verbose (2, "status file doesn't exist"); | |||||
| 1360 | } else { | ||||||
| 1361 | 0 | say "rss2leafnode: error in $status_filename\n$@"; | |||||
| 1362 | 0 | say "ignoring that file"; | |||||
| 1363 | } | ||||||
| 1364 | 0 | $global_status = {}; | |||||
| 1365 | } | ||||||
| 1366 | 0 | $self->{'global_status'} = $global_status; | |||||
| 1367 | } | ||||||
| 1368 | |||||||
| 1369 | # delete old entries from $self->{'global_status'} | ||||||
| 1370 | sub status_prune { | ||||||
| 1371 | 0 | 0 | 0 | my ($self) = @_; | |||
| 1372 | 0 | 0 | my $global_status = $self->{'global_status'} // return; | ||||
| 1373 | 0 | my $pruned = 0; | |||||
| 1374 | 0 | my $old_time = time() - STATUS_EXPIRE_DAYS * 86400; | |||||
| 1375 | 0 | foreach my $key (keys %$global_status) { | |||||
| 1376 | 0 | 0 | if ($global_status->{$key}->{'status-time'} < $old_time) { | ||||
| 1377 | 0 | $self->verbose (2, __x("discard old status {url}\n", url => $key)); | |||||
| 1378 | 0 | delete $global_status->{$key}; | |||||
| 1379 | 0 | $pruned++; | |||||
| 1380 | } | ||||||
| 1381 | } | ||||||
| 1382 | 0 | 0 | if ($pruned) { | ||||
| 1383 | 0 | $self->verbose (1, __xn("discard {count} old status entry\n", | |||||
| 1384 | "discard {count} old status entries\n", | ||||||
| 1385 | $pruned, | ||||||
| 1386 | count => $pruned)); | ||||||
| 1387 | } | ||||||
| 1388 | } | ||||||
| 1389 | |||||||
| 1390 | # save $self->{'global_status'} into the $status_filename | ||||||
| 1391 | sub status_save { | ||||||
| 1392 | 0 | 0 | 0 | my ($self, $status) = @_; | |||
| 1393 | 0 | $status->{'status-time'} = time(); | |||||
| 1394 | 0 | 0 | if ($status->{'timingfields'}) { | ||||
| 1395 | 0 | $status->{'timingfields'}->{'lastPolled'} = $status->{'status-time'}; | |||||
| 1396 | } | ||||||
| 1397 | |||||||
| 1398 | 0 | $self->status_prune; | |||||
| 1399 | |||||||
| 1400 | 0 | require Data::Dumper; | |||||
| 1401 | 0 | my $str = Data::Dumper->new([$self->{'global_status'}],['global_status']) | |||||
| 1402 | ->Indent(1)->Sortkeys(1)->Terse(1)->Useqq(1)->Dump; | ||||||
| 1403 | 0 | $str = <<"HERE"; | |||||
| 1404 | # rss2leafnode status file -- automatically generated -- DO NOT EDIT | ||||||
| 1405 | # | ||||||
| 1406 | # (If there seems to be something very wrong then you can delete this file | ||||||
| 1407 | # and it'll be started afresh on the next run.) | ||||||
| 1408 | |||||||
| 1409 | $str | ||||||
| 1410 | |||||||
| 1411 | |||||||
| 1412 | # Local variables: | ||||||
| 1413 | # mode: perl-mode | ||||||
| 1414 | # End: | ||||||
| 1415 | HERE | ||||||
| 1416 | |||||||
| 1417 | 0 | my $status_filename = $self->status_filename; | |||||
| 1418 | 0 | my $out; | |||||
| 1419 | 0 | 0 | 0 | (open $out, '>', $status_filename | |||
| 0 | |||||||
| 1420 | and print $out $str | ||||||
| 1421 | and close $out) | ||||||
| 1422 | or croak "rss2leafnode: cannot write to $status_filename: $!\n"; | ||||||
| 1423 | } | ||||||
| 1424 | |||||||
| 1425 | # return a hashref which has status information about $url, or undef if | ||||||
| 1426 | # nothing recorded about $url | ||||||
| 1427 | sub status_geturl { | ||||||
| 1428 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
| 1429 | 0 | 0 | $self->status_read if ! $self->{'global_status'}; | ||||
| 1430 | 0 | 0 | if (! $self->{'global_status'}->{$url}) { | ||||
| 1431 | 0 | $self->{'global_status'}->{$url} = { 'status-time' => time() }; | |||||
| 1432 | } | ||||||
| 1433 | 0 | return $self->{'global_status'}->{$url}; | |||||
| 1434 | } | ||||||
| 1435 | |||||||
| 1436 | # $resp is a HTTP::Response object from retrieving $url. | ||||||
| 1437 | # Optional $twig is an XML::Twig. | ||||||
| 1438 | # Record against $url any ETag, Last-Modified and ttl from $resp and $twig. | ||||||
| 1439 | # If $resp is an error return, or is undef, then do nothing. | ||||||
| 1440 | sub status_etagmod_resp { | ||||||
| 1441 | 0 | 0 | 0 | my ($self, $url, $resp, $twig) = @_; | |||
| 1442 | 0 | 0 | 0 | if ($resp && $resp->is_success) { | |||
| 1443 | 0 | my $status = $self->status_geturl ($url); | |||||
| 1444 | 0 | $status->{'Last-Modified'} = $resp->header('Last-Modified'); | |||||
| 1445 | 0 | $status->{'ETag'} = $resp->header('ETag'); | |||||
| 1446 | 0 | $status->{'timingfields'} = $self->twig_to_timingfields ($twig); | |||||
| 1447 | |||||||
| 1448 | 0 | 0 | 0 | if (! defined $status->{'ETag'} && ! defined $status->{'Last-Modified'}) { | |||
| 1449 | 0 | $self->verbose (1, " no ETag or Last-Modified"); | |||||
| 1450 | } | ||||||
| 1451 | 0 | 0 | if (defined (my $comments_count = $self->{'comments_count'})) { | ||||
| 1452 | 0 | $status->{'comments_count'} = $comments_count; | |||||
| 1453 | } | ||||||
| 1454 | |||||||
| 1455 | 0 | 0 | if ($twig) { | ||||
| 1456 | # record previously applied newest option | ||||||
| 1457 | 0 | $status->{'rss_newest_only'} = $self->{'rss_newest_only'}; | |||||
| 1458 | |||||||
| 1459 | # if (rss_newest_cmp($self,$status) > 0) { | ||||||
| 1460 | # # the newest number increases | ||||||
| 1461 | # } | ||||||
| 1462 | } | ||||||
| 1463 | 0 | foreach my $key (keys %$status) { | |||||
| 1464 | 0 | 0 | if (! defined $status->{$key}) { delete $status->{$key} } | ||||
| 0 | |||||||
| 1465 | } | ||||||
| 1466 | 0 | $self->status_save($status); | |||||
| 1467 | } | ||||||
| 1468 | } | ||||||
| 1469 | |||||||
| 1470 | # update recorded status for a $url with unchanged contents | ||||||
| 1471 | sub status_unchanged { | ||||||
| 1472 | 0 | 0 | 0 | my ($self, $url) = @_; | |||
| 1473 | 0 | $self->verbose (1, ' ', __('unchanged')); | |||||
| 1474 | 0 | $self->status_save ($self->status_geturl ($url)); | |||||
| 1475 | } | ||||||
| 1476 | |||||||
| 1477 | # $req is a HTTP::Request object. | ||||||
| 1478 | # Add "If-None-Match" and/or "If-Modified-Since" headers to it based on what | ||||||
| 1479 | # the status file has recorded from when we last fetched the url in $req. | ||||||
| 1480 | # Return 1 to download, 0 if nothing expected yet by RSS timing fields | ||||||
| 1481 | # | ||||||
| 1482 | sub status_etagmod_req { | ||||||
| 1483 | 0 | 0 | 0 | my ($self, $req, $for_rss) = @_; | |||
| 1484 | 0 | 0 | $self->{'global_status'} or $self->status_read; | ||||
| 1485 | |||||||
| 1486 | 0 | my $url = $req->uri->as_string; | |||||
| 1487 | my $status = $self->{'global_status'}->{$url} | ||||||
| 1488 | 0 | 0 | // do { | ||||
| 1489 | 0 | $self->verbose (2, __x("no status info for {url}\n", url => $url)); | |||||
| 1490 | 0 | return 1; # want download | |||||
| 1491 | }; | ||||||
| 1492 | |||||||
| 1493 | 0 | 0 | if ($for_rss) { | ||||
| 1494 | # if status says the last download was for only a certain number of | ||||||
| 1495 | # newest, then force a re-download if that option now different | ||||||
| 1496 | 0 | 0 | if (! str_equal($self->{'rss_newest_only'}, | ||||
| 1497 | $status->{'rss_newest_only'})) { | ||||||
| 1498 | 0 | return 1; # want download | |||||
| 1499 | } | ||||||
| 1500 | } | ||||||
| 1501 | |||||||
| 1502 | 0 | 0 | if (my $timing = $self->timingfields_to_timing ($status->{'timingfields'})) { | ||||
| 1503 | 0 | my $next = $timing->nextUpdate; | |||||
| 1504 | 0 | my $now = time(); | |||||
| 1505 | 0 | 0 | if ($next > $now) { | ||||
| 1506 | 0 | $self->verbose (1, ' ', | |||||
| 1507 | __x('timing: next update {time} (local time)', | ||||||
| 1508 | time => POSIX::strftime ("%H:%M:%S %a %d %b %Y", | ||||||
| 1509 | localtime($next)))); | ||||||
| 1510 | 0 | 0 | 0 | if (eval 'use Time::Duration::Locale; 1' | |||
| 1511 | || eval 'use Time::Duration; 1') { | ||||||
| 1512 | 0 | $self->verbose (1, ' ', __x('which is {duration} from now', | |||||
| 1513 | duration => duration($next-$now))); | ||||||
| 1514 | } | ||||||
| 1515 | 0 | return 0; # no update yet | |||||
| 1516 | } | ||||||
| 1517 | } | ||||||
| 1518 | 0 | 0 | if (defined (my $lastmod = $status->{'Last-Modified'})) { | ||||
| 1519 | 0 | $req->header('If-Modified-Since' => $lastmod); | |||||
| 1520 | } | ||||||
| 1521 | 0 | 0 | if (defined (my $etag = $status->{'ETag'})) { | ||||
| 1522 | 0 | $req->header('If-None-Match' => $etag); | |||||
| 1523 | } | ||||||
| 1524 | 0 | return 1; | |||||
| 1525 | } | ||||||
| 1526 | |||||||
| 1527 |  # return -1 if x | 
||||||
| 1528 | # sub rss_newest_cmp { | ||||||
| 1529 | # my ($x, $y) = @_; | ||||||
| 1530 | # if ($x->{'rss_newest_only'}) { | ||||||
| 1531 | # if (! $y->{'rss_newest_only'}) { | ||||||
| 1532 | # return -1; # x finite, y infinite | ||||||
| 1533 | # } | ||||||
| 1534 | # # x and y finite | ||||||
| 1535 | # return ($x->{'rss_newest_only'} <=> $y->{'rss_newest_only'}); | ||||||
| 1536 | # } else { | ||||||
| 1537 | # # x infinite, so 1 if y finite, 0 if y infinite too | ||||||
| 1538 | # return !! $y->{'rss_newest_only'}; | ||||||
| 1539 | # } | ||||||
| 1540 | # } | ||||||
| 1541 | sub str_equal { | ||||||
| 1542 | 0 | 0 | 0 | my ($x, $y) = @_; | |||
| 1543 | 0 | 0 | return ((defined $x && defined $y && $x eq $y) | ||||
| 1544 | || (! defined $x && ! defined $y)); | ||||||
| 1545 | } | ||||||
| 1546 | |||||||
| 1547 | #------------------------------------------------------------------------------ | ||||||
| 1548 | # render html | ||||||
| 1549 | |||||||
| 1550 | # $content_type is a string like "text/html" or "text/plain". | ||||||
| 1551 | # $content is data as raw bytes. | ||||||
| 1552 | # $charset is the character set of those bytes, eg. "utf-8". | ||||||
| 1553 | # | ||||||
| 1554 | # If the $render option is set, and $content_type is 'text/html', then | ||||||
| 1555 | # render $content down to 'text/plain', using either HTML::FormatText or | ||||||
| 1556 | # Lynx. | ||||||
| 1557 | # The return is a new triplet ($content, $content_type, $charset). | ||||||
| 1558 | # | ||||||
| 1559 | sub render_maybe { | ||||||
| 1560 | 0 | 0 | 0 | my ($self, $content, $content_type, $charset, $base_url) = @_; | |||
| 1561 | 0 | my $rendered = 0; | |||||
| 1562 | 0 | 0 | 0 | if ($self->{'render'} && $content_type eq 'text/html') { | |||
| 1563 | |||||||
| 1564 | 0 | my $class = $self->{'render'}; | |||||
| 1565 | 0 | 0 | if ($class !~ /^HTML::/) { $class = "HTML::FormatText::\u$class"; } | ||||
| 0 | |||||||
| 1566 | 0 | $class =~ s/::1$//; # "::1" is $render=1 for plain HTML::FormatText | |||||
| 1567 | 0 | require Module::Load; | |||||
| 1568 | 0 | Module::Load::load ($class); | |||||
| 1569 | |||||||
| 1570 | 0 | 0 | if ($class =~ /^HTML::FormatText($|::WithLinks)/) { | ||||
| 1571 | # Believe HTML::FormatText (as of version 2.04) doesn't know much | ||||||
| 1572 | # about input or output charsets, but it can be tricked into getting | ||||||
| 1573 | # pretty reasonable results by putting wide chars through it. | ||||||
| 1574 | # Likewise HTML::FormatText::WithLinks (as of version 0.11). | ||||||
| 1575 | # | ||||||
| 1576 | # FIXME: decode() can error out on bad charset | ||||||
| 1577 | 0 | $content = Encode::decode ($charset, $content); | |||||
| 1578 | 0 | local $SIG{'__WARN__'} = \&_warn_suppress_unknown_base; | |||||
| 1579 | 0 | $content = $class->format_string | |||||
| 1580 | ($content, | ||||||
| 1581 | base => $base_url, | ||||||
| 1582 | leftmargin => 0, | ||||||
| 1583 | rightmargin => $self->{'render_width'}); | ||||||
| 1584 | 0 | $content = Encode::encode_utf8 ($content); | |||||
| 1585 | |||||||
| 1586 | } else { | ||||||
| 1587 | # HTML::FormatExternal style charset specs | ||||||
| 1588 | 0 | $content = $class->format_string | |||||
| 1589 | ($content, | ||||||
| 1590 | base => $base_url, | ||||||
| 1591 | leftmargin => 0, | ||||||
| 1592 | rightmargin => $self->{'render_width'}, | ||||||
| 1593 | input_charset => $charset, | ||||||
| 1594 | output_charset => 'utf-8'); | ||||||
| 1595 | } | ||||||
| 1596 | 0 | $charset = 'UTF-8'; | |||||
| 1597 | 0 | $content_type = 'text/plain'; | |||||
| 1598 | 0 | $rendered = 1; | |||||
| 1599 | } | ||||||
| 1600 | 0 | return ($content, $content_type, $charset, $rendered); | |||||
| 1601 | } | ||||||
| 1602 | sub _warn_suppress_unknown_base { | ||||||
| 1603 | 0 | 0 | my ($msg) = @_; | ||||
| 1604 | 0 | 0 | $msg =~ /^Unknown configure option 'base'/ | ||||
| 1605 | or warn $msg; | ||||||
| 1606 | } | ||||||
| 1607 | |||||||
| 1608 | # $str is a wide-char string of text | ||||||
| 1609 | sub text_wrap { | ||||||
| 1610 | 0 | 0 | 0 | my ($self, $str, $prefix) = @_; | |||
| 1611 | 0 | 0 | if (! defined $prefix) { $prefix = ''; } | ||||
| 0 | |||||||
| 1612 | 0 | require Text::WrapI18N; | |||||
| 1613 | 0 | local $Text::WrapI18N::columns = $self->{'render_width'} + 1; | |||||
| 1614 | 0 | local $Text::WrapI18N::unexpand = 0; # no tabs in output | |||||
| 1615 | 0 | local $Text::WrapI18N::huge = 'overflow'; # don't break long words | |||||
| 1616 | 0 | $str =~ tr/\n/ /; | |||||
| 1617 | 0 | 0 | my $second_prefix = (length($prefix) ? ' 'x(length($prefix)+2) : ''); | ||||
| 1618 | 0 | return Text::WrapI18N::wrap($prefix, $second_prefix, $str); | |||||
| 1619 | } | ||||||
| 1620 | |||||||
| 1621 | #------------------------------------------------------------------------------ | ||||||
| 1622 | # Face icons | ||||||
| 1623 | |||||||
| 1624 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
| 1625 | # return a string value for the Face: header, or undef if no icon | ||||||
| 1626 | sub item_to_face { | ||||||
| 1627 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 1628 | 0 | 0 | $self->{'get_icon'} || return; | ||||
| 1629 | 0 | 0 | my ($uri, $width, $height) = $self->item_image_uwh ($item) | ||||
| 1630 | or return; | ||||||
| 1631 | 0 | 0 | $self->face_wh_ok ($width, $height) || return; | ||||
| 1632 | 0 | return $self->download_face ($uri, $width, $height); | |||||
| 1633 | } | ||||||
| 1634 | |||||||
| 1635 | # $item is an XML::Twig::Elt of an RSS or Atom item | ||||||
| 1636 |  # return values ($uri, $width, $height) of the  | 
||||||
| 1637 | # | ||||||
| 1638 | sub item_image_uwh { | ||||||
| 1639 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 1640 | ### item_image_uwh() ... | ||||||
| 1641 | |||||||
| 1642 | 0 | foreach my $where ($item, | |||||
| 1643 | item_to_channel($item)) { | ||||||
| 1644 | ### image text: $where->first_child_text('image') | ||||||
| 1645 | |||||||
| 1646 | # identi.ca | ||||||
| 1647 | 0 | 0 | if (my $actor = $where->first_child('activity:actor')) { | ||||
| 1648 | 0 | my ($url, $width, $height); | |||||
| 1649 | 0 | foreach my $link_elt ($actor->children('link')) { | |||||
| 1650 | 0 | 0 | 0 | ($link_elt->att('rel')||$link_elt->att('atom:rel')||'') | |||
| 1651 | eq 'avatar' or next; | ||||||
| 1652 | 0 | 0 | $url = $link_elt->att('href') // $link_elt->att('atom:href') // next; | ||||
| 0 | |||||||
| 1653 | 0 | my $this_width = $link_elt->att('media:width'); | |||||
| 1654 | 0 | 0 | 0 | next if (defined $width | |||
| 0 | |||||||
| 1655 | && defined $this_width | ||||||
| 1656 | && $width < $this_width); # prefer smallest | ||||||
| 1657 | 0 | $url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($link_elt, $url); | |||||
| 1658 | 0 | 0 | $width = ($this_width || 0); | ||||
| 1659 | 0 | 0 | $height = ($link_elt->att('media:height') || 0); | ||||
| 1660 | ### $url | ||||||
| 1661 | ### $width | ||||||
| 1662 | ### $height | ||||||
| 1663 | } | ||||||
| 1664 | 0 | 0 | if (defined $url) { | ||||
| 1665 | 0 | return ($url, $width, $height); | |||||
| 1666 | } | ||||||
| 1667 | } | ||||||
| 1668 | |||||||
| 1669 | # RSS | ||||||
| 1670 |      #  | 
||||||
| 1671 |      #    | 
||||||
| 1672 |      #    | 
||||||
| 1673 |      #    | 
||||||
| 1674 | # | ||||||
| 1675 | 0 | 0 | if (my $image_elt = $where->first_child('image')) { | ||||
| 1676 | 0 | my $url_elt; # XML::Twig::Elt where the url came from | |||||
| 1677 | my $url; # url string | ||||||
| 1678 | 0 | 0 | if ($url_elt = $image_elt->first_child('url')) { | ||||
| 1679 | 0 | $url = $url_elt->trimmed_text; | |||||
| 1680 | } else { | ||||||
| 1681 | # Cooper Hewitt museum http://blog.cooperhewitt.org/rss/?limit=10 | ||||||
| 1682 |          # item  | 
||||||
| 1683 |          #     | 
||||||
| 1684 | # ]]> | ||||||
| 1685 | # | ||||||
| 1686 | # don't want to encourage dodginess like this, but picking it out | ||||||
| 1687 | # isn't too hard | ||||||
| 1688 | 0 | 0 |          if ($image_elt->text =~ / | 
||||
| 1689 | ### image from html: $1 | ||||||
| 1690 | 0 | $url_elt = $image_elt; | |||||
| 1691 | 0 | $url = $1; | |||||
| 1692 | } | ||||||
| 1693 | } | ||||||
| 1694 | 0 | 0 | if (is_non_empty ($url)) { | ||||
| 1695 | 0 | my $width = $image_elt->first_child_text('width'); | |||||
| 1696 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
| 1697 | 0 | $width = 0; | |||||
| 1698 | } | ||||||
| 1699 | 0 | my $height = $image_elt->first_child_text('height'); | |||||
| 1700 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
| 1701 | 0 | $height = 0; | |||||
| 1702 | } | ||||||
| 1703 | ### item_image_uwh() RSS: $url | ||||||
| 1704 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($url_elt, $url), | |||||
| 1705 | $width, $height); | ||||||
| 1706 | } | ||||||
| 1707 | } | ||||||
| 1708 | |||||||
| 1709 |      # Atom channel  | 
||||||
| 1710 |      # or   channel  | 
||||||
| 1711 | # | ||||||
| 1712 |      #  | 
||||||
| 1713 | # is bigger than the RSS 48x48, so would probably need shrinking. Rate | ||||||
| 1714 |      # it below  | 
||||||
| 1715 | # | ||||||
| 1716 |      #  | 
||||||
| 1717 | # Is it better to show the channel icon, being the From person? | ||||||
| 1718 | { | ||||||
| 1719 | 0 | my $elt; | |||||
| 0 | |||||||
| 1720 | 0 | my ($width, $height); | |||||
| 1721 | my $url = ((($elt = $where->first_child('icon')) | ||||||
| 1722 | && non_empty ($elt->text)) | ||||||
| 1723 | || (($elt = $where->first_child('logo')) | ||||||
| 1724 | && non_empty ($elt->text)) | ||||||
| 1725 | || (($elt = $where->first_child('itunes:image')) | ||||||
| 1726 | && non_empty ($elt->att('href'))) | ||||||
| 1727 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
| 1728 | && is_non_empty ($elt->att('url')) | ||||||
| 1729 | && do { | ||||||
| 1730 | $width = $elt->att('width'); | ||||||
| 1731 | $height = $elt->att('height'); | ||||||
| 1732 | $elt->att('url') }) | ||||||
| 1733 | # seen att('atom:url' rather than plain 'url' ... | ||||||
| 1734 | || (($elt = $where->first_child('media:thumbnail')) | ||||||
| 1735 | && is_non_empty ($elt->att('atom:url')) | ||||||
| 1736 | 0 | 0 | && do { | ||||
| 1737 | $width = $elt->att('width'); | ||||||
| 1738 | $height = $elt->att('height'); | ||||||
| 1739 | $elt->att('atom:url') })); | ||||||
| 1740 | ### $url | ||||||
| 1741 | 0 | 0 | if ($url) { | ||||
| 1742 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($width) && $width > 0) { | |||
| 1743 | 0 | $width = 0; | |||||
| 1744 | } | ||||||
| 1745 | 0 | 0 | 0 | unless (Scalar::Util::looks_like_number($height) && $height > 0) { | |||
| 1746 | 0 | $height = 0; | |||||
| 1747 | } | ||||||
| 1748 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
| 1749 | $width, | ||||||
| 1750 | $height); | ||||||
| 1751 | } | ||||||
| 1752 | } | ||||||
| 1753 | |||||||
| 1754 | # status.net for rss 1.0 | ||||||
| 1755 |      #  | 
||||||
| 1756 | 0 | 0 | if (my $elt = $where->first_child('statusnet:postIcon')) { | ||||
| 1757 | 0 | 0 | if (is_non_empty (my $url = $elt->att('rdf:resource'))) { | ||||
| 1758 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | |||||
| 1759 | 0, 0); # unknown size | ||||||
| 1760 | } | ||||||
| 1761 | } | ||||||
| 1762 | |||||||
| 1763 |      #  | 
||||||
| 1764 | # eg. from blogger.com | ||||||
| 1765 |      #  | 
||||||
| 1766 | { | ||||||
| 1767 | 0 | my $elt; | |||||
| 0 | |||||||
| 1768 | 0 | 0 | 0 | if (($elt = $where->first_child('author')) | |||
| 0 | |||||||
| 0 | |||||||
| 1769 | && ($elt = $elt->first_child('gd:image')) | ||||||
| 1770 | && (is_non_empty (my $url = $elt->att('src') // $elt->att('atom:src')))) { | ||||||
| 1771 | ### $url | ||||||
| 1772 | 0 | 0 | return (App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $url), | ||||
| 0 | |||||||
| 1773 | $elt->att('width') || $elt->att('atom:width') || 0, | ||||||
| 1774 | $elt->att('height') || $elt->att('atom:height') || 0); | ||||||
| 1775 | } | ||||||
| 1776 | } | ||||||
| 1777 | } | ||||||
| 1778 | 0 | return; | |||||
| 1779 | } | ||||||
| 1780 | @known{qw(/channel/logo | ||||||
| 1781 | /channel/icon | ||||||
| 1782 | /channel/image | ||||||
| 1783 | /channel/image/url | ||||||
| 1784 | /channel/image/width | ||||||
| 1785 | /channel/image/height | ||||||
| 1786 | /channel/image/title | ||||||
| 1787 | /channel/image/link | ||||||
| 1788 | /channel/image/description | ||||||
| 1789 | /channel/itunes:image | ||||||
| 1790 | /channel/statusnet:postIcon | ||||||
| 1791 | |||||||
| 1792 | /channel/item/image | ||||||
| 1793 | /channel/item/media:thumbnail | ||||||
| 1794 | /channel/item/statusnet:postIcon | ||||||
| 1795 | )} = (); | ||||||
| 1796 | |||||||
| 1797 | # $resp is a HTTP::Response | ||||||
| 1798 | # return a string value for the Face: header, or undef if no icon | ||||||
| 1799 | sub http_resp_to_face { | ||||||
| 1800 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 1801 | 0 | 0 | $self->{'get_icon'} || return; | ||||
| 1802 | |||||||
| 1803 | 0 | 0 | my $uri = http_resp_favicon_uri($resp) || return; | ||||
| 1804 | 0 | $self->verbose (2, ' response favicon URI: ', $uri); | |||||
| 1805 | 0 | return $self->download_face ($uri, 0, 0); | |||||
| 1806 | } | ||||||
| 1807 | |||||||
| 1808 | # $resp is a HTTP::Response | ||||||
| 1809 | # if it's a html with a favicon link return a URI object of that image | ||||||
| 1810 | # | ||||||
| 1811 | # http://www.w3.org/2005/10/howto-favicon | ||||||
| 1812 | # | ||||||
| 1813 | sub http_resp_favicon_uri { | ||||||
| 1814 | 0 | 0 | 0 | my ($resp) = @_; | |||
| 1815 | 0 | 0 | $resp->headers->content_is_html || return; | ||||
| 1816 | 0 | require HTML::Parser; | |||||
| 1817 | 0 | my $href; | |||||
| 1818 | my $p; | ||||||
| 1819 | $p = HTML::Parser->new (api_version => 3, | ||||||
| 1820 | start_h => [ sub { | ||||||
| 1821 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
| 1822 | 0 | 0 | 0 | if ($tagname eq 'link' | |||
| 1823 | && $attr->{'rel'} eq 'icon') { | ||||||
| 1824 | 0 | $href = $attr->{'href'}; | |||||
| 1825 | 0 | $p->eof; | |||||
| 1826 | } | ||||||
| 1827 | 0 | }, "tagname, attr"]); | |||||
| 1828 | 0 | $resp->decode; | |||||
| 1829 | 0 | $p->parse ($resp->content); | |||||
| 1830 | 0 | 0 | return $href && URI->new_abs ($href, $resp->base); | ||||
| 1831 | } | ||||||
| 1832 | |||||||
| 1833 | # return base64 string value for "Face:" header | ||||||
| 1834 | sub download_face { | ||||||
| 1835 | 0 | 0 | 0 | my ($self, $uri, $width, $height) = @_; | |||
| 1836 | 0 | my $key = $uri->canonical->as_string; | |||||
| 1837 | 0 | 0 | if (! exists $self->{'download_face'}->{$key}) { | ||||
| 1838 | 0 | $self->{'download_face'}->{$key} | |||||
| 1839 | = $self->download_face_uncached ($uri, $width, $height); | ||||||
| 1840 | } | ||||||
| 1841 | 0 | return $self->{'download_face'}->{$key}; | |||||
| 1842 | } | ||||||
| 1843 | sub download_face_uncached { | ||||||
| 1844 | 0 | 0 | 0 | my ($self, $url, $width, $height) = @_; | |||
| 1845 | |||||||
| 1846 | 0 | $self->{'download_face_uncached'} = $url; | |||||
| 1847 | 0 | $self->verbose (1, ' image download: ', $url); | |||||
| 1848 | |||||||
| 1849 | 0 | require HTTP::Request; | |||||
| 1850 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
| 1851 | 0 | my $resp = $self->ua->request($req); | |||||
| 1852 | 0 | 0 | if (! $resp->is_success) { | ||||
| 1853 | 0 | print __x(" no image: {status}\n", | |||||
| 1854 | status => $resp->status_line); | ||||||
| 1855 | 0 | return; | |||||
| 1856 | } | ||||||
| 1857 | |||||||
| 1858 | 0 | my $type = $resp->content_type; | |||||
| 1859 | ### $type | ||||||
| 1860 | # FIXME: is mime=>$type the right way? could give it a look at the url | ||||||
| 1861 | # basename or server's suggested filename too, for Read() to use the | ||||||
| 1862 | # extension. | ||||||
| 1863 | 0 | 0 | 0 | if ($type eq 'image/vnd.microsoft.icon' || $type eq 'image/x-icon') { | |||
| 0 | |||||||
| 1864 | # mime.xml of imagemagick 6.6.0 only has "image/x-ico", and nothing for | ||||||
| 1865 | # ico in magic.xml | ||||||
| 1866 | 0 | $type = 'ico'; | |||||
| 1867 | } elsif ($type =~ m{^image/(.*)$}i) { | ||||||
| 1868 | 0 | $type = $1; | |||||
| 1869 | } else { | ||||||
| 1870 | 0 | $self->verbose (2, 'ignore non-image icon type: ',$type); | |||||
| 1871 | 0 | return; | |||||
| 1872 | } | ||||||
| 1873 | |||||||
| 1874 | 0 | $resp->decode; | |||||
| 1875 | 0 | my $data = $resp->content; | |||||
| 1876 | 0 | 0 | 0 | if ($type ne 'png' | |||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 1877 | || $width == 0 || $height == 0 | ||||||
| 1878 | || $width > 48 || $height > 48) { | ||||||
| 1879 | 0 | 0 | $data = $self->imagemagick_to_png($type,$data) // return; | ||||
| 1880 | } | ||||||
| 1881 | 0 | $self->verbose (2, " image for Face ",length($data)," bytes"); | |||||
| 1882 | |||||||
| 1883 | # use a space as a separator since MIME::Entity will collapse out a | ||||||
| 1884 | # newline and make an enormous long word which then can't be split across | ||||||
| 1885 | # header lines and will likely exceed the nntp 998 char single-line limit | ||||||
| 1886 | 0 | require MIME::Base64; | |||||
| 1887 | 0 | $data = MIME::Base64::encode_base64($data, " "); | |||||
| 1888 | ### $data | ||||||
| 1889 | |||||||
| 1890 | 0 | return $data; | |||||
| 1891 | } | ||||||
| 1892 | |||||||
| 1893 | sub face_wh_ok { | ||||||
| 1894 | 0 | 0 | 0 | my ($self, $width, $height) = @_; | |||
| 1895 | |||||||
| 1896 | 0 | 0 | 0 | if ($width > 0 && $width > 2*$height) { | |||
| 1897 | # some obnoxious banner | ||||||
| 1898 | 0 | $self->verbose (1, ' ', | |||||
| 1899 | __x('image is a banner ({width}x{height}), ignore', | ||||||
| 1900 | width => $width, height => $height)); | ||||||
| 1901 | 0 | return 0; | |||||
| 1902 | } | ||||||
| 1903 | 0 | return 1; | |||||
| 1904 | } | ||||||
| 1905 | |||||||
| 1906 | #------------------------------------------------------------------------------ | ||||||
| 1907 | # ImageMagick bits | ||||||
| 1908 | |||||||
| 1909 | # $type is "gif", "ico" etc, $data is an image in a byte string | ||||||
| 1910 | # return a byte string of png, or undef if $data unrecognised | ||||||
| 1911 | sub imagemagick_to_png { | ||||||
| 1912 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
| 1913 | ### $type | ||||||
| 1914 | 0 | 0 | my $image = $self->imagemagick_from_data($type,$data) // return; | ||||
| 1915 | |||||||
| 1916 | 0 | my $width = $image->Get('width'); | |||||
| 1917 | 0 | my $height = $image->Get('height'); | |||||
| 1918 | ### compress: $image->Get('compression') | ||||||
| 1919 | 0 | $self->verbose (2, " image ${width}x${height}"); | |||||
| 1920 | 0 | 0 | 0 | if ($width == 0 || $height == 0) { | |||
| 1921 | 0 | return; | |||||
| 1922 | } | ||||||
| 1923 | 0 | 0 | 0 | if ($width <= 48 && $height <= 48 && $type eq 'png') { | |||
| 0 | |||||||
| 1924 | 0 | return $data; | |||||
| 1925 | } | ||||||
| 1926 | |||||||
| 1927 | # having downloaded the image is it better to keep a banner but shrink it, | ||||||
| 1928 | # or discard as no good? | ||||||
| 1929 | # | ||||||
| 1930 | # $self->face_wh_ok ($width, $height) || return; | ||||||
| 1931 | |||||||
| 1932 | 0 | 0 | 0 | if ($width > 48 || $height > 48) { | |||
| 1933 | 0 | my $factor; | |||||
| 1934 | 0 | 0 | 0 | if ($width <= 2*48 && $height <= 2*48) { | |||
| 1935 | 0 | $factor = 0.5; | |||||
| 1936 | } else { | ||||||
| 1937 | 0 | $factor = min (48 / $width, 48 / $height); | |||||
| 1938 | } | ||||||
| 1939 | 0 | $width = POSIX::ceil ($width * $factor); | |||||
| 1940 | 0 | $height = POSIX::ceil ($height * $factor); | |||||
| 1941 | 0 | $self->verbose (2, " image shrink by $factor to ${width}x${height}"); | |||||
| 1942 | # cf LiquidResize() or plain Resize() | ||||||
| 1943 | 0 | $image->AdaptiveResize (width => $width, height => $height); | |||||
| 1944 | } | ||||||
| 1945 | |||||||
| 1946 | 0 | my $ret = $image->Set (magick => 'PNG8'); | |||||
| 1947 | ### ret: "$ret" | ||||||
| 1948 | ### ret: $ret+0 | ||||||
| 1949 | 0 | 0 | if ($ret != 0) { | ||||
| 1950 | 0 | print "oops, imagemagick doesn't like PNG8: $ret\n"; | |||||
| 1951 | 0 | return; | |||||
| 1952 | } | ||||||
| 1953 | ### compress: $image->Get('compression') | ||||||
| 1954 | |||||||
| 1955 | # $image->Write ('/tmp/x.png'); | ||||||
| 1956 | 0 | ($data) = $image->ImageToBlob (); | |||||
| 1957 | 0 | return $data; | |||||
| 1958 | } | ||||||
| 1959 | |||||||
| 1960 | |||||||
| 1961 | # $type is "png", "ico" etc, $data is an image in a byte string | ||||||
| 1962 | # return a Image::Magick object, or undef if Perl-Magick not available | ||||||
| 1963 | sub imagemagick_from_data { | ||||||
| 1964 | 0 | 0 | 0 | my ($self, $type, $data) = @_; | |||
| 1965 | ### imagemagick_from_data(): $type | ||||||
| 1966 | 0 | 0 | eval { require Image::Magick } or return; | ||||
| 0 | |||||||
| 1967 | |||||||
| 1968 | 0 | my $image = Image::Magick->new (magick=>$type); | |||||
| 1969 | # $image->Set(debug=>'All'); | ||||||
| 1970 | 0 | my $ret = $image->BlobToImage ($data); | |||||
| 1971 | ### ret: "$ret" | ||||||
| 1972 | ### ret: $ret+0 | ||||||
| 1973 | 0 | 0 | if ($ret == 1) { | ||||
| 1974 | 0 | return $image; | |||||
| 1975 | } | ||||||
| 1976 | |||||||
| 1977 | # try again without the $type forced, in case bad Content-Type from http | ||||||
| 1978 | 0 | $image = Image::Magick->new; | |||||
| 1979 | # $image->Set(debug=>'All'); | ||||||
| 1980 | 0 | $ret = $image->BlobToImage ($data); | |||||
| 1981 | ### ret: "$ret" | ||||||
| 1982 | ### ret: $ret+0 | ||||||
| 1983 | 0 | 0 | if ($ret == 1) { | ||||
| 1984 | 0 | return $image; | |||||
| 1985 | } | ||||||
| 1986 | |||||||
| 1987 | 0 | print __x(" imagemagick doesn't like image data ({length} bytes) from {url}: {error}\n", | |||||
| 1988 | length => length($data), | ||||||
| 1989 | url => $self->{'download_face_uncached'}, | ||||||
| 1990 | error => $ret); | ||||||
| 1991 | 0 | return undef; | |||||
| 1992 | } | ||||||
| 1993 | |||||||
| 1994 | |||||||
| 1995 | #------------------------------------------------------------------------------ | ||||||
| 1996 | # XML::Liberal | ||||||
| 1997 | |||||||
| 1998 | use constant::defer have_xml_liberal => sub { | ||||||
| 1999 | 0 | 0 | my ($self) = @_; | ||||
| 2000 | 0 | 0 | 0 | if (eval { require XML::Liberal; 1 }) { | |||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 2001 | 0 | 0 | return 1; | ||||
| 2002 | } | ||||||
| 2003 | 0 | 0 | $self->verbose (3, __x('XML::Liberal not available: {error}', error => $@)); | ||||
| 2004 | 0 | 0 | return 0; | ||||
| 2005 | 1 | 1 | 7 | }; | |||
| 1 | 2 | ||||||
| 1 | 9 | ||||||
| 2006 | |||||||
| 2007 | # try to correct $xmlstr | ||||||
| 2008 | # if successful return a new xml string, otherwise return undef | ||||||
| 2009 | sub xml_liberal_correction { | ||||||
| 2010 | 0 | 0 | 0 | my ($self, $xmlstr) = @_; | |||
| 2011 | 0 | 0 | $self->have_xml_liberal or return; | ||||
| 2012 | |||||||
| 2013 | ### try XML-Liberal ... | ||||||
| 2014 | 0 | my $liberal = XML::Liberal->new('LibXML'); | |||||
| 2015 | 0 | 0 | if (my $doc = eval { $liberal->parse_string($xmlstr) }) { | ||||
| 0 | |||||||
| 2016 | 0 | return $doc->toString; | |||||
| 2017 | } else { | ||||||
| 2018 | 0 | $self->verbose (2, __x('XML::Liberal parse error: {error}', error => $@)); | |||||
| 2019 | 0 | return undef; | |||||
| 2020 | } | ||||||
| 2021 | } | ||||||
| 2022 | |||||||
| 2023 | |||||||
| 2024 | #------------------------------------------------------------------------------ | ||||||
| 2025 | # error as news message | ||||||
| 2026 | |||||||
| 2027 | sub error_message { | ||||||
| 2028 | 0 | 0 | 0 | my ($self, $subject, $message, $attach_bytes) = @_; | |||
| 2029 | |||||||
| 2030 | 0 | require Encode; | |||||
| 2031 | 0 | my $charset = 'utf-8'; | |||||
| 2032 | 0 | $message = str_ensure_newline ($message); | |||||
| 2033 | 0 | $message = Encode::encode ($charset, $message, Encode::FB_DEFAULT()); | |||||
| 2034 | |||||||
| 2035 | 0 | my $date = rfc822_time_now(); | |||||
| 2036 | 0 | require Digest::MD5; | |||||
| 2037 | 0 | my $msgid = $self->url_to_msgid | |||||
| 2038 | ('http://localhost', | ||||||
| 2039 | Digest::MD5::md5_base64 ($date.$subject.$message)); | ||||||
| 2040 | |||||||
| 2041 | 0 | my $top = $self->mime_build | |||||
| 2042 | ({ | ||||||
| 2043 | 'Path:' => 'localhost', | ||||||
| 2044 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
| 2045 |        From          => __('RSS2Leafnode').'  | 
||||||
| 2046 | Subject => $subject, | ||||||
| 2047 | Date => $date, | ||||||
| 2048 | 'Message-ID' => $msgid, | ||||||
| 2049 | }, | ||||||
| 2050 | Top => 1, | ||||||
| 2051 | Type => 'text/plain', | ||||||
| 2052 | Charset => $charset, | ||||||
| 2053 | Data => $message); | ||||||
| 2054 | |||||||
| 2055 | 0 | 0 | if (defined $attach_bytes) { | ||||
| 2056 | 0 | $top->make_multipart; | |||||
| 2057 | 0 | my $part = $self->mime_build | |||||
| 2058 | ({}, | ||||||
| 2059 | Charset => 'none', | ||||||
| 2060 | Type => 'application/octet-stream', | ||||||
| 2061 | Data => $attach_bytes); | ||||||
| 2062 | 0 | $top->add_part ($part); | |||||
| 2063 | } | ||||||
| 2064 | |||||||
| 2065 | 0 | mime_entity_lines($top); | |||||
| 2066 | 0 | 0 | $self->nntp_post($top) || return; | ||||
| 2067 | 0 | say __x('{group} 1 new article', group => $self->{'nntp_group'}); | |||||
| 2068 | } | ||||||
| 2069 | |||||||
| 2070 | |||||||
| 2071 | #------------------------------------------------------------------------------ | ||||||
| 2072 | # fetch HTML | ||||||
| 2073 | |||||||
| 2074 | sub http_resp_to_from { | ||||||
| 2075 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 2076 | ### http_resp_to_from() | ||||||
| 2077 | 0 | 0 | return $self->http_resp_exiftool_author($resp) | ||||
| 2078 | // 'nobody@'.$self->uri_to_host; | ||||||
| 2079 | } | ||||||
| 2080 | sub http_resp_exiftool_author { | ||||||
| 2081 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 2082 | # PNG Author field, or HTML author | ||||||
| 2083 | 0 | 0 | my $author = resp_exiftool_info($resp)->{'Author'} // return; | ||||
| 2084 | 0 | return $self->email_format_maybe (Encode::decode_utf8($author), '', undef); | |||||
| 2085 | } | ||||||
| 2086 | |||||||
| 2087 | sub http_resp_to_copyright { | ||||||
| 2088 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 2089 | ### http_http_resp_to_copyright() ... | ||||||
| 2090 | |||||||
| 2091 | 0 | my @copyrights = non_empty($resp->header('X-Meta-Copyright')); | |||||
| 2092 | 0 | 0 | unless ($resp->content_type =~ m{^text/}) { | ||||
| 2093 | # PNG Copyright field, perhaps other formats | ||||||
| 2094 | 0 | push @copyrights, non_empty(resp_exiftool_info($resp)->{'Copyright'}); | |||||
| 2095 | } | ||||||
| 2096 | 0 | return \@copyrights; | |||||
| 2097 | } | ||||||
| 2098 | |||||||
| 2099 | # return a "Keywords:" string, or undef if nothing | ||||||
| 2100 | sub http_resp_to_keywords { | ||||||
| 2101 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 2102 | ### http_resp_to_keywords() ... | ||||||
| 2103 | |||||||
| 2104 | 0 | my @keywords = $resp->header('X-Meta-Keywords'); | |||||
| 2105 | |||||||
| 2106 | 0 | 0 | if ($resp->headers->content_is_html) { | ||||
| 2107 | 0 | $resp->decode; | |||||
| 2108 | 0 | require HTML::Parser; | |||||
| 2109 | my $p = HTML::Parser->new | ||||||
| 2110 | (api_version => 3, | ||||||
| 2111 | report_tags => ['meta'], | ||||||
| 2112 | start_h => [ sub { | ||||||
| 2113 | 0 | 0 | my ($tagname, $attr) = @_; | ||||
| 2114 | # facebook thing | ||||||
| 2115 | 0 | 0 | 0 | if ($tagname eq 'meta' | |||
| 0 | |||||||
| 2116 | && lc($attr->{'property'}||'') eq 'og:type') { | ||||||
| 2117 | 0 | push @keywords, $attr->{'content'}; | |||||
| 2118 | } | ||||||
| 2119 | |||||||
| 2120 | 0 | }, "tagname, attr" ]); | |||||
| 2121 | 0 | $p->parse ($resp->decoded_content); | |||||
| 2122 | } | ||||||
| 2123 | ### @keywords | ||||||
| 2124 | |||||||
| 2125 | 0 | return join_non_empty | |||||
| 2126 | 0 | (', ', List::MoreUtils::uniq(map {collapse_whitespace($_)} | |||||
| 2127 | @keywords)); | ||||||
| 2128 | } | ||||||
| 2129 | |||||||
| 2130 | sub fetch_html { | ||||||
| 2131 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
| 2132 | ### fetch_html() ... | ||||||
| 2133 | |||||||
| 2134 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
| 0 | |||||||
| 2135 | 0 | $self->verbose (1, __x('page: {url}', url => $url)); | |||||
| 2136 | |||||||
| 2137 | 0 | my $group_uri = URI->new($group,'news'); | |||||
| 2138 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
| 2139 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
| 2140 | 0 | 0 | $self->nntp_group_check($group) or return; | ||||
| 2141 | |||||||
| 2142 | 0 | require HTTP::Request; | |||||
| 2143 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
| 2144 | 0 | $self->status_etagmod_req ($req); | |||||
| 2145 | 0 | my $resp = $self->ua->request($req); | |||||
| 2146 | 0 | 0 | if ($resp->code == 304) { | ||||
| 2147 | 0 | $self->status_unchanged ($url); | |||||
| 2148 | 0 | return; | |||||
| 2149 | } | ||||||
| 2150 | 0 | 0 | if (! $resp->is_success) { | ||||
| 2151 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
| 2152 | url => $url, | ||||||
| 2153 | status => $resp->status_line); | ||||||
| 2154 | 0 | return; | |||||
| 2155 | } | ||||||
| 2156 | 0 | $self->verbose (2, $resp->headers->as_string); | |||||
| 2157 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
| 2158 | |||||||
| 2159 | # message id is either the etag if present, or an md5 of the content if not | ||||||
| 2160 | my $msgid = $self->url_to_msgid | ||||||
| 2161 | ($url, | ||||||
| 2162 | 0 | 0 | $resp->header('ETag') // do { | ||||
| 2163 | 0 | require Digest::MD5; | |||||
| 2164 | 0 | $resp->decode; | |||||
| 2165 | 0 | my $content = $resp->content; | |||||
| 2166 | 0 | Digest::MD5::md5_base64($content) | |||||
| 2167 | }); | ||||||
| 2168 | 0 | 0 | return 0 if $self->nntp_message_id_exists ($msgid); | ||||
| 2169 | |||||||
| 2170 | 0 | 0 | my $subject = (html_title($resp) | ||||
| 0 | |||||||
| 2171 | // $resp->filename | ||||||
| 2172 | # show original url in subject, not anywhere redirected | ||||||
| 2173 | // __x('RSS2Leafnode {url}', url => $url)); | ||||||
| 2174 | |||||||
| 2175 | 0 | my $from = $self->http_resp_to_from($resp); | |||||
| 2176 | 0 | my $date = $resp->header('Last-Modified'); | |||||
| 2177 | 0 | my $face = $self->http_resp_to_face($resp); | |||||
| 2178 | 0 | my $copyright = $self->http_resp_to_copyright($resp); | |||||
| 2179 | 0 | my $keywords = $self->http_resp_to_keywords($resp); | |||||
| 2180 | |||||||
| 2181 | 0 | my $part = $self->http_resp_extract_main($resp); | |||||
| 2182 | |||||||
| 2183 | 0 | my $top = $self->mime_part_from_response | |||||
| 2184 | ($resp, | ||||||
| 2185 | Top => 1, | ||||||
| 2186 | 'Path:' => scalar($self->uri_to_host), | ||||||
| 2187 | 'Newsgroups:' => $group, | ||||||
| 2188 | From => $from, | ||||||
| 2189 | Subject => $subject, | ||||||
| 2190 | Date => $date, | ||||||
| 2191 | 'Message-ID' => $msgid, | ||||||
| 2192 | Keywords => $keywords, | ||||||
| 2193 | 'Face:' => $face, | ||||||
| 2194 | 'X-Copyright:' => $copyright); | ||||||
| 2195 | 0 | 0 | if ($part) { | ||||
| 2196 | ### attach full part ... | ||||||
| 2197 | 0 | $top->make_multipart; | |||||
| 2198 | 0 | $top->add_part ($part); | |||||
| 2199 | } | ||||||
| 2200 | |||||||
| 2201 | 0 | mime_entity_lines($top); | |||||
| 2202 | 0 | 0 | $self->nntp_post($top) || return; | ||||
| 2203 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
| 2204 | 0 | say __x("{group} 1 new article", group => $group); | |||||
| 2205 | } | ||||||
| 2206 | |||||||
| 2207 | # $resp is a HTTP::Response | ||||||
| 2208 | # If the $self->{'html_extract_main'} option is true and $resp is html then | ||||||
| 2209 | # resplace the $resp content with HTML::ExtractMain extracted part. | ||||||
| 2210 | # | ||||||
| 2211 | sub http_resp_extract_main { | ||||||
| 2212 | 0 | 0 | 0 | my ($self, $resp) = @_; | |||
| 2213 | |||||||
| 2214 | 0 | 0 | $self->{'html_extract_main'} or return; | ||||
| 2215 | 0 | 0 | $resp->headers->content_is_html() or return; | ||||
| 2216 | |||||||
| 2217 | 0 | 0 | my $full_part | ||||
| 2218 | = (defined $self->{'html_extract_main'} | ||||||
| 2219 | && $self->{'html_extract_main'} eq 'attach_full' | ||||||
| 2220 | && $self->mime_part_from_response($resp, | ||||||
| 2221 | Disposition => "attachment")); | ||||||
| 2222 | |||||||
| 2223 | 0 | require HTML::ExtractMain; | |||||
| 2224 | 0 | HTML::ExtractMain->VERSION(0.63); # for output_type=>'html' | |||||
| 2225 | 0 | $resp->decode; # expand any compression | |||||
| 2226 | 0 | my $content = $resp->decoded_content; # as wide-chars | |||||
| 2227 | |||||||
| 2228 | # Output type 'html' differs from the default xhtml by a few entities, in | ||||||
| 2229 | # particular it avoids ' which is an xml-ism not in the html standards. | ||||||
| 2230 | # Various browsers support &apos anyway, but not for example by w3m. | ||||||
| 2231 | 0 | $content = HTML::ExtractMain::extract_main_html($content, | |||||
| 2232 | output_type => 'html'); | ||||||
| 2233 | 0 | 0 | if (! defined $content) { | ||||
| 2234 | 0 | $self->verbose(1, __(" HTML::ExtractMain no main part found, posting whole")); | |||||
| 2235 | 0 | return; | |||||
| 2236 | } | ||||||
| 2237 | ### main extracted: $content | ||||||
| 2238 | 0 | $resp->remove_header('Content-MD5'); # since changed content | |||||
| 2239 | 0 | my $charset = $resp->content_charset; | |||||
| 2240 | 0 | $content = Encode::encode ($charset, $content); | |||||
| 2241 | 0 | $resp->content($content); | |||||
| 2242 | |||||||
| 2243 | 0 | return $full_part; | |||||
| 2244 | } | ||||||
| 2245 | |||||||
| 2246 | #------------------------------------------------------------------------------ | ||||||
| 2247 | # RSS hacks | ||||||
| 2248 | |||||||
| 2249 | # This is a hack for Yahoo Finance feed uniqification. | ||||||
| 2250 | # $item is a feed hashref. If it has 'link' field with a yahoo.com | ||||||
| 2251 | # redirection like | ||||||
| 2252 | # | ||||||
| 2253 | # http://au.rd.yahoo.com/finance/news/rss/financenews/*http://au.biz.yahoo.com/071003/30/1fdvx.html | ||||||
| 2254 | # | ||||||
| 2255 | # then return the last target url part. Otherwise return false. | ||||||
| 2256 | # | ||||||
| 2257 | # This allows the item to be identified by its final target link, so as to | ||||||
| 2258 | # avoid duplication when the item appears in multiple yahoo feeds with a | ||||||
| 2259 | # different leading part. (There's no guid in yahoo feeds, as of Oct 2007.) | ||||||
| 2260 | # | ||||||
| 2261 | sub item_yahoo_permalink { | ||||||
| 2262 | 0 | 0 | 0 | my ($item) = @_; | |||
| 2263 | 0 | 0 | my $url = $item->first_child_text('link') | ||||
| 2264 | // return undef; | ||||||
| 2265 | 0 | 0 | $url =~ m{^http://[^/]*yahoo\.com/.*\*(http://.*yahoo\.com.*)$} | ||||
| 2266 | or return undef; | ||||||
| 2267 | 0 | return $1; | |||||
| 2268 | } | ||||||
| 2269 | |||||||
| 2270 | # This is a special case for Google Groups RSS feeds. | ||||||
| 2271 | # The arguments are link elements [$name,$uri]. If there's a google groups | ||||||
| 2272 | # like "http://groups.google.com/group/cfcdev/msg/445d4ccfdabf086b" then | ||||||
| 2273 | # return a mailing list address like "cfcdev@googlegroups.com". If not in | ||||||
| 2274 | # that form then return undef. | ||||||
| 2275 | # | ||||||
| 2276 | sub googlegroups_link_email { | ||||||
| 2277 | ## no critic (RequireInterpolationOfMetachars) | ||||||
| 2278 | 0 | 0 | 0 | foreach my $l (@_) { | |||
| 2279 | 0 | 0 | 0 | if ($l->{'uri'} | |||
| 2280 | && $l->{'uri'}->canonical =~ m{^http://groups\.google\.com/group/([^/]+)/}) { | ||||||
| 2281 | 0 | return ($1 . '@googlegroups.com'); | |||||
| 2282 | } | ||||||
| 2283 | } | ||||||
| 2284 | 0 | return undef; | |||||
| 2285 | } | ||||||
| 2286 | |||||||
| 2287 | # This is a nasty hack for http://www.aireview.com.au/rss.php | ||||||
| 2288 | # $url is a link url string just fetched, $resp is a HTTP::Response. The | ||||||
| 2289 | # return is a possibly new HTTP::Response object. | ||||||
| 2290 | # | ||||||
| 2291 | # The first fetch of an item link from aireview gives back content like | ||||||
| 2292 | # | ||||||
| 2293 | # | ||||||
| 2294 | # | ||||||
| 2295 | # plus some cookies in the headers. The URL "zz=1" in that line seems very | ||||||
| 2296 | # dodgy, it ends up going to the home page with mozilla. In any case a | ||||||
| 2297 | # fresh fetch of the link url with the cookies provided is enough to get the | ||||||
| 2298 | # actual content. | ||||||
| 2299 | # | ||||||
| 2300 | # The LWP::UserAgent::FramesReady module on cpan has a similar match of a | ||||||
| 2301 | # Refresh, for use with frames. It works by turning the response into a | ||||||
| 2302 | # "302 Moved temporarily" for LWP to follow. urlcheck.pl at | ||||||
| 2303 | # http://www.cpan.org/authors/id/P/PH/PHILMI/urlcheck-1.00.pl likewise | ||||||
| 2304 | # follows. But alas both obey the URL given in the , which is no good | ||||||
| 2305 | # here. | ||||||
| 2306 | # | ||||||
| 2307 | sub aireview_follow { | ||||||
| 2308 | 0 | 0 | 0 | my ($self, $url, $resp) = @_; | |||
| 2309 | |||||||
| 2310 | 0 | 0 | if ($resp->is_success) { | ||||
| 2311 | 0 | $resp->decode; | |||||
| 2312 | 0 | my $content = $resp->content; | |||||
| 2313 | 0 | 0 | if ($content =~ /]*Refresh[^>]*checkForCookies/i) { | ||||
| 2314 | 0 | $self->verbose (1, ' following aireview META Refresh with cookies'); | |||||
| 2315 | 0 | require HTTP::Request; | |||||
| 2316 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
| 2317 | 0 | $resp = $self->ua->request($req); | |||||
| 2318 | } | ||||||
| 2319 | } | ||||||
| 2320 | 0 | return $resp; | |||||
| 2321 | } | ||||||
| 2322 | |||||||
| 2323 | |||||||
| 2324 | #------------------------------------------------------------------------------ | ||||||
| 2325 | # RSS links | ||||||
| 2326 | |||||||
| 2327 | |||||||
| 2328 | # WordPress (http://wordpress.org/) pre 2.5 had a bug | ||||||
| 2329 | # http://core.trac.wordpress.org/ticket/6579 where it gave | ||||||
| 2330 | # type="appication/atom+xml" missing the "l" in "application/". | ||||||
| 2331 | # Don't want to workaround every bad generator, but this one is GPL | ||||||
| 2332 | # free and the past versions still found for instance in the | ||||||
| 2333 | # language log http://languagelog.ldc.upenn.edu/nll/ in Feb 2011 | ||||||
| 2334 | # | ||||||
| 2335 | sub mime_type_is_rss { | ||||||
| 2336 | 0 | 0 | 0 | my ($self, $type) = @_; | |||
| 2337 | 0 | return ($type =~ m{^appl?ication/atom\+xml$}); | |||||
| 2338 | } | ||||||
| 2339 | sub atom_link_is_rss { | ||||||
| 2340 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
| 2341 | 0 | 0 | my $type = $elt->att('atom:type') // $elt->att('type') // return 0; | ||||
| 0 | |||||||
| 2342 | 0 | return $self->mime_type_is_rss($type); | |||||
| 2343 | } | ||||||
| 2344 | |||||||
| 2345 | |||||||
| 2346 | |||||||
| 2347 | # return list of hashrefs, each being | ||||||
| 2348 | # { name => $str, | ||||||
| 2349 | # uri => $uri_object, | ||||||
| 2350 | # download => $boolean, | ||||||
| 2351 | # priority => $number, | ||||||
| 2352 | # } | ||||||
| 2353 | # | ||||||
| 2354 | # Links are listed from highest to lowest priority. The current priority | ||||||
| 2355 | # levels are | ||||||
| 2356 | # 0 plain links | ||||||
| 2357 | # -10 comment RSS | ||||||
| 2358 | # -20 author home page | ||||||
| 2359 | # -100 geo location text-only | ||||||
| 2360 | # -101 statusnet geo location | ||||||
| 2361 |  #     -200   | 
||||||
| 2362 | # | ||||||
| 2363 | sub item_to_links { | ||||||
| 2364 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 2365 | |||||||
| 2366 |    #  | 
||||||
| 2367 | # something has been expanded into the item, or should it be shown? | ||||||
| 2368 | |||||||
| 2369 |    # FIXME:  | 
||||||
| 2370 |    #  | 
||||||
| 2371 |    # Can have a  | 
||||||
| 2372 | # | ||||||
| 2373 |    # ENHANCE-ME:  | 
||||||
| 2374 | # | ||||||
| 2375 |    # FIXME:  | 
||||||
| 2376 | # formats etc. Have seen this from archive.org just duplicating | ||||||
| 2377 |    #  | 
||||||
| 2378 | # | ||||||
| 2379 |    #  | 
||||||
| 2380 | # case. | ||||||
| 2381 | # | ||||||
| 2382 | 0 | my @elts = $item->children (qr/^(link | |||||
| 2383 | |enclosure | ||||||
| 2384 | |content | ||||||
| 2385 | |wiki:diff | ||||||
| 2386 | |wiki:history | ||||||
| 2387 | |comments | ||||||
| 2388 | |wfw:comment | ||||||
| 2389 | |wfw:commentRss | ||||||
| 2390 | |foaf:maker | ||||||
| 2391 | |sioc:has_creator | ||||||
| 2392 | |sioc:has_discussion | ||||||
| 2393 | |sioc:links_to | ||||||
| 2394 | |sioc:reply_of | ||||||
| 2395 | |statusnet:origin | ||||||
| 2396 | |dc:source | ||||||
| 2397 | )$/ix); | ||||||
| 2398 | ### link elts: "@elts" | ||||||
| 2399 | |||||||
| 2400 | 0 | my @links; | |||||
| 2401 | 0 | foreach my $elt (@elts) { | |||||
| 2402 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
| 2403 | 0 | require Text::Wrap; | |||||
| 2404 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long URLs etc | |||||
| 2405 | 0 | $self->verbose (2, "link\n", Text::Trim::trim($elt->sprint)); | |||||
| 2406 | } | ||||||
| 2407 | |||||||
| 2408 | 0 | my $tag = lc($elt->tag); | |||||
| 2409 | ### $tag | ||||||
| 2410 | 0 | 0 | 0 | if ($tag eq 'content' && atom_content_flavour($elt) ne 'link') { | |||
| 2411 | 0 | next; | |||||
| 2412 | } | ||||||
| 2413 | 0 | my $l = { download => 1 }; | |||||
| 2414 | |||||||
| 2415 | 0 | foreach my $name ('hreflang', 'title', 'type') { | |||||
| 2416 | 0 | 0 | $l->{$name} = ($elt->att("atom:$name") // $elt->att($name)); | ||||
| 2417 | } | ||||||
| 2418 | |||||||
| 2419 | 0 | 0 | my $rel = non_empty($elt->att('atom:rel') // $elt->att('rel')); | ||||
| 2420 | 0 | 0 | if (defined $rel) { | ||||
| 2421 | # Atom rel="..." | ||||||
| 2422 | # Maybe: if ($rel eq 'next') ... # not sure about "next" link | ||||||
| 2423 | |||||||
| 2424 | 0 | 0 | 0 | if ($rel eq 'self' # the feed itself (in the channel normally) | |||
| 0 | |||||||
| 0 | |||||||
| 2425 | || $rel eq 'edit' # to edit the item, maybe | ||||||
| 2426 | || $rel eq 'service.edit' # to edit the item | ||||||
| 2427 | || $rel eq 'license' # probably only in the channel part normally | ||||||
| 2428 | ) { | ||||||
| 2429 | 0 | $self->verbose (1, ' ', __x('skip link "{type}"', type => $rel)); | |||||
| 2430 | 0 | next; | |||||
| 2431 | } | ||||||
| 2432 | 0 | 0 | if ($rel eq 'alternate') { | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 2433 | # "alternate" is supposed to be the content as the entry, but in a | ||||||
| 2434 | # web page or something. Not sure that's always quite true, so show | ||||||
| 2435 |          # it as a plain link.  If no  | 
||||||
| 2436 | # supposed to be mandatory. | ||||||
| 2437 | |||||||
| 2438 | } elsif ($rel eq 'enclosure') { | ||||||
| 2439 | 0 | $l->{'name'} = __('Encl'); | |||||
| 2440 | |||||||
| 2441 | } elsif ($rel eq 'ostatus:conversation') { | ||||||
| 2442 | 0 | $l->{'name'} = __('Conversation'); | |||||
| 2443 | 0 | $l->{'download'} = 0; | |||||
| 2444 | |||||||
| 2445 | } elsif ($rel eq 'ostatus:attention') { | ||||||
| 2446 | 0 | $l->{'name'} = __('Attention'); | |||||
| 2447 | 0 | $l->{'download'} = 0; | |||||
| 2448 | |||||||
| 2449 | } elsif ($rel eq 'related') { | ||||||
| 2450 | 0 | $l->{'name'} = __('Related'); | |||||
| 2451 | |||||||
| 2452 | } elsif ($rel eq 'replies') { | ||||||
| 2453 | # Atom | ||||||
| 2454 | 0 | my $count = $self->item_elt_comments_count($item,$elt); | |||||
| 2455 | 0 | 0 | if ($self->atom_link_is_rss($elt)) { | ||||
| 2456 | 0 | 0 | $l->{'name'} = (defined $count | ||||
| 2457 | ? __x('RSS Replies({count})', count => $count) | ||||||
| 2458 | : __('RSS Replies')); | ||||||
| 2459 | 0 | $l->{'priority'} = -10; | |||||
| 2460 | } else { | ||||||
| 2461 | 0 | 0 | $l->{'name'} = (defined $count | ||||
| 2462 | ? __x('Replies({count})', count => $count) | ||||||
| 2463 | : __('Replies')); | ||||||
| 2464 | } | ||||||
| 2465 | 0 | $l->{'download'} = 0; | |||||
| 2466 | |||||||
| 2467 | } elsif ($rel eq 'service.post') { | ||||||
| 2468 | 0 | $l->{'name'} = __('Comments'); | |||||
| 2469 | 0 | $l->{'download'} = 0; | |||||
| 2470 | |||||||
| 2471 | } elsif ($rel eq 'via') { | ||||||
| 2472 | 0 | $l->{'name'} = __('Via'); | |||||
| 2473 | 0 | $l->{'download'} = 0; | |||||
| 2474 | |||||||
| 2475 | } else { | ||||||
| 2476 | 0 | $l->{'name'} = __x('{linkrel}', linkrel => $rel); | |||||
| 2477 | } | ||||||
| 2478 | |||||||
| 2479 | } else { # ! defined $rel | ||||||
| 2480 | # tags without rel="" attribute | ||||||
| 2481 | # | ||||||
| 2482 | 0 | 0 | if ($tag eq 'enclosure') { | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 2483 | 0 | $l->{'name'} = __('Encl'); | |||||
| 2484 | |||||||
| 2485 | } elsif ($tag eq 'dc:source') { | ||||||
| 2486 | 0 | $l->{'name'} = __('Source'); | |||||
| 2487 | 0 | $l->{'download'} = 0; | |||||
| 2488 | |||||||
| 2489 | } elsif ($tag eq 'wiki:diff') { | ||||||
| 2490 | 0 | $l->{'name'} = __('Diff'); | |||||
| 2491 | |||||||
| 2492 | } elsif ($tag eq 'wiki:history') { | ||||||
| 2493 | 0 | $l->{'name'} = __('History'); | |||||
| 2494 | 0 | $l->{'download'} = 0; | |||||
| 2495 | |||||||
| 2496 | } elsif ($tag =~ /foaf:maker|sioc:has_creator/) { | ||||||
| 2497 | 0 | $l->{'name'} = __('Author'); | |||||
| 2498 | 0 | $l->{'download'} = 0; | |||||
| 2499 | 0 | $l->{'priority'} = -20; # low | |||||
| 2500 | |||||||
| 2501 | } elsif ($tag eq 'statusnet:origin') { | ||||||
| 2502 | 0 | $l->{'name'} = __('Geo location'); | |||||
| 2503 | 0 | $l->{'download'} = 0; | |||||
| 2504 | 0 | $l->{'priority'} = -101; # just after Geo location | |||||
| 2505 | |||||||
| 2506 | } elsif ($tag eq 'sioc:has_discussion') { | ||||||
| 2507 | 0 | $l->{'name'} = __('Discussion'); | |||||
| 2508 | 0 | $l->{'download'} = 0; | |||||
| 2509 | |||||||
| 2510 | } elsif ($tag eq 'wfw:commentrss') { | ||||||
| 2511 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
| 2512 | 0 | $l->{'name'} = __x('RSS Comments({count})', count => $count); | |||||
| 2513 | } else { | ||||||
| 2514 | 0 | $l->{'name'} = __('RSS Comments'); | |||||
| 2515 | } | ||||||
| 2516 | 0 | $l->{'download'} = 0; | |||||
| 2517 | 0 | $l->{'priority'} = -10; | |||||
| 2518 | |||||||
| 2519 |        } elsif ($tag =~ /comment/) {  #  | 
||||||
| 2520 | 0 | 0 | if (defined (my $count = $self->item_elt_comments_count($item,$elt))) { | ||||
| 2521 | 0 | $l->{'name'} = __x('Comments({count})', count => $count); | |||||
| 2522 | } else { | ||||||
| 2523 | 0 | $l->{'name'} = __('Comments'); | |||||
| 2524 | } | ||||||
| 2525 | 0 | $l->{'download'} = 0; | |||||
| 2526 | } | ||||||
| 2527 | } | ||||||
| 2528 | |||||||
| 2529 | # Atom | ||||||
| 2530 | # RSS http:.. | ||||||
| 2531 |      # RSS  | 
||||||
| 2532 | 0 | 0 | my $uri = (non_empty ($elt->att('atom:href')) # Atom | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 2533 | // non_empty ($elt->att('href')) # Atom | ||||||
| 2534 |                 // non_empty ($elt->att('atom:src')) # Atom  | 
||||||
| 2535 |                 // non_empty ($elt->att('src'))      # Atom  | 
||||||
| 2536 |                 // non_empty ($elt->att('url'))      # RSS  | 
||||||
| 2537 |                 #  | 
||||||
| 2538 | // non_empty ($elt->att('rdf:resource')) | ||||||
| 2539 | // non_empty ($elt->trimmed_text) # RSS | ||||||
| 2540 | // next); | ||||||
| 2541 | 0 | $uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $uri); | |||||
| 2542 | |||||||
| 2543 | 0 | $l->{'uri'} = $uri; | |||||
| 2544 | 0 | 0 | $l->{'name'} //= __('Link'); | ||||
| 2545 | |||||||
| 2546 | 0 | my @paren; | |||||
| 2547 | # show length if biggish, often provided on enclosures but not plain | ||||||
| 2548 | # links | ||||||
| 2549 | 0 | 0 | 0 | if (defined (my $length = ($elt->att('atom:length') | |||
| 2550 | // $elt->att('length')))) { | ||||||
| 2551 | 0 | push @paren, $self->format_size_in_bytes($length); | |||||
| 2552 | } | ||||||
| 2553 |      #  | 
||||||
| 2554 | # seconds, otherwise MM:SS or HH:MM:SS. | ||||||
| 2555 | 0 | 0 | 0 | if ($tag eq 'enclosure' | |||
| 2556 | && defined (my $duration = non_empty ($item->first_child_text('itunes:duration')))) { | ||||||
| 2557 | 0 | 0 | if ($duration !~ /:/) { | ||||
| 2558 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
| 2559 | duration => $duration); | ||||||
| 2560 | } | ||||||
| 2561 | 0 | push @paren, $duration; | |||||
| 2562 | } | ||||||
| 2563 | 0 | 0 | if (@paren) { | ||||
| 2564 | 0 | $l->{'name'} .= '('.join(', ',@paren). ')'; | |||||
| 2565 | } | ||||||
| 2566 | |||||||
| 2567 | ### push link: $l | ||||||
| 2568 | 0 | push @links, $l; | |||||
| 2569 | } | ||||||
| 2570 | |||||||
| 2571 |    # eg. RSS  | 
||||||
| 2572 |    #     Atom  | 
||||||
| 2573 |    #             | 
||||||
| 2574 |    #             | 
||||||
| 2575 | # | ||||||
| 2576 |    #             | 
||||||
| 2577 | # | ||||||
| 2578 | # | ||||||
| 2579 | # | ||||||
| 2580 | 0 | foreach my $elt ($item->children('source')) { | |||||
| 2581 | 0 | 0 | my $str = non_empty (elt_to_rendered_line($elt->first_child('title'))) | ||||
| 2582 | // non_empty ($elt->trimmed_text); | ||||||
| 2583 | 0 | 0 | if (defined $str) { | ||||
| 2584 | ### source: $str | ||||||
| 2585 | 0 | push @links, { name => __('Source') . ": $str", | |||||
| 2586 | download => 0, | ||||||
| 2587 | priority => -200, | ||||||
| 2588 | }; | ||||||
| 2589 | } | ||||||
| 2590 | 0 | foreach my $subelt | |||||
| 0 | |||||||
| 2591 | ($elt, | ||||||
| 2592 | grep {$self->atom_link_is_rss($_)} $elt->children('link') | ||||||
| 2593 | ) { | ||||||
| 2594 | 0 | 0 | 0 | if (defined $subelt | |||
| 0 | |||||||
| 0 | |||||||
| 2595 | && defined (my $url = non_empty ($subelt->att('url')) | ||||||
| 2596 | // non_empty ($subelt->att('href')) | ||||||
| 2597 | // non_empty ($subelt->att('atom:href')))) { | ||||||
| 2598 | 0 | push @links, { name => __('Source RSS'), | |||||
| 2599 | uri => App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri($subelt,$url), | ||||||
| 2600 | download => 0, | ||||||
| 2601 | priority => -200, | ||||||
| 2602 | }; | ||||||
| 2603 | } | ||||||
| 2604 | } | ||||||
| 2605 | } | ||||||
| 2606 | |||||||
| 2607 | # Merge together duplicate urls, so as not to download two copies as | ||||||
| 2608 | # attachments, and so as to make it clear when there's only one | ||||||
| 2609 | # destination for two things. | ||||||
| 2610 | # | ||||||
| 2611 |    # Have seen same url under  and  | 
||||||
| 2612 | # http://sourceforge.net/export/rss2_keepsake.php?group_id=203650 | ||||||
| 2613 |    # or same url under  and  | 
||||||
| 2614 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
| 2615 | { | ||||||
| 2616 | 0 | my %seen; | |||||
| 0 | |||||||
| 2617 | 0 | @links = grep { | |||||
| 2618 | 0 | my $l = $_; | |||||
| 2619 | 0 | my $want = 1; | |||||
| 2620 | 0 | 0 | if (my $uri = $l->{'uri'}) { | ||||
| 2621 | 0 | my $canonical = $uri->canonical; | |||||
| 2622 | 0 | $canonical->fragment(undef); # ignore #foo anchor for uniqueness | |||||
| 2623 | 0 | 0 | if (my $prev_l = $seen{$canonical}) { | ||||
| 2624 | 0 | $want = 0; | |||||
| 2625 | 0 | 0 | $prev_l->{'download'} ||= $l->{'download'}; | ||||
| 2626 | 0 | 0 | $l->{'priority'} = max ($l->{'priority'}||0, | ||||
| 0 | |||||||
| 2627 | $prev_l->{'priority'}||0); | ||||||
| 2628 | |||||||
| 2629 | # prefer no anchor if have both with and without | ||||||
| 2630 | 0 | 0 | if (is_empty($l->{'uri'}->fragment)) { | ||||
| 2631 | 0 | $prev_l->{'uri'} = $l->{'uri'}; | |||||
| 2632 | } | ||||||
| 2633 | |||||||
| 2634 | 0 | 0 | if ($prev_l->{'name'} eq __('Link')) { | ||||
| 0 | |||||||
| 0 | |||||||
| 2635 | # name "Link" doesn't say much, prefer the other over "Link" | ||||||
| 2636 | 0 | $prev_l->{'name'} = $l->{'name'}; | |||||
| 2637 | } elsif ($l->{'name'} eq __('Link')) { | ||||||
| 2638 | # don't append "Link" to the previous | ||||||
| 2639 | } elsif ($l->{'name'} eq $prev_l->{'name'}) { | ||||||
| 2640 | # don't double the same name | ||||||
| 2641 | } else { | ||||||
| 2642 | 0 | $prev_l->{'name'} .= ", $l->{'name'}"; | |||||
| 2643 | } | ||||||
| 2644 | } | ||||||
| 2645 | 0 | $seen{$canonical} = $l; | |||||
| 2646 | } | ||||||
| 2647 | $want | ||||||
| 2648 | 0 | } @links; | |||||
| 2649 | } | ||||||
| 2650 | 0 | foreach my $l (@links) { | |||||
| 2651 | 0 | 0 | if ($l->{'uri'}) { | ||||
| 2652 | 0 | $l->{'name'} .= ':'; | |||||
| 2653 | } | ||||||
| 2654 | } | ||||||
| 2655 | |||||||
| 2656 | 0 | 0 | if (defined (my $str = $self->item_to_lat_long_alt_str ($item))) { | ||||
| 2657 | 0 | push @links, { name => $str, | |||||
| 2658 | download => 0, | ||||||
| 2659 | priority => -100, # lat/long low priority | ||||||
| 2660 | }; | ||||||
| 2661 | } | ||||||
| 2662 | |||||||
| 2663 | # re:rank as for example from stackexchange.com | ||||||
| 2664 | # What does label="" usually show? Are parens like this good? | ||||||
| 2665 | 0 | foreach my $elt ($item->children('re:rank')) { | |||||
| 2666 | 0 | my $label = $elt->att('label'); | |||||
| 2667 | 0 | my $value = elt_to_rendered_line($elt); | |||||
| 2668 | 0 | 0 | push @links, { name => (defined $label | ||||
| 2669 | ? __x('Rank: {value} ({label})', value => $value, label => $label) | ||||||
| 2670 | : __x('Rank: {value}', value => $value)), | ||||||
| 2671 | download => 0, | ||||||
| 2672 | priority => -200, # low priority | ||||||
| 2673 | }; | ||||||
| 2674 | } | ||||||
| 2675 | |||||||
| 2676 |    # eg.  | 
||||||
| 2677 | # is there any value in the role="" part? | ||||||
| 2678 | 0 | foreach my $elt ($item->children('media:credit')) { | |||||
| 2679 | 0 | push @links, { name => __x('Credit: {who}', | |||||
| 2680 | who => scalar(elt_to_rendered_line($elt))), | ||||||
| 2681 | download => 0, | ||||||
| 2682 | priority => -200, # very low priority | ||||||
| 2683 | }; | ||||||
| 2684 | } | ||||||
| 2685 | |||||||
| 2686 |    #  | 
||||||
| 2687 | # | ||||||
| 2688 |    # Allow for empty  | 
||||||
| 2689 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
| 2690 | # | ||||||
| 2691 | 0 | foreach my $elt ($item->children('itunes:explicit')) { | |||||
| 2692 | 0 | 0 | my $line = elt_to_rendered_line($elt) | ||||
| 2693 |        // next; # skip empty  | 
||||||
| 2694 | 0 | push @links, { name => __x('Explicit: {value}', value => $line), | |||||
| 2695 | download => 0, | ||||||
| 2696 | priority => -200, # very low priority | ||||||
| 2697 | }; | ||||||
| 2698 | } | ||||||
| 2699 | |||||||
| 2700 |    #  | 
||||||
| 2701 | # a fun kind of commentary thing | ||||||
| 2702 | 0 | foreach my $elt ($item->children('slash:department')) { | |||||
| 2703 | 0 | push @links, { name => __x('Department: {department}', | |||||
| 2704 | department => scalar(elt_to_rendered_line($elt))), | ||||||
| 2705 | download => 0, | ||||||
| 2706 | priority => -200, # very low priority | ||||||
| 2707 | }; | ||||||
| 2708 | } | ||||||
| 2709 | |||||||
| 2710 | 0 | return @links; | |||||
| 2711 | } | ||||||
| 2712 | @known{qw( | ||||||
| 2713 | /channel/item/pheedo:origLink | ||||||
| 2714 | /channel/item/feedburner:origLink | ||||||
| 2715 | |||||||
| 2716 | /channel/item/link | ||||||
| 2717 | /channel/item/enclosure | ||||||
| 2718 | /channel/item/dc:source | ||||||
| 2719 | /channel/item/comments | ||||||
| 2720 | /channel/item/wfw:comment | ||||||
| 2721 | /channel/item/wfw:commentRss | ||||||
| 2722 | /channel/item/slash:comments | ||||||
| 2723 | /channel/item/slash:hit_parade | ||||||
| 2724 | /channel/item/slash:department | ||||||
| 2725 | /channel/item/thr:total | ||||||
| 2726 | /channel/item/content --atom | ||||||
| 2727 | /channel/item/wiki:diff | ||||||
| 2728 | /channel/item/itunes:duration | ||||||
| 2729 | /channel/item/re:rank | ||||||
| 2730 | |||||||
| 2731 | /channel/wiki:interwiki | ||||||
| 2732 | /channel/wiki:interwiki/rdf:Description | ||||||
| 2733 | /channel/wiki:interwiki/rdf:Description/rdf:value | ||||||
| 2734 | /channel/item/wiki:version | ||||||
| 2735 | /channel/item/wiki:status | ||||||
| 2736 | /channel/item/wiki:history | ||||||
| 2737 | /channel/item/foaf:maker | ||||||
| 2738 | /channel/item/sioc:has_creator | ||||||
| 2739 | /channel/item/sioc:has_discussion | ||||||
| 2740 | /channel/item/sioc:links_to | ||||||
| 2741 | /channel/item/sioc:reply_of | ||||||
| 2742 | /channel/item/media:credit | ||||||
| 2743 | /channel/item/itunes:explicit | ||||||
| 2744 | /channel/item/itunes:block | ||||||
| 2745 | |||||||
| 2746 | --believed-to-be-duplicate-of-description | ||||||
| 2747 | /channel/item/media:content | ||||||
| 2748 | /channel/item/media:text | ||||||
| 2749 | )} = (); | ||||||
| 2750 | |||||||
| 2751 | # sub any_link_replies_nonfeed { | ||||||
| 2752 | # foreach my $elt (@_) { | ||||||
| 2753 | # # ### any_link_replies_nonfeed(): ref $elt, "$elt", $elt->tag | ||||||
| 2754 | # # my $rel = ($elt->att('atom:rel') // $elt->att('rel') // ''); | ||||||
| 2755 | # # my $type = ($elt->att('atom:type') // $elt->att('type') // ''); | ||||||
| 2756 | # # ### $rel | ||||||
| 2757 | # # ### $type | ||||||
| 2758 | # if ($elt->tag eq 'link' | ||||||
| 2759 | # && (($elt->att('atom:rel') // $elt->att('rel') // '') | ||||||
| 2760 | # eq 'replies') | ||||||
| 2761 | # && (($elt->atom_link_is_rss($elt))) { | ||||||
| 2762 | # return 1; | ||||||
| 2763 | # } | ||||||
| 2764 | # } | ||||||
| 2765 | # return 0; | ||||||
| 2766 | # } | ||||||
| 2767 | |||||||
| 2768 | # Return a string which is the latitude, longitude and possibly altitude | ||||||
| 2769 | # from the item. If no location in the item then return undef. | ||||||
| 2770 | # | ||||||
| 2771 | sub item_to_lat_long_alt_str { | ||||||
| 2772 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 2773 | 0 | 0 | my ($lat, $long, $alt) = $self->item_to_lat_long_alt_values ($item) | ||||
| 2774 | or return; | ||||||
| 2775 | ### $lat | ||||||
| 2776 | ### $long | ||||||
| 2777 | ### $alt | ||||||
| 2778 | |||||||
| 2779 | 0 | 0 | if (Scalar::Util::looks_like_number($lat)) { | ||||
| 2780 | 0 | 0 | $lat = ($lat >= 0 | ||||
| 2781 | # TRANSLATORS: the latin1/unicode degree symbol can be used here | ||||||
| 2782 | # instead of " deg", if it will be recognised in translation, | ||||||
| 2783 | # etc. | ||||||
| 2784 | ? __x('{latitude} deg N', latitude => $lat) | ||||||
| 2785 | : __x('{latitude} deg S', latitude => -$lat)); | ||||||
| 2786 | } | ||||||
| 2787 | 0 | 0 | if (Scalar::Util::looks_like_number($long)) { | ||||
| 2788 | 0 | 0 | $long = ($long >= 0 | ||||
| 2789 | ? __x('{longitude} deg E', longitude => $long) | ||||||
| 2790 | : __x('{longitude} deg W', longitude => -$long)); | ||||||
| 2791 | } | ||||||
| 2792 | |||||||
| 2793 | 0 | 0 | if (is_non_empty ($alt)) { | ||||
| 2794 | 0 | return __x('Geo location: {latitude}, {longitude}, alt {altitude}m', | |||||
| 2795 | latitude => $lat, | ||||||
| 2796 | longitude => $long, | ||||||
| 2797 | altitude => $alt); | ||||||
| 2798 | } else { | ||||||
| 2799 | 0 | return __x('Geo location: {latitude}, {longitude}', | |||||
| 2800 | latitude => $lat, | ||||||
| 2801 | longitude => $long); | ||||||
| 2802 | } | ||||||
| 2803 | } | ||||||
| 2804 | |||||||
| 2805 | # Return a list of values which are the latitude, longitude and possibly | ||||||
| 2806 | # altitude extracted from $item. | ||||||
| 2807 | # | ||||||
| 2808 | # ($latitude, $longitude, $altitude) | ||||||
| 2809 | # ($latitude, $longitude) | ||||||
| 2810 | # () | ||||||
| 2811 | # | ||||||
| 2812 | # If no location then return an empty list. Some of the values returned | ||||||
| 2813 |  # might be empty strings if say there's a  | 
||||||
| 2814 | # | ||||||
| 2815 | # Latitude is degrees North, or negative for South. Longitude is degrees | ||||||
| 2816 | # East, or negative for West. Both possibly with decimal places. | ||||||
| 2817 | # | ||||||
| 2818 | sub item_to_lat_long_alt_values { | ||||||
| 2819 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 2820 | |||||||
| 2821 |    # per-item  | 
||||||
| 2822 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
| 2823 |    #  | 
||||||
| 2824 |    #    | 
||||||
| 2825 |    #    | 
||||||
| 2826 | # | ||||||
| 2827 | # or under geo:Point, maybe, eg. http://www.gdacs.org/xml/RSSTC.xml | ||||||
| 2828 |    #  | 
||||||
| 2829 |    #    | 
||||||
| 2830 |    #      | 
||||||
| 2831 |    #      | 
||||||
| 2832 | # | ||||||
| 2833 | 0 | foreach my $elt ($item, $item->children(qr/^geo:point$/i)) { | |||||
| 2834 | 0 | my $lat = $elt->first_child_trimmed_text('geo:lat'); | |||||
| 2835 | 0 | 0 | if (is_non_empty ($lat)) { | ||||
| 2836 | 0 | return ($lat, | |||||
| 2837 | $elt->first_child_trimmed_text('geo:long'), | ||||||
| 2838 | non_empty ($elt->first_child_trimmed_text('geo:alt'))); | ||||||
| 2839 | } | ||||||
| 2840 | } | ||||||
| 2841 | |||||||
| 2842 |    #  | 
||||||
| 2843 |    #    | 
||||||
| 2844 | # space separator per http://www.georss.org/Encodings | ||||||
| 2845 | { | ||||||
| 2846 | 0 | my $str = $item->first_child_trimmed_text ('georss:point'); | |||||
| 0 | |||||||
| 2847 | 0 | 0 | if (is_non_empty ($str)) { | ||||
| 2848 | 0 | return split(/\s+/, $str, 2); # no altitude | |||||
| 2849 | } | ||||||
| 2850 | } | ||||||
| 2851 | |||||||
| 2852 |    #  | 
||||||
| 2853 |    #    | ||||||
| 2854 | # rdf:resource="http://sws.geonames.org/2638077/"> | ||||||
| 2855 | # | ||||||
| 2856 | 0 | 0 | if (my $elt = $item->first_child ('statusnet:origin')) { | ||||
| 2857 | 0 | 0 | if (defined (my $lat = $elt->att('geo:lat'))) { | ||||
| 2858 | 0 | my $long = $elt->att('geo:long'); | |||||
| 2859 | 0 | return ($lat, $long); | |||||
| 2860 | } | ||||||
| 2861 | } | ||||||
| 2862 | |||||||
| 2863 | 0 | return; # not found | |||||
| 2864 | } | ||||||
| 2865 | @known{qw(/channel/item/geo:lat | ||||||
| 2866 | /channel/item/geo:long | ||||||
| 2867 | /channel/item/geo:alt | ||||||
| 2868 | /channel/item/geo:Point | ||||||
| 2869 | /channel/item/geo:Point/geo:lat | ||||||
| 2870 | /channel/item/geo:Point/geo:long | ||||||
| 2871 | /channel/item/georss:point | ||||||
| 2872 | /channel/item/statusnet:origin | ||||||
| 2873 | )} = (); | ||||||
| 2874 | |||||||
| 2875 | |||||||
| 2876 | sub links_to_html { | ||||||
| 2877 | 0 | 0 | 0 | 0 | @_ or return ''; | ||
| 2878 | |||||||
| 2879 |    #  | 
||||||
| 2880 |    # up by a line-wrap, which can make it hard to cut and paste.  can  | 
||||||
| 2881 | # prevent a line wrap, but it ends up treated as starting a paragraph, | ||||||
| 2882 | # separate from the 'name' part. | ||||||
| 2883 | # | ||||||
| 2884 | 0 | my $str = ''; | |||||
| 2885 | 0 |    my $sep = "\n\n \n";  | 
|||||
| 2886 | 0 | foreach my $l (@_) { | |||||
| 2887 | 0 |      $str .= "$sep | 
|||||
| 2888 | 0 |      $sep = " \n";  | 
|||||
| 2889 | |||||||
| 2890 | 0 | 0 | if (defined (my $uri = $l->{'uri'})) { | ||||
| 2891 | 0 | $str .= " | |||||
| 2892 | 0 | 0 | if (defined (my $hreflang = $l->{'hreflang'})) { | ||||
| 2893 | 0 | $str .= " hreflang=\"$Entitize{$hreflang}\""; | |||||
| 2894 | } | ||||||
| 2895 | 0 | 0 | if (defined (my $type = $l->{'type'})) { | ||||
| 2896 | 0 | $str .= " type=\"$Entitize{$type}\""; | |||||
| 2897 | } | ||||||
| 2898 | 0 | $uri = $Entitize{$uri}; | |||||
| 2899 | 0 | $str .= " href=\"$uri\">$uri"; | |||||
| 2900 | } | ||||||
| 2901 | 0 | $str .= "\n"; | |||||
| 2902 | } | ||||||
| 2903 | 0 | return "$str\n"; | |||||
| 2904 | } | ||||||
| 2905 | |||||||
| 2906 | sub links_to_text { | ||||||
| 2907 | 0 | 0 | 0 | return join ('', map { join_non_empty (' ', | |||
| 0 | |||||||
| 2908 | $_->{'name'}, | ||||||
| 2909 | $_->{'uri'}) . "\n" } @_); | ||||||
| 2910 | } | ||||||
| 2911 | |||||||
| 2912 | |||||||
| 2913 | #------------------------------------------------------------------------------ | ||||||
| 2914 | # "From:" and email addresses | ||||||
| 2915 | |||||||
| 2916 | 1 | 1 | 11072 | use constant DUMMY_EMAIL_ADDRESS => 'nobody@rss2leafnode.dummy'; | |||
| 1 | 10 | ||||||
| 1 | 8161 | ||||||
| 2917 | |||||||
| 2918 | { | ||||||
| 2919 | my %tag_to_link_name | ||||||
| 2920 | = (author => __('Author:'), | ||||||
| 2921 | creator => __('Creator:'), | ||||||
| 2922 | contributor => __('Contributor:'), | ||||||
| 2923 | managingEditor => __('Managing Editor:'), | ||||||
| 2924 | webMaster => __('Webmaster:'), | ||||||
| 2925 | publisher => __('Publisher:'), | ||||||
| 2926 | owner => __('Owner:'), | ||||||
| 2927 | username => __('User:'), | ||||||
| 2928 | ); | ||||||
| 2929 | |||||||
| 2930 | # Return ($from, $linkhash,$linkhash,...). | ||||||
| 2931 | # $from is a string like "foo@example.com". | ||||||
| 2932 | # Multiple authors are for example "foo@example.com, quux@example.com" as | ||||||
| 2933 | # per RFC5322 email, though currently no Sender: is picked out from among | ||||||
| 2934 | # them. | ||||||
| 2935 | # | ||||||
| 2936 | sub item_to_from { | ||||||
| 2937 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 2938 | ### item_to_from() ... | ||||||
| 2939 | 0 | my $channel = item_to_channel($item); | |||||
| 2940 | |||||||
| 2941 |      #  | 
||||||
| 2942 |      # looser.  The RSS recommendation is  | 
||||||
| 2943 |      # and  | 
||||||
| 2944 | # | ||||||
| 2945 |      #  | 
||||||
| 2946 | # attribute. | ||||||
| 2947 | # | ||||||
| 2948 |      #  | 
||||||
| 2949 | # | ||||||
| 2950 |      #  | 
||||||
| 2951 | # to show just the primary author or authors. | ||||||
| 2952 | # | ||||||
| 2953 | # The first | ||||||
| 2954 | # | ||||||
| 2955 | 0 | my @from; | |||||
| 2956 | my @links; | ||||||
| 2957 | 0 | foreach my $try ([$item, 'author'], | |||||
| 2958 | [$item, 'jf:author'], | ||||||
| 2959 | [$item, 'slate:author'], | ||||||
| 2960 | [$item, 'dc:creator'], | ||||||
| 2961 | [$item, 'dc:contributor'], | ||||||
| 2962 | [$item, 'wiki:username'], | ||||||
| 2963 | [$item, 'itunes:author'], | ||||||
| 2964 | |||||||
| 2965 | [$channel, 'author'], | ||||||
| 2966 | [$channel, 'dc:creator'], | ||||||
| 2967 | [$channel, 'itunes:author'], | ||||||
| 2968 | [$channel, 'managingEditor'], | ||||||
| 2969 | [$channel, 'webMaster'], | ||||||
| 2970 | |||||||
| 2971 | [$item , 'dc:publisher'], | ||||||
| 2972 | [$channel, 'dc:publisher'], | ||||||
| 2973 | [$channel, 'itunes:owner'], | ||||||
| 2974 | ) { | ||||||
| 2975 | 0 | my ($where, $tag) = @$try; | |||||
| 2976 | ### $tag | ||||||
| 2977 | |||||||
| 2978 | 0 | 0 | if (my @elts = $item->children($tag)) { | ||||
| 2979 | 0 | foreach my $elt (@elts) { | |||||
| 2980 | ### elt for From: $elt->sprint | ||||||
| 2981 | 0 | push @from, $self->elt_to_email($elt); | |||||
| 2982 | |||||||
| 2983 | # author's home page etc as a link | ||||||
| 2984 | 0 | 0 | 0 | if (my $uri = | |||
| 0 | |||||||
| 2985 | (# Atom | ||||||
| 2986 |                 #  | 
||||||
| 2987 |                 #    | 
||||||
| 2988 |                 #    | 
||||||
| 2989 | # | ||||||
| 2990 | # | ||||||
| 2991 | non_empty ($elt->first_child_text('uri')) | ||||||
| 2992 | |||||||
| 2993 | # slate.com | ||||||
| 2994 |                 #  | 
||||||
| 2995 | // non_empty ($elt->att('url')) | ||||||
| 2996 | |||||||
| 2997 | # ModWiki dc:contributor example | ||||||
| 2998 |                 #      | 
||||||
| 2999 |                 #        | 
||||||
| 3000 | # | ||||||
| 3001 | # The text shows rss:link= and the example just link=. | ||||||
| 3002 | # | ||||||
| 3003 | // non_empty (do { | ||||||
| 3004 | 0 | 0 | 0 | my $child; ($child = $elt->first_child('rdf:Description')) | |||
| 0 | |||||||
| 3005 | && ($child->att('link') // $child->att('rss:link')) | ||||||
| 3006 | }))) { | ||||||
| 3007 | 0 | my $tag = $elt->tag; | |||||
| 3008 | 0 | $tag =~ s/.*?://; | |||||
| 3009 | 0 | 0 | push @links, { uri => URI->new($uri), | ||||
| 3010 | name => ($tag_to_link_name{$tag} // "\u$tag:"), | ||||||
| 3011 | download => 0, | ||||||
| 3012 | priority => -20 }; | ||||||
| 3013 | } | ||||||
| 3014 | } | ||||||
| 3015 | } | ||||||
| 3016 | 0 | 0 | last if @from; | ||||
| 3017 | } | ||||||
| 3018 | 0 | 0 | if (! @from) { | ||||
| 3019 |        # Atom  | 
||||||
| 3020 | # Hope the channel title is different from the item title. | ||||||
| 3021 | 0 | @from = ($self->email_format (elt_to_rendered_line | |||||
| 3022 | ($channel->first_child('title')))); | ||||||
| 3023 | } | ||||||
| 3024 | 0 | 0 | if (! @from) { | ||||
| 3025 | 0 | @from = ('nobody@'.$self->uri_to_host); | |||||
| 3026 | } | ||||||
| 3027 | |||||||
| 3028 | ### @from | ||||||
| 3029 | 0 | return (join(', ',@from), | |||||
| 3030 | @links); | ||||||
| 3031 | } | ||||||
| 3032 | @known{qw(/channel/author | ||||||
| 3033 | /channel/author/name --atom | ||||||
| 3034 | /channel/author/uri --atom | ||||||
| 3035 | /channel/author/url --atom-typo-maybe | ||||||
| 3036 | /channel/author/email --atom | ||||||
| 3037 | /channel/managingEditor | ||||||
| 3038 | /channel/webMaster | ||||||
| 3039 | /channel/dc:publisher | ||||||
| 3040 | /channel/dc:creator | ||||||
| 3041 | /channel/itunes:author | ||||||
| 3042 | |||||||
| 3043 | /channel/item/author | ||||||
| 3044 | /channel/item/author/name --atom | ||||||
| 3045 | /channel/item/author/uri --atom | ||||||
| 3046 | /channel/item/author/url --atom-typo-maybe | ||||||
| 3047 | /channel/item/author/email --atom | ||||||
| 3048 | /channel/item/author/gd:extendedProperty --good-dinner | ||||||
| 3049 | /channel/item/dc:creator | ||||||
| 3050 | /channel/item/dc:publisher | ||||||
| 3051 | /channel/item/wiki:username | ||||||
| 3052 | /channel/item/itunes:author | ||||||
| 3053 | /channel/item/dc:contributor | ||||||
| 3054 | /channel/item/dc:contributor/rdf:Description | ||||||
| 3055 | /channel/item/dc:contributor/rdf:Description/rdf:value | ||||||
| 3056 | /channel/item/jf:author | ||||||
| 3057 | /channel/item/slate:author | ||||||
| 3058 | |||||||
| 3059 | /channel/item/contributor --atom | ||||||
| 3060 | /channel/item/contributor/name | ||||||
| 3061 | /channel/item/contributor/uri | ||||||
| 3062 | /channel/item/contributor/url --atom-typo-maybe | ||||||
| 3063 | /channel/item/contributor/email | ||||||
| 3064 | |||||||
| 3065 | /channel/item/activity:actor | ||||||
| 3066 | /channel/item/activity:verb --usually-post-or-something | ||||||
| 3067 | /channel/item/activity:object-type --is-this-anything | ||||||
| 3068 | )} = (); | ||||||
| 3069 | } | ||||||
| 3070 | |||||||
| 3071 | # $elt is an XML::Twig::Elt | ||||||
| 3072 | # Return an email address, either just the text part of $elt or Atom | ||||||
| 3073 |  # sub-elements  | 
||||||
| 3074 | # If $elt is empty then return an empty list. | ||||||
| 3075 | # | ||||||
| 3076 | sub elt_to_email { | ||||||
| 3077 | 0 | 0 | 0 | my ($self, $elt) = @_; | |||
| 3078 | 0 | 0 | return unless defined $elt; | ||||
| 3079 | |||||||
| 3080 |    #  | 
||||||
| 3081 |    #  | 
||||||
| 3082 | 0 | my $email = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?email$/)); | |||||
| 3083 | |||||||
| 3084 |    #  | 
||||||
| 3085 |    #  | 
||||||
| 3086 | 0 | 0 | my $display = elt_to_rendered_line ($elt->first_child(qr/^(itunes:)?name$/)) | ||||
| 3087 | // ''; | ||||||
| 3088 | |||||||
| 3089 | my $maybe = join | ||||||
| 3090 | (' ', | ||||||
| 3091 | non_empty ($elt->text_only), | ||||||
| 3092 | 0 | non_empty (do { | |||||
| 3093 |         #  | 
||||||
| 3094 | 0 | 0 | my $rdfdesc; ($rdfdesc = $elt->first_child('rdf:Description')) | ||||
| 0 | |||||||
| 3095 | && $rdfdesc->first_child_text('rdf:value') | ||||||
| 3096 | })); | ||||||
| 3097 | |||||||
| 3098 | 0 | return $self->email_format_maybe ($maybe, $display, $email); | |||||
| 3099 | } | ||||||
| 3100 | |||||||
| 3101 | # $mailbox_re is a mailbox with domain, like "foo@example.com" | ||||||
| 3102 | # Allows no dots like "foo@localhost". | ||||||
| 3103 | # Allows dashes like "www-something@example.com". | ||||||
| 3104 | # | ||||||
| 3105 | # $mailbox_with_comment_re allows an optional paren comment part like | ||||||
| 3106 | # "foo@example.com (Foo)" | ||||||
| 3107 | # | ||||||
| 3108 | # cf Email::Address $addr_spec, but its version 1.890 loosened to allow a | ||||||
| 3109 | # domain-less bare "foo", which is no good | ||||||
| 3110 | # | ||||||
| 3111 | my $words_with_dots_re = qr/[[:word:]-]+(\.[[:word:]-]+)*/; | ||||||
| 3112 | my $mailbox_re = qr/$words_with_dots_re\@$words_with_dots_re/o; | ||||||
| 3113 | my $mailbox_with_comment_re = qr/$mailbox_re(\s*\([^\)]*\))?/os; | ||||||
| 3114 | |||||||
| 3115 | # $maybe is some free-form author part possibly including a foo@example.com | ||||||
| 3116 | # $display is a display part for the email like "Foo", possibly empty "" | ||||||
| 3117 | # $email is a mailbox "foo@example.com", or undef | ||||||
| 3118 |  # return an rfc822 "Foo  | 
||||||
| 3119 | # | ||||||
| 3120 | sub email_format_maybe { | ||||||
| 3121 | 0 | 0 | 0 | my ($self, $maybe, $display, $email) = @_; | |||
| 3122 | ### email_format_maybe() start | ||||||
| 3123 | ### $maybe | ||||||
| 3124 | ### $display | ||||||
| 3125 | |||||||
| 3126 | |||||||
| 3127 |    # look also at $display the same in case Atom no  | 
||||||
| 3128 | # which is a mailbox and can be corrected, | ||||||
| 3129 | # eg. http://www.weather.gov/alerts-beta/hi.php?x=0 | ||||||
| 3130 | # | ||||||
| 3131 | # Or $maybe full email like | ||||||
| 3132 | 0 | 0 | if (is_empty($email)) { | ||||
| 3133 | 0 | foreach ($maybe, $display) { | |||||
| 3134 | |||||||
| 3135 | 0 | 0 | 0 | if (/^\s*(mailto:)?($mailbox_with_comment_re)\s*$/o) { | |||
| 0 | |||||||
| 3136 | ### maybe or display is a mailbox | ||||||
| 3137 | # "foo@example.com" | ||||||
| 3138 | # "mailto:foo@example.com" | ||||||
| 3139 | # "foo@example.com (Foo)" | ||||||
| 3140 | 0 | $email = $2; | |||||
| 3141 | 0 | undef $_; | |||||
| 3142 | 0 | last; | |||||
| 3143 | |||||||
| 3144 | } elsif (/(.*)\((mailto:)?($mailbox_re)\)\s*$/o | ||||||
| 3145 | || /(.*)<(mailto:)?($mailbox_re)>\s*$/o) { | ||||||
| 3146 | ### maybe or display part is display plus mailbox | ||||||
| 3147 | # "Foo (mailto:foo@example.com)" | ||||||
| 3148 | # "Foo (foo@example.com)" | ||||||
| 3149 |          #     "Foo  | 
||||||
| 3150 | # | ||||||
| 3151 | 0 | $_ = $1; | |||||
| 3152 | 0 | $email = $3; | |||||
| 3153 | 0 | last; | |||||
| 3154 | } | ||||||
| 3155 | } | ||||||
| 3156 | } | ||||||
| 3157 | |||||||
| 3158 | 0 | 0 | $display .= ' '.($maybe//''); | ||||
| 3159 | 0 | my $ret; | |||||
| 3160 | 0 | 0 | 0 | if (is_empty($email) && $display =~ /^$mailbox_re$/o) { | |||
| 3161 | # display or maybe is a "foo@example.com" or "foo@example.com (Foo)", | ||||||
| 3162 | # return it as-is, in particular leave it in "(Foo)" style comment | ||||||
| 3163 | 0 | $ret = $display; | |||||
| 3164 | } else { | ||||||
| 3165 | 0 | $ret = $self->email_format ($display, $email); | |||||
| 3166 | } | ||||||
| 3167 | |||||||
| 3168 |    # Collapse whitespace against possible tabs and newlines in an  | 
||||||
| 3169 | # from googlegroups for instance. MIME::Entity seems to collapse | ||||||
| 3170 | # newlines, but not tabs. | ||||||
| 3171 | 0 | return non_empty (collapse_whitespace ($ret)); | |||||
| 3172 | } | ||||||
| 3173 | |||||||
| 3174 | # $display is a display part for the email "Foo", possibly empty "" | ||||||
| 3175 | # $email is a mailbox "foo@example.com", or undef or empty "" | ||||||
| 3176 |  # return an rfc822 "Foo  | 
||||||
| 3177 | # | ||||||
| 3178 | sub email_format { | ||||||
| 3179 | 0 | 0 | 0 | my ($self, $display, $email) = @_; | |||
| 3180 | ### $display | ||||||
| 3181 | 0 | $display = Text::Trim::trim($display); | |||||
| 3182 | 0 | $email = Text::Trim::trim($email); | |||||
| 3183 | 0 | 0 | if (is_empty($display)) { | ||||
| 3184 | 0 | 0 | if (is_empty($email)) { | ||||
| 3185 | 0 | return; | |||||
| 3186 | } else { | ||||||
| 3187 | 0 | return $email; | |||||
| 3188 | } | ||||||
| 3189 | } | ||||||
| 3190 | 0 | 0 | if (is_empty($email)) { | ||||
| 3191 | # think can't have empty <> or omitted, otherwise the quoted part is | ||||||
| 3192 | # still parsed as an address, certainly it's not rfc822 compliant to | ||||||
| 3193 | # omit | ||||||
| 3194 | 0 | $email = 'nobody@'.$self->uri_to_host; | |||||
| 3195 | } else { | ||||||
| 3196 | 0 | $email = $email; | |||||
| 3197 | } | ||||||
| 3198 | 0 | return email_phrase_quote_maybe($display) . " <$email>"; | |||||
| 3199 | } | ||||||
| 3200 | |||||||
| 3201 | # return $str with quotes like "Foo Bar" if it needs them to go in an email | ||||||
| 3202 | # display part | ||||||
| 3203 | sub email_phrase_quote_maybe { | ||||||
| 3204 | 0 | 0 | 0 | my ($str) = @_; | |||
| 3205 | 0 | 0 | return if ! defined $str; | ||||
| 3206 | |||||||
| 3207 | # RFC2822 "atext" characters, with "-" last | ||||||
| 3208 | 0 | 0 | if ($str =~ m<[^[:alnum:][:space:]!#\$%&'*+/=?^_`{|}~-]>) { | ||||
| 3209 | # strange chars, need to quote | ||||||
| 3210 | 0 | return email_phrase_quote($str); | |||||
| 3211 | } else { | ||||||
| 3212 | # alphanumeric and whitespace, no quotes | ||||||
| 3213 | 0 | return $str; | |||||
| 3214 | } | ||||||
| 3215 | } | ||||||
| 3216 | sub email_phrase_quote { | ||||||
| 3217 | 0 | 0 | 0 | my ($str) = @_; | |||
| 3218 | 0 | 0 | return if ! defined $str; | ||||
| 3219 | 0 | $str =~ s/^"(.*)"$/$1/; # strip existing quotes | |||||
| 3220 | 0 | $str =~ s/(["\\])/\\$1/g; # escape internal quotes and backslashes | |||||
| 3221 | 0 | return "\"$str\""; | |||||
| 3222 | } | ||||||
| 3223 | |||||||
| 3224 | |||||||
| 3225 | #------------------------------------------------------------------------------ | ||||||
| 3226 | # rss_newest_only | ||||||
| 3227 | |||||||
| 3228 | { | ||||||
| 3229 | my %multiplier = (minute => 60, | ||||||
| 3230 | hour => 3600, | ||||||
| 3231 | day => 86400, | ||||||
| 3232 | week => 86400 * 7, | ||||||
| 3233 | month => 365.25 * 86400 / 12, | ||||||
| 3234 | year => 365.25 * 86400, | ||||||
| 3235 | ); | ||||||
| 3236 | # return a target time_t, or undef | ||||||
| 3237 | sub rss_newest_only_timet { | ||||||
| 3238 | 0 | 0 | 0 | my ($self) = @_; | |||
| 3239 | |||||||
| 3240 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
| 3241 | 0 | 0 | if ($str =~ /^\s*(\d+)\s*(minute|hour|day|week|month|year)s?\s*$/) { | ||||
| 3242 | 0 | return time() - $1*$multiplier{$2}; | |||||
| 3243 | } | ||||||
| 3244 | } | ||||||
| 3245 | 0 | return undef; | |||||
| 3246 | } | ||||||
| 3247 | } | ||||||
| 3248 | |||||||
| 3249 | # return a number, or undef | ||||||
| 3250 | sub rss_newest_only_count { | ||||||
| 3251 | 0 | 0 | 0 | my ($self) = @_; | |||
| 3252 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
| 3253 | 0 | 0 | if (Scalar::Util::looks_like_number($str)) { | ||||
| 3254 | ### rss_newest_only number: $str | ||||||
| 3255 | 0 | return $str; | |||||
| 3256 | } | ||||||
| 3257 | } | ||||||
| 3258 | 0 | return undef; | |||||
| 3259 | } | ||||||
| 3260 | |||||||
| 3261 | # return @items restricted or filtered by rss_newest_only | ||||||
| 3262 | sub rss_newest_only_items { | ||||||
| 3263 | 0 | 0 | 0 | my ($self, @items) = @_; | |||
| 3264 | |||||||
| 3265 | 0 | 0 | if (defined (my $count = $self->rss_newest_only_count)) { | ||||
| 3266 | 0 | 0 | if ($count == 0) { | ||||
| 3267 | # rss_newest_only=>0 means don't apply a newest | ||||||
| 3268 | 0 | return @items; | |||||
| 3269 | } | ||||||
| 3270 | 0 | my $before = scalar(@items); | |||||
| 3271 | 0 | require Sort::Key::Top; | |||||
| 3272 | 0 | 0 | @items = Sort::Key::Top::rnkeytop (sub { $self->item_to_timet($_) }, | ||||
| 3273 | 0 | $count, @items); | |||||
| 3274 | |||||||
| 3275 | 0 | my $after = scalar(@items); | |||||
| 3276 | 0 | 0 | if ($before != $after) { | ||||
| 3277 | 0 | $self->verbose (1, " rss_newest_only reduce by count from $before items to $after items"); | |||||
| 3278 | } | ||||||
| 3279 | 0 | return @items; | |||||
| 3280 | } | ||||||
| 3281 | |||||||
| 3282 | 0 | 0 | if (defined (my $target_timet = $self->rss_newest_only_timet)) { | ||||
| 3283 | 0 | my $before = scalar(@items); | |||||
| 3284 | 0 | @items = grep { my $got_timet = $self->item_to_timet($_); | |||||
| 0 | |||||||
| 3285 | 0 | 0 | ! defined $got_timet || $got_timet >= $target_timet } | ||||
| 3286 | @items; | ||||||
| 3287 | 0 | my $after = scalar(@items); | |||||
| 3288 | 0 | 0 | if ($before != $after) { | ||||
| 3289 | 0 | $self->verbose (1, " rss_newest_only reduce by age from $before to $after items"); | |||||
| 3290 | } | ||||||
| 3291 | 0 | return @items; | |||||
| 3292 | } | ||||||
| 3293 | |||||||
| 3294 | 0 | 0 | if (defined (my $str = $self->{'rss_newest_only'})) { | ||||
| 3295 | 0 | die "rss2leafnode: unrecognised rss_newest_only: ",$str; | |||||
| 3296 | } | ||||||
| 3297 | 0 | return @items; | |||||
| 3298 | } | ||||||
| 3299 | |||||||
| 3300 | |||||||
| 3301 | #------------------------------------------------------------------------------ | ||||||
| 3302 | # fetch RSS | ||||||
| 3303 | |||||||
| 3304 | my $map_xmlns | ||||||
| 3305 | = { | ||||||
| 3306 | 'http://purl.org/rss/1.0/' => 'rss', | ||||||
| 3307 | 'http://www.w3.org/2005/Atom' => 'atom', | ||||||
| 3308 | 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf', | ||||||
| 3309 | 'http://purl.org/rss/1.0/modules/content/' => 'content', | ||||||
| 3310 | 'http://purl.org/rss/1.0/modules/slash/' => 'slash', | ||||||
| 3311 | 'http://purl.org/rss/1.0/modules/syndication/' => 'syn', | ||||||
| 3312 | 'http://purl.org/syndication/thread/1.0' => 'thr', | ||||||
| 3313 | 'http://wellformedweb.org/CommentAPI/' => 'wfw', | ||||||
| 3314 | 'http://www.w3.org/1999/xhtml' => 'xhtml', | ||||||
| 3315 | 'http://www.itunes.com/dtds/podcast-1.0.dtd' => 'itunes', | ||||||
| 3316 | 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner', | ||||||
| 3317 | |||||||
| 3318 | # http://www.rssboard.org/media-rss | ||||||
| 3319 | 'http://search.yahoo.com/mrss' => 'media', | ||||||
| 3320 | |||||||
| 3321 | 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo', | ||||||
| 3322 | 'http://www.georss.org/georss' => 'georss', | ||||||
| 3323 | 'http://www.pheedo.com/namespace/pheedo' => 'pheedo', | ||||||
| 3324 | 'http://api.twitter.com' => 'twitter', | ||||||
| 3325 | 'http://xmlns.com/foaf/0.1/' => 'foaf', | ||||||
| 3326 | 'http://status.net/ont/' => 'statusnet', | ||||||
| 3327 | 'http://rdfs.org/sioc/ns#' => 'sioc', | ||||||
| 3328 | 'http://www.slate.com' => 'slate', | ||||||
| 3329 | 'http://activitystrea.ms/spec/1.0/' => 'activity', | ||||||
| 3330 | 'http://ostatus.org/schema/1.0' => 'ostatus', | ||||||
| 3331 | |||||||
| 3332 | # http://tools.ietf.org/html/draft-snell-atompub-feed-index-10 | ||||||
| 3333 | 'http://purl.org/atompub/rank/1.0' => 're', | ||||||
| 3334 | |||||||
| 3335 | # per http://docs.jivesoftware.com/latest/documentation/rss.html#output | ||||||
| 3336 | 'http://www.jivesoftware.com/xmlns/jiveforums/rss' => 'jf', | ||||||
| 3337 | |||||||
| 3338 | # these two are different, but treat the same for now | ||||||
| 3339 | 'http://backend.userland.com/creativeCommonsRssModule'=>'creativeCommons', | ||||||
| 3340 | 'http://creativecommons.org/ns#' =>'creativeCommons', | ||||||
| 3341 | |||||||
| 3342 | # Common Alerts Protocol | ||||||
| 3343 | 'urn:oasis:names:tc:emergency:cap:1.1' => 'cap', | ||||||
| 3344 | |||||||
| 3345 | # central bank exchange rates format, | ||||||
| 3346 | # spec http://www.cbwiki.net/wiki/index.php/RSS-CBMain | ||||||
| 3347 | # eg. RBA http://www.rba.gov.au/rss/rss-cb-exchange-rates.xml | ||||||
| 3348 | 'http://www.cbwiki.net/wiki/index.php/Specification_1.1' => 'cb', | ||||||
| 3349 | |||||||
| 3350 | # earthquakes | ||||||
| 3351 | # eg. http://earthquake.usgs.gov/earthquakes/shakemap/rss.xml | ||||||
| 3352 | 'http://earthquake.usgs.gov/rss/1.0/' => 'eq', | ||||||
| 3353 | |||||||
| 3354 | 'http://purl.org/dc/elements/1.1/' => 'dc', | ||||||
| 3355 | 'http://purl.org/dc/terms/' => 'dcterms', | ||||||
| 3356 | |||||||
| 3357 | # purl.org might be supposed to be the home for wiki:, but it's a 404 | ||||||
| 3358 | # and usemod.com suggests its page instead | ||||||
| 3359 | # Spec at http://www.meatballwiki.org/wiki/ModWiki | ||||||
| 3360 | 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki', | ||||||
| 3361 | 'http://www.usemod.com/cgi-bin/mb.pl?ModWiki' => 'wiki', | ||||||
| 3362 | |||||||
| 3363 | # not sure if this is supposed to be necessary, but without it | ||||||
| 3364 | # "xml:lang" attributes are turned into "lang" | ||||||
| 3365 | 'http://www.w3.org/XML/1998/namespace' => 'xml', | ||||||
| 3366 | }; | ||||||
| 3367 | |||||||
| 3368 | sub twig_parse { | ||||||
| 3369 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
| 3370 | ### twig_parse() ... | ||||||
| 3371 | |||||||
| 3372 | # default "discard_spaces" chucks leading and trailing space on content, | ||||||
| 3373 | # which is usually a good thing | ||||||
| 3374 | # | ||||||
| 3375 | 0 | require XML::Twig; | |||||
| 3376 | 0 | XML::Twig->VERSION('3.34'); # for att_exists() | |||||
| 3377 | 0 | my $twig = XML::Twig->new (map_xmlns => $map_xmlns, | |||||
| 3378 | pretty_print => 'wrapped'); | ||||||
| 3379 | 0 | $twig->safe_parse ($xml); | |||||
| 3380 | 0 | my $err = $@; | |||||
| 3381 | ### $err | ||||||
| 3382 | |||||||
| 3383 | # Try to fix bad non-ascii chars by putting it through Encode::from_to(). | ||||||
| 3384 | # Encode::FB_DEFAULT substitutes U+FFFD when going to unicode, or question | ||||||
| 3385 | # mark "?" going to non-unicode. Mozilla does some sort of similar | ||||||
| 3386 | # liberal byte interpretation so as to at least display something from a | ||||||
| 3387 | # dodgy feed. | ||||||
| 3388 | # | ||||||
| 3389 | 0 | 0 | 0 | if ($err && $err =~ /not well-formed \(invalid token\) at (line \d+, column \d+, byte (\d+))/) { | |||
| 3390 | 0 | my $where = $1; | |||||
| 3391 | 0 | my $byte = ord(substr($xml,$2,1)); | |||||
| 3392 | 0 | 0 | if ($byte >= 128) { | ||||
| 3393 | 0 | 0 | my $charset = $twig->encoding // 'utf-8'; | ||||
| 3394 | 0 | $self->verbose (1, sprintf ("parse error, attempt re-code $charset for byte 0x%02X\n", $byte)); | |||||
| 3395 | 0 | require Encode; | |||||
| 3396 | 0 | my $recoded_xml = $xml; | |||||
| 3397 | 0 | Encode::from_to($recoded_xml, $charset, $charset, Encode::FB_DEFAULT()); | |||||
| 3398 | |||||||
| 3399 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
| 3400 | 0 | 0 | if ($twig->safe_parse ($recoded_xml)) { | ||||
| 3401 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
| 3402 | "Recoded bad bytes to charset $charset"); | ||||||
| 3403 | 0 | print __x("Feed {url}\n recoded {charset} to parse, expect substitutions for bad non-ascii\n ({where})\n", | |||||
| 3404 | url => $self->{'uri'}, | ||||||
| 3405 | charset => $charset, | ||||||
| 3406 | where => $where); | ||||||
| 3407 | 0 | undef $err; | |||||
| 3408 | } | ||||||
| 3409 | } | ||||||
| 3410 | } | ||||||
| 3411 | |||||||
| 3412 | # Or attempt to put it through XML::Liberal, if available. | ||||||
| 3413 | # | ||||||
| 3414 | 0 | 0 | if ($err) { | ||||
| 3415 | 0 | my $liberal_xml = $self->xml_liberal_correction($xml); | |||||
| 3416 | 0 | 0 | if (defined $liberal_xml) { | ||||
| 3417 | ### reparse xml liberal fixup with twig ... | ||||||
| 3418 | 0 | $twig = XML::Twig->new (map_xmlns => $map_xmlns); | |||||
| 3419 | 0 | 0 | if ($twig->safe_parse ($liberal_xml)) { | ||||
| 3420 | ### now ok ... | ||||||
| 3421 | 0 | $err = Text::Trim::trim($err); | |||||
| 3422 | 0 | $twig->root->set_att('rss2leafnode:fixup', | |||||
| 3423 | "XML::Liberal fixed: {error}", | ||||||
| 3424 | error => $err); | ||||||
| 3425 | 0 | print __x("Feed {url}\n parse error: {error}\n continuing with repairs by XML::Liberal\n", | |||||
| 3426 | url => $self->{'uri'}, | ||||||
| 3427 | error => $err); | ||||||
| 3428 | 0 | undef $err; | |||||
| 3429 | } | ||||||
| 3430 | } | ||||||
| 3431 | ### now err: $err | ||||||
| 3432 | } | ||||||
| 3433 | |||||||
| 3434 | 0 | 0 | if ($err) { | ||||
| 3435 | # XML::Parser seems to stick some spurious leading whitespace on the error | ||||||
| 3436 | 0 | $err = Text::Trim::trim($err); | |||||
| 3437 | |||||||
| 3438 | 0 | $self->verbose (1, __x("Parse error on URL {url}\n{error}", | |||||
| 3439 | url => $self->{'uri'}, | ||||||
| 3440 | error => $err)); | ||||||
| 3441 | 0 | return (undef, $err); | |||||
| 3442 | } | ||||||
| 3443 | |||||||
| 3444 | # Strip any explicit "rss:" or "atom:" namespace down to bare part. | ||||||
| 3445 | # Should be unambiguous and is easier than giving tag names both with and | ||||||
| 3446 | # without the namespace. Undocumented set_ns_as_default() might do this | ||||||
| 3447 | # ... or might not. | ||||||
| 3448 | # | ||||||
| 3449 | 0 | my $root = $twig->root; | |||||
| 3450 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'atom'); | |||||
| 3451 | 0 | App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'rss'); | |||||
| 3452 | |||||||
| 3453 | # somehow map_xmlns mangles default attributes like "decimals=...", prefer | ||||||
| 3454 | # to see them without rss: or atom: -- maybe | ||||||
| 3455 | # foreach my $child ($root->descendants_or_self) { | ||||||
| 3456 | # foreach my $attname ($child->att_names) { | ||||||
| 3457 | # if ($attname =~ /^(atom|rss):(.*)/) { | ||||||
| 3458 | # $child->change_att_name($attname, $2); | ||||||
| 3459 | # } | ||||||
| 3460 | # } | ||||||
| 3461 | # } | ||||||
| 3462 | |||||||
| 3463 | ### add xml base | ||||||
| 3464 | 0 | 0 | 0 | if (defined $self->{'uri'} && ! $root->att_exists('xml:base')) { | |||
| 3465 | 0 | $root->set_att ('xml:base', $self->{'uri'}); | |||||
| 3466 | } | ||||||
| 3467 | |||||||
| 3468 | ### success | ||||||
| 3469 | 0 | return ($twig, undef); | |||||
| 3470 | } | ||||||
| 3471 | |||||||
| 3472 | sub item_to_channel { | ||||||
| 3473 | 0 | 0 | 0 | my ($item) = @_; | |||
| 3474 | # parent for RSS or Atom, but sibling "channel" for RDF | ||||||
| 3475 | 0 | my $channel = $item->parent; | |||||
| 3476 | 0 | 0 | return ($channel->first_child('channel') | ||||
| 3477 | // $channel); | ||||||
| 3478 | } | ||||||
| 3479 | |||||||
| 3480 | # return a Message-ID string for this $item coming from $self->{'uri'} | ||||||
| 3481 | # | ||||||
| 3482 | sub item_to_msgid { | ||||||
| 3483 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3484 | |||||||
| 3485 | 0 | 0 | if (is_non_empty (my $id = $item->first_child_text('id'))) { | ||||
| 3486 |      # Atom  | 
||||||
| 3487 | 0 | return $self->url_to_msgid ($id, $item->first_child_text('updated')); | |||||
| 3488 | } | ||||||
| 3489 | |||||||
| 3490 | 0 | my $guid; | |||||
| 3491 | 0 | my $isPermaLink = 0; | |||||
| 3492 | 0 | 0 | if (my $elt = $item->first_child('guid')) { | ||||
| 3493 |      # ignore empty  | 
||||||
| 3494 | # http://abc.net.au/rn/podcast/feeds/sci.xml | ||||||
| 3495 | 0 | 0 | if (is_non_empty (my $str = collapse_whitespace ($elt->text))) { | ||||
| 3496 | 0 | $guid = $str; | |||||
| 3497 | 0 | 0 | $isPermaLink = (lc($elt->att('isPermaLink') // 'true') eq 'true'); | ||||
| 3498 | } | ||||||
| 3499 | } | ||||||
| 3500 | |||||||
| 3501 | 0 | 0 |    if ($isPermaLink) {   #  | 
||||
| 3502 | 0 | return $self->url_to_msgid ($guid); | |||||
| 3503 | } | ||||||
| 3504 | 0 | 0 | if (my $link = item_yahoo_permalink ($item)) { | ||||
| 3505 | 0 | return $self->url_to_msgid ($link); | |||||
| 3506 | } | ||||||
| 3507 | 0 | 0 |    if (defined $guid) {  #  | 
||||
| 3508 | 0 | return $self->url_to_msgid ($self->{'uri'}, $guid); | |||||
| 3509 | } | ||||||
| 3510 | |||||||
| 3511 | # nothing in the item, use the feed url and MD5 of some fields which | ||||||
| 3512 | # will hopefully distinguish it from other items at this url | ||||||
| 3513 | 0 | $self->verbose (2, ' msgid from MD5'); | |||||
| 3514 | 0 | return $self->url_to_msgid | |||||
| 3515 | ($self->{'uri'}, | ||||||
| 3516 | md5_of_utf8 (join_non_empty ('', | ||||||
| 3517 | 0 | map {$item->first_child_text($_)} | |||||
| 3518 | qw(title | ||||||
| 3519 | author | ||||||
| 3520 | dc:creator | ||||||
| 3521 | description | ||||||
| 3522 | content | ||||||
| 3523 | link | ||||||
| 3524 | pubDate | ||||||
| 3525 | published | ||||||
| 3526 | updated | ||||||
| 3527 | )))); | ||||||
| 3528 | } | ||||||
| 3529 |  # FIXME: is  | 
||||||
| 3530 |  #  | 
||||||
| 3531 | @known{qw(/channel/item/guid | ||||||
| 3532 | /channel/item/id | ||||||
| 3533 | /channel/item/wordzilla:id | ||||||
| 3534 | /channel/item/slate:id | ||||||
| 3535 | )} = (); | ||||||
| 3536 | |||||||
| 3537 | # Return an "In-Reply-To:" value for $item, being a space-separated list of | ||||||
| 3538 | # Message-ID strings including angles <>, or undef if nothing. The message | ||||||
| 3539 |  # ids match up to an Atom  | 
||||||
| 3540 | # | ||||||
| 3541 |  # RFC 4685 has  | 
||||||
| 3542 | # reply to multiple originals. | ||||||
| 3543 | # | ||||||
| 3544 | # Eg. comment feeds under | ||||||
| 3545 | # http://wickedgooddinner.blogspot.com/feeds/posts/default | ||||||
| 3546 | # | ||||||
| 3547 | sub item_to_in_reply_to { | ||||||
| 3548 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3549 | |||||||
| 3550 | 0 | my @ids; | |||||
| 3551 | 0 | foreach my $elt ($item->children('thr:in-reply-to')) { | |||||
| 3552 | 0 | 0 | my $ref = ($elt->att('thr:ref') | ||||
| 0 | |||||||
| 0 | |||||||
| 3553 | // $elt->att('ref') | ||||||
| 3554 | // $elt->att('atom:ref') # comes out atom: under map_xmlns ... | ||||||
| 3555 | // next); | ||||||
| 3556 | # probably shouldn't be relative actually ... | ||||||
| 3557 | 0 | $ref = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri ($elt, $ref); | |||||
| 3558 | 0 | push @ids, $self->url_to_msgid ($ref); | |||||
| 3559 | } | ||||||
| 3560 | 0 | 0 | if (@ids) { | ||||
| 3561 | 0 | return join (' ', @ids); | |||||
| 3562 | } else { | ||||||
| 3563 | 0 | return undef; | |||||
| 3564 | } | ||||||
| 3565 | } | ||||||
| 3566 | @known{qw(/channel/item/thr:in-reply-to | ||||||
| 3567 | )} = (); | ||||||
| 3568 | |||||||
| 3569 | # Return a string of comma separated keywords per RFC1036 and RFC2822. | ||||||
| 3570 | # | ||||||
| 3571 |  # RSS  | 
||||||
| 3572 | # that in as a bit of a fallback, being better than nothing for | ||||||
| 3573 | # classification. | ||||||
| 3574 | # | ||||||
| 3575 |  # Atom  | 
||||||
| 3576 | # attribute being the displayable part. Have seen only the "term" attribute | ||||||
| 3577 | # though. | ||||||
| 3578 | # | ||||||
| 3579 |  #  | 
||||||
| 3580 |  # it in for more classification for now.  Can have child  | 
||||||
| 3581 | # elements as sub-categories, but don't worry about them, haven't seen any | ||||||
| 3582 | # real ones, only the sample at | ||||||
| 3583 | # http://www.apple.com/itunes/podcasts/specs.html#example | ||||||
| 3584 | # | ||||||
| 3585 |  #  | 
||||||
| 3586 |  #  | 
||||||
| 3587 | # particularly informative. | ||||||
| 3588 | # | ||||||
| 3589 |  #  | 
||||||
| 3590 | # should be in the keywords if it's also in the body text, but at least | ||||||
| 3591 | # offers a bit of classification in the headers. | ||||||
| 3592 | # | ||||||
| 3593 |  #  | 
||||||
| 3594 | # want a bit of decoding. Not much used, but for instance | ||||||
| 3595 | # http://www.gdacs.org/xml/RSSTC.xml | ||||||
| 3596 | # http://earthquake.usgs.gov/eqcenter/recenteqsww/catalogs/eqs7day-M5.xml | ||||||
| 3597 | # | ||||||
| 3598 | # How much value is there in the channel keywords? | ||||||
| 3599 | # | ||||||
| 3600 | { | ||||||
| 3601 | my $re = qr/^(category | ||||||
| 3602 | |itunes:category | ||||||
| 3603 | |cap:category | ||||||
| 3604 | |itunes:keywords | ||||||
| 3605 | |media:keywords | ||||||
| 3606 | |dc:subject | ||||||
| 3607 | |slash:section | ||||||
| 3608 | |slate:topic | ||||||
| 3609 | )$/x; | ||||||
| 3610 | sub item_to_keywords { | ||||||
| 3611 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3612 | 0 | my $channel = item_to_channel($item); | |||||
| 3613 | |||||||
| 3614 | 0 | return join_non_empty | |||||
| 3615 | (', ', | ||||||
| 3616 | List::MoreUtils::uniq | ||||||
| 3617 | 0 | (map { collapse_whitespace($_) } | |||||
| 3618 | 0 | 0 | map { split /,/ } | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 3619 | 0 | map { ($_->att('text') # itunes:category | |||||
| 3620 | // $_->att('itunes:text') # itunes:category | ||||||
| 3621 |                 // $_->att('atom:label')  # atom  | 
||||||
| 3622 |                 // $_->att('label')       # atom  | 
||||||
| 3623 |                 // $_->att('atom:term')   # atom  | 
||||||
| 3624 |                 // $_->att('term')        # atom  | 
||||||
| 3625 | // $_->text) } # other | ||||||
| 3626 | ($item->children($re), | ||||||
| 3627 | $channel->children($re), | ||||||
| 3628 |           #  | 
||||||
| 3629 | 0 | map {$_->children('cb:keyword')} $item->children, | |||||
| 3630 | ))); | ||||||
| 3631 | } | ||||||
| 3632 |    # maybe could show  | 
||||||
| 3633 | # too, for now just omit | ||||||
| 3634 | @known{qw(/channel/category | ||||||
| 3635 | /channel/itunes:category | ||||||
| 3636 | /channel/itunes:category/itunes:category | ||||||
| 3637 | |||||||
| 3638 | /channel/item/category | ||||||
| 3639 | /channel/item/itunes:keywords | ||||||
| 3640 | /channel/item/media:keywords | ||||||
| 3641 | /channel/item/slash:section | ||||||
| 3642 | /channel/item/slate:topic | ||||||
| 3643 | /channel/item/slate:section | ||||||
| 3644 | )} = (); | ||||||
| 3645 | } | ||||||
| 3646 | |||||||
| 3647 | { | ||||||
| 3648 | # Feturn a string for the "Importance:" header of RFC 1911, RFC 2156 | ||||||
| 3649 | # voice and X.400 messaging. Possible values 'high', 'normal', 'low'. | ||||||
| 3650 | # 'normal' is the header default, return undef in that case in the | ||||||
| 3651 | # interests of not junking up headers with defaults | ||||||
| 3652 | # | ||||||
| 3653 | my %cap_severity_high = (extreme => 1, | ||||||
| 3654 | severe => 1); | ||||||
| 3655 | my %cap_severity_normal = (moderate => 1); | ||||||
| 3656 | my %cap_severity_low = (minor => 1); | ||||||
| 3657 | |||||||
| 3658 | sub item_to_importance { | ||||||
| 3659 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3660 | |||||||
| 3661 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
| 3662 | // ''); | ||||||
| 3663 | 0 | 0 | my $wiki_importance = ($item->first_child_trimmed_text('wiki:importance') | ||||
| 3664 | // ''); | ||||||
| 3665 | 0 | 0 | if ($cap_severity) { | ||||
| 3666 | 0 | $self->verbose (2, " CAP severity: ",$cap_severity); | |||||
| 3667 | 0 | $self->verbose (2, " Wiki importance: ",$wiki_importance); | |||||
| 3668 | } | ||||||
| 3669 | |||||||
| 3670 | 0 | 0 | if ($cap_severity_high{$cap_severity}) { | ||||
| 3671 | 0 | return 'high'; | |||||
| 3672 | } | ||||||
| 3673 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
| 3674 | 0 | return undef; # default "normal" | |||||
| 3675 | } | ||||||
| 3676 | 0 | 0 | 0 | if ($cap_severity_low{$cap_severity} | |||
| 3677 | || $wiki_importance eq 'minor') { | ||||||
| 3678 | 0 | return 'low'; | |||||
| 3679 | } | ||||||
| 3680 | 0 | return undef; # unknown | |||||
| 3681 | } | ||||||
| 3682 | @known{qw(/channel/item/wiki:importance | ||||||
| 3683 | )} = (); | ||||||
| 3684 | } | ||||||
| 3685 | { | ||||||
| 3686 | # Return a string for the "Priority:" header of RFC 1327, RFC 2156. | ||||||
| 3687 | # Possible values 'urgent', 'normal', 'non-urgent'. | ||||||
| 3688 | # 'normal' is the header default, return undef in that case in the | ||||||
| 3689 | # interests of not junking up headers with defaults | ||||||
| 3690 | # | ||||||
| 3691 |    #  | 
||||||
| 3692 |    # for when response action should be taken.  Is the  | 
||||||
| 3693 | # better indicator of transmission priority? | ||||||
| 3694 | # | ||||||
| 3695 | my %cap_severity_urgent = (extreme => 1, | ||||||
| 3696 | severe => 1); | ||||||
| 3697 | my %cap_severity_normal = (moderate => 1); | ||||||
| 3698 | |||||||
| 3699 | sub item_to_priority { | ||||||
| 3700 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3701 | |||||||
| 3702 | 0 | 0 | my $cap_severity = lc($item->first_child_trimmed_text('cap:severity') | ||||
| 3703 | // ''); | ||||||
| 3704 | |||||||
| 3705 | 0 | 0 | if ($cap_severity_urgent{$cap_severity}) { | ||||
| 3706 | 0 | return 'urgent'; | |||||
| 3707 | } | ||||||
| 3708 | 0 | 0 | if ($cap_severity_normal{$cap_severity}) { | ||||
| 3709 | 0 | return undef; # default "normal" | |||||
| 3710 | } | ||||||
| 3711 | 0 | if (0) { # nothing for this yet | |||||
| 3712 | return 'non-urgent'; | ||||||
| 3713 | } | ||||||
| 3714 | 0 | return undef; # unknown | |||||
| 3715 | } | ||||||
| 3716 | } | ||||||
| 3717 | |||||||
| 3718 | # return a string for the slightly unofficial "Precedence:" header | ||||||
| 3719 | # might be able to identify lists gatewayed to RSS and give "list" for them | ||||||
| 3720 | # maybe "bulk" would suit low priority stuff | ||||||
| 3721 | # for now nothing | ||||||
| 3722 | # | ||||||
| 3723 | # sub item_to_precedence { | ||||||
| 3724 | # my ($self, $item) = @_; | ||||||
| 3725 | # return undef; # nothing | ||||||
| 3726 | # } | ||||||
| 3727 | |||||||
| 3728 | # return the host part of $self->{'uri'}, or "localhost" if none | ||||||
| 3729 | sub uri_to_host { | ||||||
| 3730 | 0 | 0 | 0 | my ($self) = @_; | |||
| 3731 | 0 | my $uri = $self->{'uri'}; | |||||
| 3732 | ### uri_to_host(): $uri | ||||||
| 3733 | 0 | 0 | return (non_empty ($uri && $uri->can('host') && $uri->host) | ||||
| 0 | |||||||
| 3734 | // 'localhost'); | ||||||
| 3735 | } | ||||||
| 3736 | |||||||
| 3737 | sub item_to_subject { | ||||||
| 3738 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3739 | |||||||
| 3740 |    # Atom  | 
||||||
| 3741 | return | ||||||
| 3742 | 0 | 0 | (elt_to_rendered_line ($item->first_child('title')) | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 3743 | |||||||
| 3744 |       #  | 
||||||
| 3745 |       # present without a plain  | 
||||||
| 3746 | # | ||||||
| 3747 | // elt_to_rendered_line ($item->first_child('dc:title')) | ||||||
| 3748 | |||||||
| 3749 |       # eg. https://archive.org/services/collection-rss.php has  | 
||||||
| 3750 |       # in addition to plain  | 
||||||
| 3751 |       #  | 
||||||
| 3752 | # | ||||||
| 3753 | // elt_to_rendered_line ($item->first_child('media:title')) | ||||||
| 3754 | |||||||
| 3755 |       #  | 
||||||
| 3756 |       # better than nothing.  Not sure have ever actually seen  | 
||||||
| 3757 |       # without  | 
||||||
| 3758 | # | ||||||
| 3759 | // elt_to_rendered_line ($item->first_child('dc:subject')) | ||||||
| 3760 | |||||||
| 3761 | // __('no subject')); | ||||||
| 3762 | } | ||||||
| 3763 | @known{qw(/channel/title | ||||||
| 3764 | /channel/dc:subject | ||||||
| 3765 | /channel/subtitle | ||||||
| 3766 | /channel/itunes:subtitle | ||||||
| 3767 | |||||||
| 3768 | /channel/item/dc:subject | ||||||
| 3769 | /channel/item/title | ||||||
| 3770 | /channel/item/media:title | ||||||
| 3771 | /channel/item/dc:title | ||||||
| 3772 | /channel/item/itunes:title | ||||||
| 3773 | /channel/item/itunes:subtitle --not-using-this-as-yet | ||||||
| 3774 | /channel/item/slate:menuline --copy-of-subject-it-seems | ||||||
| 3775 | /channel/item/slate:rubric --blog-title | ||||||
| 3776 | /channel/item/slate:blog --blog-title | ||||||
| 3777 | /channel/item/slate:legacy_url --same-as-link-it-seems | ||||||
| 3778 | )} = (); | ||||||
| 3779 | |||||||
| 3780 | |||||||
| 3781 | # return language code string for Content-Language, or undef | ||||||
| 3782 | # return is per RFC 1766, RFC 3066, RFC 4646 | ||||||
| 3783 | # | ||||||
| 3784 | # xml:lang is defined to be per RFC 4646, no mangling needed | ||||||
| 3785 |  # RSS  | 
||||||
| 3786 |  #  | 
||||||
| 3787 | # cf. I18N::LangTags if mangling might be needed one day | ||||||
| 3788 | # | ||||||
| 3789 | sub item_to_language { | ||||||
| 3790 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3791 | 0 | my $lang; | |||||
| 3792 | |||||||
| 3793 | 0 | 0 | if (my $elt = $item->first_child('content')) { | ||||
| 3794 | 0 | $lang = non_empty ($elt->att('xml:lang')); | |||||
| 3795 | } | ||||||
| 3796 |    # Either  | 
||||||
| 3797 | # xml:lang="" tag, in the item itself or in channel, and maybe xml:lang in | ||||||
| 3798 |    # toplevel  | 
||||||
| 3799 |    # xml:lang, not a  | 
||||||
| 3800 | 0 | for ( ; $item; $item = $item->parent) { | |||||
| 3801 | 0 | 0 | $lang //= (non_empty ($item->first_child_trimmed_text | ||||
| 0 | |||||||
| 0 | |||||||
| 3802 | (qr/^((dc:)?language|twitter:lang)$/)) | ||||||
| 3803 | // non_empty ($item->att('xml:lang')) | ||||||
| 3804 | // next); | ||||||
| 3805 | } | ||||||
| 3806 | 0 | 0 | return ($lang // $self->{'resp'}->content_language); | ||||
| 3807 | } | ||||||
| 3808 | @known{qw(/channel/language | ||||||
| 3809 | /channel/dc:language | ||||||
| 3810 | /channel/twitter:lang | ||||||
| 3811 | /channel/item/language | ||||||
| 3812 | /channel/item/dc:language | ||||||
| 3813 | /channel/item/twitter:lang | ||||||
| 3814 | )} = (); | ||||||
| 3815 | |||||||
| 3816 | # return arrayref of copyright strings | ||||||
| 3817 | # Keep all of multiple rights/license/etc in the interests of preserving all | ||||||
| 3818 | # statements. | ||||||
| 3819 | sub item_to_copyright { | ||||||
| 3820 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3821 | 0 | my $channel = item_to_channel($item); | |||||
| 3822 | |||||||
| 3823 |    #  | 
||||||
| 3824 | # suppress the latter in the presence of the former (dcterms: collapsed to | ||||||
| 3825 | # dc: by the map_xmlns). | ||||||
| 3826 | # | ||||||
| 3827 |    # Atom  | 
||||||
| 3828 | # always plain text | ||||||
| 3829 | # | ||||||
| 3830 | 0 | my $re = qr/^(rights # Atom | |||||
| 3831 | |copyright # RSS, don't think entity-encoded html allowed there | ||||||
| 3832 | |dcterms:license | ||||||
| 3833 | |dc:rights | ||||||
| 3834 | |creativeCommons:licen[cs]e | ||||||
| 3835 | )$/x; | ||||||
| 3836 |    # Atom sub-elem  | 
||||||
| 3837 | 0 | my @parents = ($item, $channel, $item->children('source')); | |||||
| 3838 | |||||||
| 3839 | 0 | my @strings; | |||||
| 3840 | 0 | foreach my $elt (map {$_->children($re)} @parents) { | |||||
| 0 | |||||||
| 3841 | 0 | push @strings, | |||||
| 3842 | join_non_empty(' ', | ||||||
| 3843 | elt_to_rendered_line($elt), | ||||||
| 3844 |                       # eg.  | 
||||||
| 3845 | $elt->att('rdf:resource')); | ||||||
| 3846 | } | ||||||
| 3847 | |||||||
| 3848 | # | ||||||
| 3849 | 0 | foreach my $link (map {$_->children('link')} @parents) { | |||||
| 0 | |||||||
| 3850 | ### link for copyright: $link->sprint | ||||||
| 3851 | 0 | 0 | 0 | if (($link->att('atom:rel')//$link->att('rel')//'') eq 'license') { | |||
| 0 | |||||||
| 3852 | 0 | 0 | push @strings, $link->att('atom:href')//$link->att('href'); | ||||
| 3853 | } | ||||||
| 3854 | } | ||||||
| 3855 | ### @strings | ||||||
| 3856 | 0 | return [ List::MoreUtils::uniq(grep {defined} @strings) ]; | |||||
| 0 | |||||||
| 3857 | } | ||||||
| 3858 | @known{qw(/channel/copyright | ||||||
| 3859 | /channel/rights | ||||||
| 3860 | /channel/dc:rights | ||||||
| 3861 | /channel/dc:license | ||||||
| 3862 | /channel/creativeCommons:licence | ||||||
| 3863 | /channel/creativeCommons:license | ||||||
| 3864 | /channel/item/dc:rights | ||||||
| 3865 | /channel/item/dc:license | ||||||
| 3866 | /channel/item/creativeCommons:licence | ||||||
| 3867 | /channel/item/creativeCommons:license | ||||||
| 3868 | )} = (); | ||||||
| 3869 | # /channel/item/media:credit --nothing-much-in-this-one | ||||||
| 3870 | |||||||
| 3871 | |||||||
| 3872 | # return string or undef | ||||||
| 3873 | sub item_to_generator { | ||||||
| 3874 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3875 | 0 | my $channel = item_to_channel($item); | |||||
| 3876 | 0 | my @strings; | |||||
| 3877 | |||||||
| 3878 |    # both RSS and Atom use  | 
||||||
| 3879 | # Atom can include version="" and uri="" | ||||||
| 3880 | 0 | 0 | if (my $generator = $channel->first_child('generator')) { | ||||
| 3881 | 0 | push @strings, join_non_empty (' ', | |||||
| 3882 | $generator->text, | ||||||
| 3883 | $generator->att('atom:version'), | ||||||
| 3884 | $generator->att('version'), | ||||||
| 3885 | $generator->att('atom:uri'), | ||||||
| 3886 | $generator->att('uri')); | ||||||
| 3887 | } | ||||||
| 3888 | |||||||
| 3889 | # FIXME: is this bit right? | ||||||
| 3890 |    #  | ||||||
| 3891 | # source="<a href="http://nongnu.org/identica-mode/" rel="nofollow">Emacs Identica-mode</a>" | ||||||
| 3892 | # source_link="http://nongnu.org/identica-mode/"> | ||||||
| 3893 | # | ||||||
| 3894 | 0 | 0 | if (my $notice = $item->first_child('statusnet:notice_info')) { | ||||
| 3895 | 0 | 0 | if (defined (my $html = $notice->att('atom:source'))) { | ||||
| 3896 | 0 | push @strings, join_non_empty (' ', | |||||
| 3897 | html_to_rendered_line($html), | ||||||
| 3898 | $notice->att('atom:source_link')); | ||||||
| 3899 | } | ||||||
| 3900 | } | ||||||
| 3901 | |||||||
| 3902 | 0 | return collapse_whitespace (join_non_empty (', ', @strings)); | |||||
| 3903 | } | ||||||
| 3904 | @known{qw(/channel/item/statusnet:notice_info | ||||||
| 3905 | )} = (); | ||||||
| 3906 | |||||||
| 3907 | # return URL string or undef/empty | ||||||
| 3908 | sub item_to_feedburner { | ||||||
| 3909 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 3910 | 0 | my $channel = item_to_channel($item); | |||||
| 3911 | 0 | 0 | my $elt = $channel->first_child('feedburner:info') || return; | ||||
| 3912 | 0 | 0 | my $uri = $elt->att('uri') // return; | ||||
| 3913 | 0 | return URI->new_abs ($uri, 'http://feeds.feedburner.com/')->as_string; | |||||
| 3914 | } | ||||||
| 3915 | |||||||
| 3916 |  # $elt is an Atom  | 
||||||
| 3917 | sub atom_content_flavour { | ||||||
| 3918 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 3919 | 0 | 0 | if (! defined $elt) { return ''; } | ||||
| 0 | |||||||
| 3920 | 0 | 0 | my $type = ($elt->att('atom:type') // $elt->att('type')); | ||||
| 3921 | 0 | 0 | 0 | if ($elt->att('atom:src') || $elt->att('src')) { | |||
| 3922 |      #  | 
||||||
| 3923 | 0 | return 'link'; | |||||
| 3924 | } | ||||||
| 3925 | 0 | 0 | 0 | if (! defined $type | |||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 3926 | || $type eq 'html' | ||||||
| 3927 | || $type eq 'xhtml' | ||||||
| 3928 | || $type eq 'application/xhtml+xml' | ||||||
| 3929 | || $type =~ m{^text/}) { | ||||||
| 3930 | 0 | return 'body'; | |||||
| 3931 | } | ||||||
| 3932 | 0 | return 'attach'; | |||||
| 3933 | } | ||||||
| 3934 | |||||||
| 3935 | sub html_wrap_fragment { | ||||||
| 3936 | 0 | 0 | 0 | my ($item, $fragment, $language) = @_; | |||
| 3937 | 0 | 0 | my $charset = (is_ascii($fragment) ? 'us-ascii' : 'utf-8'); | ||||
| 3938 | 0 | my $base_uri = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base($item); | |||||
| 3939 | 0 | 0 | my $base_header = (defined $base_uri | ||||
| 3940 |                       ? "   | 
||||||
| 3941 | : ''); | ||||||
| 3942 | 0 | 0 | if (is_non_empty ($language)) { | ||||
| 3943 | 0 | $language = " lang=\"$Entitize{$language}\""; | |||||
| 3944 | } else { | ||||||
| 3945 | 0 | $language = ''; | |||||
| 3946 | } | ||||||
| 3947 | 0 | return (<<"HERE", $charset); | |||||
| 3948 | |||||||
| 3949 | |||||||
| 3950 | |||||||
| 3951 | |||||||
| 3952 | $base_header | ||||||
| 3953 | |||||||
| 3954 | $fragment | ||||||
| 3955 | |||||||
| 3956 | HERE | ||||||
| 3957 | } | ||||||
| 3958 | |||||||
| 3959 | # $self->{'rss_charset_override'}, if set, means the bytes are actually in | ||||||
| 3960 | # that charset. Enforce this by replacing the " | ||||||
| 3961 | # bytes. Do a decode() and re-encode() to cope with non-ascii like say | ||||||
| 3962 | # utf-16. | ||||||
| 3963 | # | ||||||
| 3964 | # XML::RSS::LibXML has an "encoding" option on its new(), but that's for | ||||||
| 3965 | # feed creation or something, a parse() still follows the tag. | ||||||
| 3966 | # | ||||||
| 3967 | sub enforce_rss_charset_override { | ||||||
| 3968 | 0 | 0 | 0 | my ($self, $xml) = @_; | |||
| 3969 | 0 | 0 | if (my $charset = $self->{'rss_charset_override'}) { | ||||
| 3970 | 0 | $xml = Encode::decode ($charset, $xml); | |||||
| 3971 | 0 | 0 | if ($xml =~ s/(<\?xml[^>]*encoding="?)([^">]+)/$1$charset/i) { | ||||
| 0 | |||||||
| 3972 | 0 | $self->verbose (2, "replace encoding=$2 tag with encoding=$charset"); | |||||
| 3973 | } elsif ($xml =~ s/(<\?xml[^?>]*)/$1 encoding="$charset"/i) { | ||||||
| 3974 | 0 | $self->verbose (2, "insert encoding=\"$charset\""); | |||||
| 3975 | } else { | ||||||
| 3976 | 0 | my $str = "\n"; | |||||
| 3977 | 0 | $self->verbose (2, "insert $str"); | |||||
| 3978 | 0 | $xml = $str . $xml; | |||||
| 3979 | } | ||||||
| 3980 | 0 | $self->verbose (3, "xml now:\n$xml\n"); | |||||
| 3981 | 0 | $xml = Encode::encode ($charset, $xml); | |||||
| 3982 | } | ||||||
| 3983 | 0 | return $xml; | |||||
| 3984 | } | ||||||
| 3985 | |||||||
| 3986 | # slightly experimental extract of "cap" fields as from | ||||||
| 3987 | # http://www.nws.noaa.gov/alerts-beta/ | ||||||
| 3988 | # http://www.weather.gov/alerts-beta/ca.php?x=0 | ||||||
| 3989 | sub item_common_alert_protocol { | ||||||
| 3990 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
| 3991 | 0 | my @fields; | |||||
| 3992 | 0 | foreach my $elt ($item->children(qr/^cap:/)) { | |||||
| 3993 | 0 | (my $field = $elt->name) =~ s/^cap://; | |||||
| 3994 | 0 | 0 | 0 | if ($field eq 'geocode' || $field eq 'parameter') { | |||
| 3995 | # dunno how to show these yet ... | ||||||
| 3996 | 0 | next; | |||||
| 3997 | } | ||||||
| 3998 | 0 | $known{'/channel/item/'.$elt->name} = undef; | |||||
| 3999 | |||||||
| 4000 | 0 | my $value = elt_to_rendered_line ($elt); | |||||
| 4001 | 0 | $value = Text::Trim::trim ($value); | |||||
| 4002 | 0 | 0 | if (is_non_empty ($value)) { | ||||
| 4003 | 0 | push @fields, [ "\u$field: ", $value ]; | |||||
| 4004 | } | ||||||
| 4005 | } | ||||||
| 4006 | 0 | 0 | if (! @fields) { | ||||
| 4007 | 0 | return ''; | |||||
| 4008 | } | ||||||
| 4009 | # FIXME: This $width padding doesn't come out in html, only in text. The | ||||||
| 4010 | # NOAA is Atom plain text, so that one is ok at least. | ||||||
| 4011 | 0 | my $width = max(map {length $_->[0]} @fields); | |||||
| 0 | |||||||
| 4012 | 0 | @fields = map { my $field = $_->[0]; | |||||
| 0 | |||||||
| 4013 | 0 | my $value = $_->[1]; | |||||
| 4014 | 0 | $field = sprintf ('%-*s', $width, $field); | |||||
| 4015 | 0 | $self->text_wrap ($value, $field) | |||||
| 4016 | } @fields; | ||||||
| 4017 | 0 | 0 | if ($want_html) { | ||||
| 4018 | 0 |      return " \n"  | 
|||||
| 4019 | 0 |        . join(" \n", map {$Entitize{$_}} @fields)  | 
|||||
| 4020 | . "\n\n"; | ||||||
| 4021 | } else { | ||||||
| 4022 | 0 | return "\n" | |||||
| 4023 | . join("\n", @fields) | ||||||
| 4024 | . "\n"; | ||||||
| 4025 | } | ||||||
| 4026 | } | ||||||
| 4027 | |||||||
| 4028 | sub item_unknowns { | ||||||
| 4029 | 0 | 0 | 0 | my ($self, $item, $want_html) = @_; | |||
| 4030 | ### item_unknowns() ... | ||||||
| 4031 | |||||||
| 4032 | 0 | my $xml = ''; | |||||
| 4033 | 0 | 0 | foreach my $elt (map {$_->tag eq 'media:group' # descend into media:group | ||||
| 0 | |||||||
| 4034 | ? $_->children : $_} | ||||||
| 4035 | $item->children) { | ||||||
| 4036 | 0 | 0 | next if $elt->tag =~ /^#/; # text | ||||
| 4037 | 0 | 0 | next if App::RSS2Leafnode::XML::Twig::Other::elt_is_empty($elt); | ||||
| 4038 | 0 | my $path = $elt->path; | |||||
| 4039 | 0 | $path =~ s{^/(rss|channel)/channel}{/channel}; | |||||
| 4040 | 0 | $path =~ s{^/(feed|rdf:RDF)}{/channel}; | |||||
| 4041 | 0 | $path =~ s{^/channel/entry}{/channel/item}; | |||||
| 4042 | 0 | 0 | next if $path =~ m{/xhtml}; | ||||
| 4043 | 0 | 0 | next if $path =~ m{^/channel/item/(description|content:encoded)/}; | ||||
| 4044 | 0 | 0 | next if exists $known{$path}; | ||||
| 4045 | ### unknown path: $path | ||||||
| 4046 | |||||||
| 4047 | 0 | require Text::Wrap; | |||||
| 4048 | 0 | my $part = do { | |||||
| 4049 | 0 | local $Text::Wrap::columns = $self->{'render_width'} + 1 + 4; | |||||
| 4050 | 0 | local $Text::Wrap::huge = 'overflow'; # don't break long words | |||||
| 4051 | 0 | local $Text::Wrap::unexpand = 0; # no tabs in output | |||||
| 4052 | 0 | $elt->sprint | |||||
| 4053 | }; | ||||||
| 4054 | 0 | $part =~ s/^ //mg; # indentation from element depth | |||||
| 4055 | 0 | $part =~ s/^\n+//; # leading blank lines | |||||
| 4056 | 0 | $xml .= $part; | |||||
| 4057 | } | ||||||
| 4058 | 0 | 0 | if ($xml eq '') { | ||||
| 4059 | 0 | return ''; | |||||
| 4060 | } | ||||||
| 4061 | ### $xml | ||||||
| 4062 | |||||||
| 4063 | 0 | 0 | if ($want_html) { | ||||
| 4064 | 0 |      return "\n \n" . __('Further feed XML:') . "  | 
|||||
| 4065 |        . "$Entitize{$xml}\n\n";  | 
||||||
| 4066 | } else { | ||||||
| 4067 | 0 | return "\n" . __('Further feed XML:') . "\n" . $xml; | |||||
| 4068 | } | ||||||
| 4069 | } | ||||||
| 4070 | |||||||
| 4071 | @known{qw(/channel/item/media:group/media:title | ||||||
| 4072 | /channel/item/media:group/media:description | ||||||
| 4073 | /channel/item/media:group/media:credit | ||||||
| 4074 | /channel/item/media:group/media:player | ||||||
| 4075 | /channel/item/media:group/media:thumbnail | ||||||
| 4076 | /channel/item/media:group/media:content | ||||||
| 4077 | /channel/item/media:group/media:copyright | ||||||
| 4078 | |||||||
| 4079 | --ENHANCE-ME--nothing-for-these-yet | ||||||
| 4080 | /channel/item/media:group/media:category | ||||||
| 4081 | /channel/item/media:group/media:rating | ||||||
| 4082 | )} = (); # hash slice | ||||||
| 4083 | |||||||
| 4084 | sub media_group_to_html { | ||||||
| 4085 | 0 | 0 | 0 | my ($self, $group) = @_; | |||
| 4086 | ### media_group_to_html(): "$group" | ||||||
| 4087 | |||||||
| 4088 | 0 |    my $ret = " \n";  | 
|||||
| 4089 | 0 | my @lines; | |||||
| 4090 | |||||||
| 4091 | 0 | foreach my $elt ($group->children('media:title'), | |||||
| 4092 | $group->children('media:description')) { | ||||||
| 4093 | 0 | push @lines, elt_to_html($elt); | |||||
| 4094 | } | ||||||
| 4095 | |||||||
| 4096 | 0 | foreach my $elt ($group->children('media:credit')) { | |||||
| 4097 | 0 | my $html = elt_to_html($elt); | |||||
| 4098 | 0 | 0 | if (defined (my $role = non_empty($elt->att('role')))) { | ||||
| 4099 | 0 | $html .= " ($Entitize{$role})"; | |||||
| 4100 | } | ||||||
| 4101 | 0 | push @lines, $html; | |||||
| 4102 | } | ||||||
| 4103 | 0 | foreach my $elt ($group->children('media:player'), | |||||
| 4104 | $group->children('media:thumbnail'), | ||||||
| 4105 | $group->children('media:content')) { | ||||||
| 4106 | 0 | 0 | my $url = $elt->att('url') // next; | ||||
| 4107 | 0 | my $abs_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_based_uri | |||||
| 4108 | ($group, $url); | ||||||
| 4109 | |||||||
| 4110 | 0 | my $html = " | |||||
| 4111 | 0 | 0 | if (defined (my $type = non_empty($elt->att('type')))) { | ||||
| 4112 | 0 | $html .= " type=\"$Entitize{$type}\""; | |||||
| 4113 | } | ||||||
| 4114 | 0 | 0 | if (defined (my $lang = non_empty($elt->att('lang')))) { | ||||
| 4115 | 0 | $html .= " hreflang=\"$Entitize{$lang}\""; | |||||
| 4116 | } | ||||||
| 4117 | 0 | $html .= ">$Entitize{$url}$url"; | |||||
| 4118 | { | ||||||
| 4119 | 0 | my @paren; | |||||
| 0 | |||||||
| 4120 | 0 | 0 | if (defined (my $size = non_empty($elt->att('fileSize')))) { | ||||
| 4121 | 0 | push @paren, $self->format_size_in_bytes($size); | |||||
| 4122 | } | ||||||
| 4123 | 0 | 0 | if (defined (my $duration = non_empty($elt->att('duration')))) { | ||||
| 4124 | 0 | 0 | if ($duration !~ /:/) { | ||||
| 4125 | 0 | $duration = __px('s-for-seconds', '{duration}s', | |||||
| 4126 | duration => $duration); | ||||||
| 4127 | } | ||||||
| 4128 | 0 | push @paren, $duration; | |||||
| 4129 | } | ||||||
| 4130 | 0 | 0 | if (@paren) { | ||||
| 4131 | 0 | $html .= $Entitize{' (' . join(', ',@paren). ')'}; | |||||
| 4132 | } | ||||||
| 4133 | } | ||||||
| 4134 | 0 | $html .= "\n"; | |||||
| 4135 | 0 | push @lines, $html; | |||||
| 4136 | } | ||||||
| 4137 | |||||||
| 4138 | 0 | foreach my $elt ($group->children('media:copyright')) { | |||||
| 4139 | 0 | push @lines, "Copyright: ".elt_to_html($elt); | |||||
| 4140 | } | ||||||
| 4141 | |||||||
| 4142 | ### total lines: scalar(@lines) | ||||||
| 4143 | 0 |    return " \n" . join("  | 
|||||
| 4144 | } | ||||||
| 4145 | |||||||
| 4146 | sub elt_to_html { | ||||||
| 4147 | 0 | 0 | 0 | my ($elt) = @_; | |||
| 4148 | 0 | 0 | defined $elt or return; | ||||
| 4149 | |||||||
| 4150 | 0 | my $type = elt_content_type ($elt); | |||||
| 4151 | 0 | 0 | if ($type eq 'xhtml') { | ||||
| 4152 | 0 | return elt_xhtml_to_html($elt); | |||||
| 4153 | } | ||||||
| 4154 | 0 | my $str = elt_subtext($elt); | |||||
| 4155 | 0 | 0 | if ($type eq 'html') { | ||||
| 4156 | 0 | return $str; | |||||
| 4157 | } else { | ||||||
| 4158 | 0 | return $Entitize{$str}; | |||||
| 4159 | } | ||||||
| 4160 | } | ||||||
| 4161 | |||||||
| 4162 | # $body construction below | ||||||
| 4163 | @known{qw(/channel/item/description | ||||||
| 4164 | /channel/item/dc:description | ||||||
| 4165 | /channel/item/itunes:summary | ||||||
| 4166 | /channel/item/content:encoded | ||||||
| 4167 | /channel/item/summary | ||||||
| 4168 | )} = (); | ||||||
| 4169 | |||||||
| 4170 | # $item is an XML::Twig::Elt | ||||||
| 4171 | # | ||||||
| 4172 | sub fetch_rss_process_one_item { | ||||||
| 4173 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 4174 | 0 | my $subject = $self->item_to_subject ($item); | |||||
| 4175 | 0 | $self->verbose (1, ' ', __x('item: {subject}', subject => $subject)); | |||||
| 4176 | |||||||
| 4177 | 0 | my $msgid = $self->item_to_msgid ($item); | |||||
| 4178 | 0 | my $new = 0; | |||||
| 4179 | |||||||
| 4180 | 0 | 0 | if (! $self->nntp_message_id_exists ($msgid)) { | ||||
| 4181 | 0 | my $channel = item_to_channel($item); | |||||
| 4182 | 0 | my ($from, @from_links) = $self->item_to_from($item); | |||||
| 4183 | 0 | my @links = ($self->item_to_links ($item), | |||||
| 4184 | @from_links); | ||||||
| 4185 | |||||||
| 4186 | # For comments feeds show "Re: Foo" as the subject. Haven't seen a | ||||||
| 4187 |      # comments feed with anything useful in the  | 
||||||
| 4188 | # including it at the start of the message body if it was any good. | ||||||
| 4189 | # | ||||||
| 4190 | # | ||||||
| 4191 |      # http://www.netzpolitik.org/feed/ has  | 
||||||
| 4192 | # just "Von: Foo" where Foo is the poster's name. | ||||||
| 4193 | # | ||||||
| 4194 | # my $dummy = $self->DUMMY_EMAIL_ADDRESS; | ||||||
| 4195 | # if ($from =~ /(.*) <\Q$dummy\E>$/ | ||||||
| 4196 | # && $subject eq "Von: $1") { | ||||||
| 4197 | # $subject = $self->{'getting_rss_comments'}; | ||||||
| 4198 | # } | ||||||
| 4199 | # | ||||||
| 4200 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
| 4201 | 0 | $subject = $self->{'getting_rss_comments'}; | |||||
| 4202 | } | ||||||
| 4203 | |||||||
| 4204 | 0 | my $list_post = googlegroups_link_email(@links); | |||||
| 4205 | 0 | 0 | my $precedence = (defined $list_post ? 'list' : undef); | ||||
| 4206 | 0 | my $language = $self->item_to_language($item); | |||||
| 4207 | |||||||
| 4208 |      # RSS  | 
||||||
| 4209 | # http://www.w3.org/TR/REC-PICS-labels | ||||||
| 4210 |      # ENHANCE-ME: Maybe transform  | 
||||||
| 4211 | # PICS too maybe, unless it only applies to the enclosure as such. Maybe | ||||||
| 4212 |      #  | 
||||||
| 4213 | 0 | my $pics_label = collapse_whitespace ($channel->first_child_text('rating')); | |||||
| 4214 | |||||||
| 4215 | # Crib: an undef value for a header means omit that header, which is good | ||||||
| 4216 | # for say the merely optional "Content-Language" | ||||||
| 4217 | # | ||||||
| 4218 | # there can be multiple "feed" links from Atom ... | ||||||
| 4219 | # 'X-RSS-Feed-Link:' => $channel->{'link'}, | ||||||
| 4220 | # | ||||||
| 4221 | 0 | my %headers | |||||
| 4222 | = ('Path:' => scalar ($self->uri_to_host), | ||||||
| 4223 | 'Newsgroups:' => $self->{'nntp_group'}, | ||||||
| 4224 | From => $from, | ||||||
| 4225 | Subject => $subject, | ||||||
| 4226 | Keywords => scalar ($self->item_to_keywords($item)), | ||||||
| 4227 | Date => scalar ($self->item_to_date($item)), | ||||||
| 4228 | 'In-Reply-To:' => scalar ($self->item_to_in_reply_to($item)), | ||||||
| 4229 | References => $self->{'References:'}, | ||||||
| 4230 | 'Message-ID' => $msgid, | ||||||
| 4231 | 'Content-Language:' => $language, | ||||||
| 4232 | 'Importance:' => scalar ($self->item_to_importance($item)), | ||||||
| 4233 | 'Priority:' => scalar ($self->item_to_priority($item)), | ||||||
| 4234 | 'Face:' => scalar ($self->item_to_face($item)), | ||||||
| 4235 | 'List-Post:' => $list_post, | ||||||
| 4236 | 'Precedence:' => $precedence, | ||||||
| 4237 | 'PICS-Label:' => $pics_label, | ||||||
| 4238 | 'X-Copyright:' => scalar ($self->item_to_copyright($item)), | ||||||
| 4239 | 'X-RSS-URL:' => scalar ($self->{'uri'}->as_string), | ||||||
| 4240 | 'X-RSS-Feedburner:' => scalar ($self->item_to_feedburner($item)), | ||||||
| 4241 | 'X-RSS-Generator:' => scalar ($self->item_to_generator($item)), | ||||||
| 4242 | ); | ||||||
| 4243 | |||||||
| 4244 | 0 | my $attach_elt; | |||||
| 4245 | |||||||
| 4246 |      #  | 
||||||
| 4247 |      # a copy of  | 
||||||
| 4248 | # | ||||||
| 4249 |      # ENHANCE-ME:  | 
||||||
| 4250 |      # as well as  | 
||||||
| 4251 | # | ||||||
| 4252 | my $body = ( | ||||||
| 4253 |                  #  | 
||||||
| 4254 |                  #  | 
||||||
| 4255 | $item->first_child('content:encoded') | ||||||
| 4256 | || $item->first_child('description') | ||||||
| 4257 | || $item->first_child('dc:description') | ||||||
| 4258 | || $item->first_child('itunes:summary') | ||||||
| 4259 | 0 | 0 | || do { | ||||
| 4260 |                    # Atom spec is for no more than one  | 
||||||
| 4261 | # Exclude "link", and leave "attach" to code below. | ||||||
| 4262 | my $elt = $item->first_child('content'); | ||||||
| 4263 | my $flavour = atom_content_flavour($elt); | ||||||
| 4264 | ($flavour eq 'link' ? undef | ||||||
| 4265 | : $flavour eq 'attach' ? do { $attach_elt = $elt; undef } | ||||||
| 4266 | : $elt) | ||||||
| 4267 | } | ||||||
| 4268 | || $item->first_child('summary')); # Atom | ||||||
| 4269 | |||||||
| 4270 | 0 | my $body_type = elt_content_type ($body); | |||||
| 4271 | 0 | $self->verbose (3, ' body_type from elt: ', $body_type); | |||||
| 4272 | 0 | my $body_charset = 'utf-8'; | |||||
| 4273 | 0 | my $body_base_url = App::RSS2Leafnode::XML::Twig::Other::elt_xml_base ($body); | |||||
| 4274 | 0 | 0 | if (! defined $body_type) { # no $body element at all | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 4275 | 0 | $body = ''; | |||||
| 4276 | 0 | $body_type = 'text/plain'; | |||||
| 4277 | |||||||
| 4278 | } elsif ($body_type eq 'xhtml') { # Atom | ||||||
| 4279 | 0 | $body = elt_xhtml_to_html ($body); | |||||
| 4280 | 0 | $body_type = 'html'; | |||||
| 4281 | |||||||
| 4282 | } elsif ($body_type eq 'html') { # RSS or Atom | ||||||
| 4283 | 0 | $body = elt_subtext($body); | |||||
| 4284 | |||||||
| 4285 | } elsif ($body_type eq 'text') { # Atom 'text' to be flowed | ||||||
| 4286 | # should be text-only, no sub-elements, but extract sub-elements to | ||||||
| 4287 | # cope with dodgy feeds with improperly escaped html etc | ||||||
| 4288 | 0 | $body = $self->text_wrap (elt_subtext ($body)); | |||||
| 4289 | 0 | $body_type = 'text/plain'; | |||||
| 4290 | } elsif ($body_type =~ m{^text/}) { # Atom mime text type | ||||||
| 4291 | 0 | $body = elt_subtext ($body); | |||||
| 4292 | |||||||
| 4293 | } else { # Atom base64 something | ||||||
| 4294 | 0 | $body = MIME::Base64::decode ($body->text); | |||||
| 4295 | 0 | $body_charset = undef; | |||||
| 4296 | } | ||||||
| 4297 | 0 | 0 | $self->verbose (3, " body: $body_type charset=", | ||||
| 4298 | $body_charset//'undef', "\n", | ||||||
| 4299 | "$body\n"); | ||||||
| 4300 | |||||||
| 4301 | 0 | 0 | my $body_is_html = ($body_type eq 'html'|| $body_type eq 'text/html'); | ||||
| 4302 | 0 | 0 | my $links_want_html = ($body_is_html && ! $self->{'render'}); | ||||
| 4303 | 0 | 0 | $self->verbose (3, " links_want_html: ", | ||||
| 4304 | ($links_want_html ? "yes" : "no")); | ||||||
| 4305 | |||||||
| 4306 | # sort downloadables to the start, then by "priority" | ||||||
| 4307 | 1 | 1 | 1208 | use sort 'stable'; | |||
| 1 | 636 | ||||||
| 1 | 6 | ||||||
| 4308 | 0 | 0 | 0 | @links = sort {($b->{'download'}||0) <=> ($a->{'download'}||0) | |||
| 0 | 0 | ||||||
| 0 | |||||||
| 0 | |||||||
| 4309 | || ($b->{'priority'}||0) <=> ($a->{'priority'}||0)} | ||||||
| 4310 | @links; | ||||||
| 4311 | 0 | 0 | my $links_str = ($links_want_html | ||||
| 4312 | ? links_to_html(@links) | ||||||
| 4313 | : links_to_text(@links)); | ||||||
| 4314 | 0 | $links_str .= $self->item_common_alert_protocol($item, $links_want_html); | |||||
| 4315 | 0 | my @parts; | |||||
| 4316 | |||||||
| 4317 |      #  | 
||||||
| 4318 | { | ||||||
| 4319 | 0 | my $content = join ("\n", | |||||
| 0 | |||||||
| 4320 | 0 | map {$self->media_group_to_html($_)} | |||||
| 4321 | $item->children('media:group')); | ||||||
| 4322 | 0 | 0 | if (is_non_empty($content)) { | ||||
| 4323 | 0 | ($content, my $charset) = html_wrap_fragment ($item, $content); | |||||
| 4324 | 0 | my $content_type = 'text/html'; | |||||
| 4325 | 0 | ($content, $content_type, $charset, my $rendered) | |||||
| 4326 | = $self->render_maybe ($content, $content_type, $charset, | ||||||
| 4327 | $body_base_url); | ||||||
| 4328 | ### media group content: $content | ||||||
| 4329 | 0 | 0 | if ($content_type eq 'text/plain') { | ||||
| 4330 | 0 | $links_str .= $content; | |||||
| 4331 | } else { | ||||||
| 4332 | 0 | $content = Encode::encode ($charset, $content); | |||||
| 4333 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
| 4334 | Type => $content_type, | ||||||
| 4335 | Charset => $charset, | ||||||
| 4336 | Data => $content); | ||||||
| 4337 | } | ||||||
| 4338 | } | ||||||
| 4339 | } | ||||||
| 4340 | |||||||
| 4341 | 0 | 0 | if (is_non_empty(my $content | ||||
| 4342 | = $self->item_unknowns($item, $links_want_html))) { | ||||||
| 4343 | 0 | 0 | my $content_type = ($links_want_html ? 'text/html' : 'text/plain'); | ||||
| 4344 | 0 | 0 | if (@parts) { | ||||
| 4345 | 0 | 0 | my $charset = (is_ascii($content) ? 'us-ascii' : 'utf-8'); | ||||
| 4346 | 0 | $content = Encode::encode ($charset, $content); | |||||
| 4347 | 0 | push @parts, $self->mime_build ({}, # headers | |||||
| 4348 | Type => $content_type, | ||||||
| 4349 | Charset => $charset, | ||||||
| 4350 | Data => $content); | ||||||
| 4351 | } else { | ||||||
| 4352 | 0 | $links_str .= $content; | |||||
| 4353 | } | ||||||
| 4354 | } | ||||||
| 4355 | |||||||
| 4356 | 0 | 0 | if ($self->{'rss_get_links'}) { | ||||
| 4357 | 0 | foreach my $l (@links) { | |||||
| 4358 | 0 | 0 | next if ! $l->{'download'}; | ||||
| 4359 | 0 | my $url = $l->{'uri'}; | |||||
| 4360 | 0 | $self->verbose (1, ' ', __x('link: "{name}" {url}', | |||||
| 4361 | name => $l->{'name'}, | ||||||
| 4362 | url => $url)); | ||||||
| 4363 | 0 | require HTTP::Request; | |||||
| 4364 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
| 4365 | 0 | my $resp = $self->ua->request($req); | |||||
| 4366 | 0 | $resp = $self->aireview_follow ($url, $resp); | |||||
| 4367 | |||||||
| 4368 | 0 | 0 | if (! $resp->is_success) { | ||||
| 4369 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
| 4370 | url => $l->{'uri'}, | ||||||
| 4371 | status => $resp->status_line); | ||||||
| 4372 | 0 | my $msg = __x("Cannot download link {url}\n {status}", | |||||
| 4373 | url => $l->{'uri'}, | ||||||
| 4374 | status => $resp->status_line); | ||||||
| 4375 | 0 | 0 | if ($links_want_html) { | ||||
| 4376 | 0 | $msg = $Entitize{$msg}; | |||||
| 4377 | 0 |              $msg =~ s/\n/ /;  | 
|||||
| 4378 | 0 |              $links_str .= " $msg\n \n"; | 
|||||
| 4379 | } else { | ||||||
| 4380 | 0 | $links_str .= "\n$msg\n"; | |||||
| 4381 | } | ||||||
| 4382 | 0 | next; | |||||
| 4383 | } | ||||||
| 4384 | |||||||
| 4385 | # suspect little value in a description when inlined | ||||||
| 4386 | # 'Content-Description:' => mimewords_non_ascii($l->{'title'}) | ||||||
| 4387 | # favicon used for Face if nothing in the item | ||||||
| 4388 | # | ||||||
| 4389 | 0 | $self->enforce_html_charset_from_content ($resp); | |||||
| 4390 | 0 | 0 | $headers{'Face:'} ||= $self->http_resp_to_face($resp); | ||||
| 4391 | 0 | $self->http_resp_extract_main($resp); | |||||
| 4392 | 0 | push @parts, $self->mime_part_from_response($resp); | |||||
| 4393 | } | ||||||
| 4394 | } | ||||||
| 4395 | 0 | 0 | 0 | if ($links_want_html && $body_type eq 'html') { | |||
| 4396 | # append to html fragment | ||||||
| 4397 | 0 | $body .= $links_str; | |||||
| 4398 | 0 | undef $links_str; | |||||
| 4399 | } | ||||||
| 4400 | |||||||
| 4401 | 0 | 0 | if ($body_type eq 'html') { | ||||
| 4402 | 0 | ($body, $body_charset) = html_wrap_fragment ($item, $body, $language); | |||||
| 4403 | 0 | $body_type = 'text/html'; | |||||
| 4404 | } | ||||||
| 4405 | 0 | 0 | if (defined $body_charset) { | ||||
| 4406 | 0 | $body = Encode::encode ($body_charset, $body); | |||||
| 4407 | } | ||||||
| 4408 | |||||||
| 4409 | 0 | ($body, $body_type, $body_charset) | |||||
| 4410 | = $self->render_maybe ($body, $body_type, $body_charset, $body_base_url); | ||||||
| 4411 | |||||||
| 4412 | 0 | 0 | if ($body_type eq 'text/plain') { | ||||
| 4413 | # remove trailing whitespace from any text | ||||||
| 4414 | 0 | $body =~ s/\s+$//; | |||||
| 4415 | 0 | $body .= "\n"; | |||||
| 4416 | |||||||
| 4417 | 0 | 0 | if (! $links_want_html) { | ||||
| 4418 | # append to text/plain, either atom type=text or rendered html | ||||||
| 4419 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
| 4420 | 0 | $links_str = Encode::encode ($body_charset, $links_str); | |||||
| 4421 | 0 | $body .= "\n$links_str\n"; | |||||
| 4422 | } | ||||||
| 4423 | 0 | undef $links_str; | |||||
| 4424 | } | ||||||
| 4425 | } | ||||||
| 4426 | |||||||
| 4427 | 0 | 0 | unless (is_empty ($links_str)) { | ||||
| 4428 | 0 | my $links_type; | |||||
| 4429 | my $links_charset; | ||||||
| 4430 | 0 | 0 | if ($links_want_html) { | ||||
| 4431 | 0 | $links_type = 'text/html'; | |||||
| 4432 | 0 | ($links_str, $links_charset) = html_wrap_fragment ($item, $links_str); | |||||
| 4433 | } else { | ||||||
| 4434 | 0 | $links_type = 'text/plain'; | |||||
| 4435 | 0 | 0 | $links_charset = (is_ascii($links_str) ? 'us-ascii' : 'utf-8'); | ||||
| 4436 | } | ||||||
| 4437 | 0 | $links_str = Encode::encode ($links_charset, $links_str); | |||||
| 4438 | 0 | unshift @parts, $self->mime_build ({}, | |||||
| 4439 | Type => $links_type, | ||||||
| 4440 | Encoding => $links_charset, | ||||||
| 4441 | Data => $links_str); | ||||||
| 4442 | } | ||||||
| 4443 | |||||||
| 4444 | |||||||
| 4445 | 0 | my $top = $self->mime_build (\%headers, | |||||
| 4446 | Top => 1, | ||||||
| 4447 | Type => $body_type, | ||||||
| 4448 | Charset => $body_charset, | ||||||
| 4449 | Data => $body); | ||||||
| 4450 | |||||||
| 4451 |      # Atom  | 
||||||
| 4452 | 0 | 0 | if ($attach_elt) { | ||||
| 4453 | # ENHANCE-ME: this decodes base64 from the xml and then re-encodes for | ||||||
| 4454 | # the mime, is it possible to pass straight in? | ||||||
| 4455 | 0 | 0 | unshift @parts, $self->mime_build | ||||
| 4456 | ({ 'Content-Location:' => $self->{'uri'}->as_string }, | ||||||
| 4457 | Type => scalar ($attach_elt->att('atom:type') | ||||||
| 4458 | // $attach_elt->att('type')), | ||||||
| 4459 | Encoding => 'base64', | ||||||
| 4460 | Data => MIME::Base64::decode($attach_elt->text)); | ||||||
| 4461 | } | ||||||
| 4462 | |||||||
| 4463 | 0 | $self->verbose (2, 'parts count: ',scalar(@parts)); | |||||
| 4464 | 0 | foreach my $part (@parts) { | |||||
| 4465 | 0 | $top->make_multipart; | |||||
| 4466 | 0 | $top->add_part ($part); | |||||
| 4467 | } | ||||||
| 4468 | |||||||
| 4469 | 0 | mime_entity_lines($top); | |||||
| 4470 | 0 | 0 | $self->nntp_post($top) || return 0; | ||||
| 4471 | 0 | $self->verbose (1, ' ', __('posted')); | |||||
| 4472 | 0 | $new++; | |||||
| 4473 | } | ||||||
| 4474 | |||||||
| 4475 | # ENHANCE-ME: check the replies count to see if more to fetch | ||||||
| 4476 | 0 | 0 | if ($self->{'rss_get_comments'}) { | ||||
| 4477 | 0 | my ($comments_rss_url, $comments_count) | |||||
| 4478 | = $self->item_to_comments_rss($item); | ||||||
| 4479 | ### rss_get_comments: $comments_rss_url, $comments_count | ||||||
| 4480 | 0 | 0 | if (defined $comments_rss_url) { | ||||
| 4481 | |||||||
| 4482 | # ENHANCE-ME: There's also a thr:updated in RFC 4685, but haven't seen | ||||||
| 4483 | # that ever actually used. | ||||||
| 4484 | 0 | my $status = $self->status_geturl ($comments_rss_url); | |||||
| 4485 | 0 | 0 | 0 | if (defined $status->{'comments_count'} | |||
| 0 | |||||||
| 4486 | && defined $comments_count | ||||||
| 4487 | && $status->{'comments_count'} == $comments_count) { | ||||||
| 4488 | 0 | $self->verbose (1, ' ', __x('comments count unchanged: {count}', | |||||
| 4489 | count => $comments_count)); | ||||||
| 4490 | |||||||
| 4491 | } else { | ||||||
| 4492 | 0 | local $self->{'rss_get_links'} = 0; | |||||
| 4493 | 0 | local $self->{'rss_get_comments'} = 0; | |||||
| 4494 | 0 | local $self->{'comments_count'} = $comments_count; | |||||
| 4495 | # "Re:" is not translated, variants of that are very annoying | ||||||
| 4496 | 0 | local $self->{'getting_rss_comments'} = "Re: $subject"; | |||||
| 4497 | 0 | local $self->{'References:'} = $msgid; | |||||
| 4498 | 0 | $new += fetch_rss ($self, $self->{'nntp_group'}, $comments_rss_url); | |||||
| 4499 | } | ||||||
| 4500 | } | ||||||
| 4501 | } | ||||||
| 4502 | 0 | return $new; | |||||
| 4503 | } | ||||||
| 4504 | |||||||
| 4505 | sub item_to_comments_rss { | ||||||
| 4506 | 0 | 0 | 0 | my ($self, $item) = @_; | |||
| 4507 | 0 | my ($url, $url_elt); | |||||
| 4508 | |||||||
| 4509 | # Atom | ||||||
| 4510 | # href='http:/...' /> | ||||||
| 4511 | 0 | foreach my $elt ($item->children('link')) { | |||||
| 4512 | 0 | 0 | my $rel = ($elt->att('rel') | ||||
| 0 | |||||||
| 4513 | // $elt->att('atom:rel') | ||||||
| 4514 | // next); | ||||||
| 4515 | 0 | 0 | $rel eq 'replies' or next; | ||||
| 4516 | 0 | 0 | $self->atom_link_is_rss($elt) or next; | ||||
| 4517 | 0 | 0 | my $href = ($elt->att('href') | ||||
| 4518 | // $elt->att('atom:href')); | ||||||
| 4519 | 0 | 0 | if (is_non_empty ($href)) { | ||||
| 4520 | 0 | $url = $href; | |||||
| 4521 | 0 | $elt = $url_elt; | |||||
| 4522 | } | ||||||
| 4523 | } | ||||||
| 4524 | |||||||
| 4525 |    #  | 
||||||
| 4526 | # it appeared in the spec page as wfw:commentRSS, so ignore case | ||||||
| 4527 | 0 | 0 | if (! defined $url) { | ||||
| 4528 | 0 | my $u = $item->first_child_trimmed_text (qr/^wfw:commentRss$/i); | |||||
| 4529 | 0 | 0 | if (is_non_empty ($u)) { | ||||
| 4530 | 0 | $url = $u; | |||||
| 4531 | } | ||||||
| 4532 | } | ||||||
| 4533 | |||||||
| 4534 | 0 | 0 | return ($url, | ||||
| 4535 | (defined($url) && $self->item_elt_comments_count($item,$url_elt))); | ||||||
| 4536 | } | ||||||
| 4537 | |||||||
| 4538 |  #  | 
||||||
| 4539 | # are which are replies, there's no comments link as such for it to refer | ||||||
| 4540 | # to, it seems | ||||||
| 4541 | sub item_elt_comments_count { | ||||||
| 4542 | 0 | 0 | 0 | my ($self, $item, $elt) = @_; | |||
| 4543 | 0 | 0 | return (($elt && $elt->att('thr:count')) | ||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 4544 | // ($elt && $elt->att('count')) | ||||||
| 4545 | // ($elt && $elt->att('atom:count')) | ||||||
| 4546 | // non_empty ($item->first_child_trimmed_text('thr:total')) | ||||||
| 4547 | // non_empty ($item->first_child_trimmed_text('slash:comments'))); | ||||||
| 4548 | } | ||||||
| 4549 | @known{qw(/channel/item/jf:replyCount | ||||||
| 4550 | )} = (); | ||||||
| 4551 | |||||||
| 4552 | # $group is a string, the name of a local newsgroup | ||||||
| 4553 | # $url is a string, an RSS feed to be read | ||||||
| 4554 | # | ||||||
| 4555 | sub fetch_rss { | ||||||
| 4556 | 0 | 0 | 1 | my ($self, $group, $url, %options) = @_; | |||
| 4557 | 0 | local @{$self}{keys %options} = values %options; # hash slice | |||||
| 0 | |||||||
| 4558 | 0 | $self->verbose (2, "fetch_rss: $group $url"); | |||||
| 4559 | |||||||
| 4560 | 0 | my $group_uri = URI->new($group,'news'); | |||||
| 4561 | 0 | local $self->{'nntp_host'} = uri_to_nntp_host ($group_uri); | |||||
| 4562 | 0 | local $self->{'nntp_group'} = $group = $group_uri->group; | |||||
| 4563 | 0 | 0 | $self->nntp_group_check($group) or return 0; | ||||
| 4564 | |||||||
| 4565 | # an in-memory cookie jar, used only per-RSS feed and then discarded, | ||||||
| 4566 | # which means only kept for fetching for $self->{'rss_get_links'} from a | ||||||
| 4567 | # feed | ||||||
| 4568 | 0 | $self->ua->cookie_jar({}); | |||||
| 4569 | |||||||
| 4570 | 0 | 0 | if (defined $self->{'getting_rss_comments'}) { | ||||
| 4571 | 0 | $self->verbose (1, ' ', __x('rss comments: {url}', url => $url)); | |||||
| 4572 | } else { | ||||||
| 4573 | 0 | $self->verbose (1, __x('feed: {url}', url => $url)); | |||||
| 4574 | } | ||||||
| 4575 | 0 | require HTTP::Request; | |||||
| 4576 | 0 | my $req = HTTP::Request->new (GET => $url); | |||||
| 4577 | 0 | 0 | $self->status_etagmod_req($req,1) || return 0; | ||||
| 4578 | |||||||
| 4579 | # $req->uri can be a URI object or a string | ||||||
| 4580 | 0 | local $self->{'uri'} = URI->new ($req->uri); | |||||
| 4581 | |||||||
| 4582 | 0 | my $resp = $self->ua->request($req); | |||||
| 4583 | 0 | 0 | if ($resp->code == 304) { | ||||
| 4584 | 0 | $self->status_unchanged ($url); | |||||
| 4585 | 0 | return 0; | |||||
| 4586 | } | ||||||
| 4587 | 0 | 0 | if (! $resp->is_success) { | ||||
| 4588 | 0 | print __x("rss2leafnode: {url}\n {status}\n", | |||||
| 4589 | url => $url, | ||||||
| 4590 | status => $resp->status_line); | ||||||
| 4591 | 0 | return 0; | |||||
| 4592 | } | ||||||
| 4593 | 0 | local $self->{'resp'} = $resp; | |||||
| 4594 | |||||||
| 4595 | 0 | $self->verbose (3, "response:", $resp->dump, "\n"); # extra newline | |||||
| 4596 | 0 | 0 | $resp->decode | ||||
| 4597 | or die "Oops, cannot decode Content-Encoding: ", | ||||||
| 4598 | $self->header("Content-Encoding"); | ||||||
| 4599 | |||||||
| 4600 | 0 | my $xml = $resp->content; # raw bytes | |||||
| 4601 | 0 | $xml = $self->enforce_rss_charset_override ($xml); | |||||
| 4602 | |||||||
| 4603 | 0 | my ($twig, $err) = $self->twig_parse($xml); | |||||
| 4604 | 0 | 0 | if (defined $err) { | ||||
| 4605 | 0 | my $message = __x("XML::Twig parse error on\n\n {url}\n\n", | |||||
| 4606 | url => $url); | ||||||
| 4607 | 0 | 0 | if ($resp->request->uri ne $url) { | ||||
| 4608 | 0 | $message .= __x("which redirected to\n\n {url}\n\n", | |||||
| 4609 | url => $resp->request->uri); | ||||||
| 4610 | } | ||||||
| 4611 | 0 | $message .= $err . "\n\n" . __("Raw XML below.\n") . "\n"; | |||||
| 4612 | 0 | $self->error_message | |||||
| 4613 | (__x("Error parsing {url}", url => $url), | ||||||
| 4614 | $message, $xml); | ||||||
| 4615 | # after successful error message to news | ||||||
| 4616 | 0 | $self->status_etagmod_resp ($url, $resp); | |||||
| 4617 | 0 | return 0; | |||||
| 4618 | } | ||||||
| 4619 | 0 | 0 | if ($self->{'verbose'} >= 3) { | ||||
| 4620 | 0 | require Data::Dumper; | |||||
| 4621 | 0 | $self->verbose (3, | |||||
| 4622 | Data::Dumper->new([$twig->root],['root']) | ||||||
| 4623 | ->Indent(1)->Sortkeys(1)->Dump); | ||||||
| 4624 | } | ||||||
| 4625 | |||||||
| 4626 | # "item" for RSS/RDF, "entry" for Atom | ||||||
| 4627 | 0 | my @items = $twig->descendants(qr/^(item|entry)$/); | |||||
| 4628 | |||||||
| 4629 | 0 | @items = $self->rss_newest_only_items(@items); | |||||
| 4630 | |||||||
| 4631 | 0 | my $new = 0; | |||||
| 4632 | 0 | foreach my $item (@items) { | |||||
| 4633 | 0 | $new += $self->fetch_rss_process_one_item ($item); | |||||
| 4634 | } | ||||||
| 4635 | |||||||
| 4636 | 0 | 0 | if ($self->{'verbose'} >= 2) { | ||||
| 4637 | 0 | my $jar = $self->ua->cookie_jar; | |||||
| 4638 | 0 | 0 | 0 | if ($jar && (my $str = $jar->as_string ne '')) { | |||
| 4639 | 0 | $self->verbose (2, "accumulated cookies from this feed:\n", $str); | |||||
| 4640 | } else { | ||||||
| 4641 | 0 | $self->verbose (2, 'no cookies from this feed'); | |||||
| 4642 | } | ||||||
| 4643 | } | ||||||
| 4644 | 0 | $self->ua->cookie_jar (undef); | |||||
| 4645 | |||||||
| 4646 | 0 | $self->status_etagmod_resp ($url, $resp, $twig); | |||||
| 4647 | 0 | say __xn('{group}: {count} new article', | |||||
| 4648 | '{group}: {count} new articles', | ||||||
| 4649 | $new, | ||||||
| 4650 | group => $group, | ||||||
| 4651 | count => $new); | ||||||
| 4652 | |||||||
| 4653 | 0 | return $new; | |||||
| 4654 | } | ||||||
| 4655 | |||||||
| 4656 | 1; | ||||||
| 4657 | __END__ |