File Coverage

blib/lib/WebFetch.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # WebFetch - infrastructure for downloading ("fetching") information from
2             # various sources around the Internet or the local system in order to
3             # present them for display, or to export local information to other sites
4             # on the Internet
5             #
6             # Copyright (c) 1998-2009 Ian Kluft. This program is free software; you can
7             # redistribute it and/or modify it under the terms of the GNU General Public
8             # License Version 3. See http://www.webfetch.org/GPLv3.txt
9              
10             package WebFetch;
11              
12             =head1 NAME
13              
14             WebFetch - Perl module to download and save information from the Web
15              
16             =head1 SYNOPSIS
17              
18             use WebFetch;
19              
20             =head1 DESCRIPTION
21              
22             The WebFetch module is a framework for downloading and saving
23             information from the web, and for saving or re-displaying it.
24             It provides a generalized interface for saving to a file
25             while keeping the previous version as a backup.
26             This is mainly intended for use in a cron-job to acquire
27             periodically-updated information.
28              
29             WebFetch allows the user to specify a source and destination, and
30             the input and output formats. It is possible to write new Perl modules
31             to the WebFetch API in order to add more input and output formats.
32              
33             The currently-provided input formats are Atom, RSS, WebFetch "SiteNews" files
34             and raw Perl data structures.
35              
36             The currently-provided output formats are RSS, WebFetch "SiteNews" files,
37             the Perl Template Toolkit, and export into a TWiki site.
38              
39             Some modules which were specific to pre-RSS/Atom web syndication formats
40             have been deprecated. Those modules can be found in the CPAN archive
41             in WebFetch 0.10. Those modules are no longer compatible with changes
42             in the current WebFetch API.
43              
44             =head1 INSTALLATION
45              
46             After unpacking and the module sources from the tar file, run
47              
48             C
49              
50             C
51              
52             C
53              
54             Or from a CPAN shell you can simply type "C"
55             and it will download, build and install it for you.
56              
57             If you need help setting up a separate area to install the modules
58             (i.e. if you don't have write permission where perl keeps its modules)
59             then see the Perl FAQ.
60              
61             To begin using the WebFetch modules, you will need to test your
62             fetch operations manually, put them into a crontab, and then
63             use server-side include (SSI) or a similar server configuration to
64             include the files in a live web page.
65              
66             =head2 MANUALLY TESTING A FETCH OPERATION
67              
68             Select a directory which will be the storage area for files created
69             by WebFetch. This is an important administrative decision -
70             keep the volatile automatically-generated files in their own directory
71             so they'll be separated from manually-maintained files.
72              
73             Choose the specific WebFetch-derived modules that do the work you want.
74             See their particular manual/web pages for details on command-line arguments.
75             Test run them first before committing to a crontab.
76              
77             =head2 SETTING UP CRONTAB ENTRIES
78              
79             If needed, see the manual pages for crontab(1), crontab(5) and any
80             web sites or books on Unix system administration.
81              
82             Since WebFetch command lines are usually very long, the user may prefer
83             to make one or more scripts as front-ends so crontab entries aren't so big.
84              
85             Try not to run crontab entries too often - be aware if the site you're
86             accessing has any resource constraints, and how often their information
87             gets updated. If they request users not to access a feed more often
88             than a certain interval, respect it. (It isn't hard to find violators
89             in server logs.) If in doubt, try every 30 minutes until more information
90             becomes available.
91              
92             =head1 WebFetch FUNCTIONS
93              
94             The following function definitions assume B> is a blessed
95             reference to a module that is derived from (inherits from) WebFetch.
96              
97             =over 4
98              
99             =cut
100              
101 1     1   32341 use strict;
  1         2  
  1         40  
102              
103 1     1   1399 use Getopt::Long;
  1         27294  
  1         8  
104 1     1   1318 use LWP::UserAgent;
  1         53774  
  1         35  
105 1     1   11 use HTTP::Request;
  1         2  
  1         26  
106 1     1   914 use Date::Calc;
  1         45404  
  1         228  
107              
108             # define exceptions/errors
109             use Exception::Class (
110 0           'WebFetch::Exception',
111             'WebFetch::TracedException' => {
112             isa => 'WebFetch::Exception',
113             },
114              
115             'WebFetch::Exception::DataWrongType' => {
116             isa => 'WebFetch::TracedException',
117             alias => 'throw_data_wrongtype',
118             description => "provided data must be a WebFetch::Data::Store",
119             },
120              
121             'WebFetch::Exception::GetoptError' => {
122             isa => 'WebFetch::Exception',
123             alias => 'throw_getopt_error',
124             description => "software error during command line processing",
125             },
126              
127             'WebFetch::Exception::Usage' => {
128             isa => 'WebFetch::Exception',
129             alias => 'throw_cli_usage',
130             description => "command line processing failed",
131             },
132              
133             'WebFetch::Exception::Save' => {
134             isa => 'WebFetch::Exception',
135             alias => 'throw_save_error',
136             description => "an error occurred while saving the data",
137             },
138              
139             'WebFetch::Exception::NoSave' => {
140             isa => 'WebFetch::Exception',
141             alias => 'throw_no_save',
142             description => "unable to save: no data or nowhere to save it",
143             },
144              
145             'WebFetch::Exception::NoHandler' => {
146             isa => 'WebFetch::Exception',
147             alias => 'throw_no_handler',
148             description => "no handler was found",
149             },
150              
151             'WebFetch::Exception::MustOverride' => {
152             isa => 'WebFetch::TracedException',
153             alias => 'throw_abstract',
154             description => "A WebFetch function was called which is "
155             ."supposed to be overridden by a subclass",
156             },
157              
158             'WebFetch::Exception::NetworkGet' => {
159             isa => 'WebFetch::Exception',
160             description => "Failed to access RSS feed",
161             },
162              
163             'WebFetch::Exception::ModLoadFailure' => {
164             isa => 'WebFetch::Exception',
165             alias => 'throw_mod_load_failure',
166             description => "failed to load a WebFetch Perl module",
167             },
168              
169             'WebFetch::Exception::ModRunFailure' => {
170             isa => 'WebFetch::Exception',
171             alias => 'throw_mod_run_failure',
172             description => "failed to run a WebFetch module",
173             },
174              
175             'WebFetch::Exception::ModNoRunModule' => {
176             isa => 'WebFetch::Exception',
177             alias => 'throw_no_run',
178             description => "no module was found to run the request",
179             },
180              
181             'WebFetch::Exception::AutoloadFailure' => {
182             isa => 'WebFetch::TracedException',
183             alias => 'throw_autoload_fail',
184             description => "AUTOLOAD failed to handle function call",
185             },
186              
187 1     1   660 );
  0            
188              
189             # initialize class variables
190             our $VERSION = '0.13';
191             our %default_modules = (
192             "input" => {
193             "rss" => "WebFetch::Input::RSS",
194             "sitenews" => "WebFetch::Input::SiteNews",
195             "perlstruct" => "WebFetch::Input::PerlStruct",
196             "atom" => "WebFetch::Input::Atom",
197             "dump" => "WebFetch::Input::Dump",
198             },
199             "output" => {
200             "rss" => "WebFetch::Output:RSS",
201             "atom" => "WebFetch::Output:Atom",
202             "tt" => "WebFetch::Output:TT",
203             "perlstruct" => "WebFetch::Output::PerlStruct",
204             "dump" => "WebFetch::Output::Dump",
205             }
206             );
207             our %modules;
208             our $AUTOLOAD;
209             my $debug;
210              
211             sub debug
212             {
213             $debug and print STDERR "debug: ".join( " ", @_ )."\n";
214             }
215              
216             =item WebFetch::module_register( $module, @capabilities );
217              
218             This function allows a Perl module to register itself with the WebFetch API
219             as able to perform various capabilities.
220              
221             For subclasses of WebFetch, it can be called as a class method.
222             C<__PACKAGE__->module_register( @capabilities );>
223              
224             For the $module parameter, the Perl module should provide its own
225             name, usually via the __PACKAGE__ string.
226              
227             The @capabilities array is any number of strings as needed to list the
228             capabilities which the module performs for the WebFetch API.
229             The currently-recognized capabilities are "cmdline", "input" and "output".
230             "config", "filter", "save" and "storage" are reserved for future use. The
231             function will save all the capability names that the module provides, without
232             checking whether any code will use it.
233              
234             For example, the WebFetch::Output::TT module registers itself like this:
235             C<__PACKAGE__->module_register( "cmdline", "output:tt" );>
236             meaning that it defines additional command-line options, and it provides an
237             output format handler for the "tt" format, the Perl Template Toolkit.
238              
239             =cut
240              
241             sub module_register
242             {
243             my $module = shift;
244             my @capabilities = @_;
245              
246             # each string provided is a capability the module provides
247             foreach my $capability ( @capabilities ) {
248             # A ":" if present delimits a group of capabilities
249             # such as "input:rss" for and "input" capability of "rss"
250             if ( $capability =~ /([^:]+):([^:]+)/ ) {
251             # A ":" was found so process a 2nd-level group entry
252             my $group = $1;
253             my $subcap = $2;
254             if ( !exists $modules{$group}) {
255             $modules{$group} = {};
256             }
257             if ( !exists $modules{$group}{$subcap}) {
258             $modules{$group}{$subcap} = [];
259             }
260             push @{$modules{$group}{$subcap}}, $module;
261             } else {
262             # just a simple capbility name so store it
263             if ( !exists $modules{$capability}) {
264             $modules{$capability} = [];
265             }
266             push @{$modules{$capability}}, $module;
267             }
268             }
269             }
270              
271             # module selection - choose WebFetch module based on selected file format
272             # for WebFetch internal use only
273             sub module_select
274             {
275             my $capability = shift;
276             my $is_optional = shift;
277              
278             debug "module_select($capability,$is_optional)";
279             # parse the capability string
280             my ( $group, $topic );
281             if ( $capability =~ /([^:]*):(.*)/ ) {
282             $group = $1;
283             $topic = $2
284             } else {
285             $topic = $capability;
286             }
287            
288             # check for modules to handle the specified source_format
289             my ( @handlers, %handlers, $handler );
290              
291             # consider whether a group is in use (single or double-level scan)
292             if ( $group ) {
293             # double-level scan
294              
295             # if the group exists, search in it
296             if (( exists $modules{$group}{$topic} )
297             and ( ref $modules{$group}{$topic} eq "ARRAY" ))
298             {
299             # search group for topic
300             foreach $handler (@{$modules{$group}{$topic}})
301             {
302             if ( !exists $handlers{$handler}) {
303             push @handlers, $handler;
304             $handlers{$handler} = 1;
305             }
306             }
307              
308             # otherwise check the defaults
309             } elsif ( exists $default_modules{$group}{$topic} ) {
310             # check default handlers
311             $handler = $default_modules{$group}{$topic};
312             if ( !exists $handlers{$handler}) {
313             push @handlers, $handler;
314             $handlers{$handler} = 1;
315             }
316             }
317             } else {
318             # single-level scan
319              
320             # if the topic exists, the search is a success
321             if (( exists $modules{$topic})
322             and ( ref $modules{$topic} eq "ARRAY" ))
323             {
324             @handlers = @{$modules{$topic}};
325             }
326             }
327            
328             # check if any handlers were found for this format
329             if ( ! @handlers and ! $is_optional ) {
330             throw_no_handler( "handler not found for $capability" );
331             }
332              
333             debug "module_select: ".join( " ", @handlers );
334             return @handlers;
335             }
336              
337             # satisfy POD coverage test - but don't put this function in the user manual
338             =pod
339             =cut
340              
341             # if no input or output format was specified, but only 1 is registered, pick it
342             # $group parameter should be config group to search, i.e. "input" or "output"
343             # returns the format string which will be provided
344             sub singular_handler
345             {
346             my $group = shift;
347              
348             debug "singular_handler($group)";
349             my $count = 0;
350             my ( $entry, $last );
351             foreach $entry ( keys %{$modules{$group}} ) {
352             if ( ref $modules{$group}{$entry} eq "ARRAY" ) {
353             my $entry_count = scalar @{$modules{$group}{$entry}};
354             $count += $entry_count;
355             if ( $count > 1 ) {
356             return undef;
357             }
358             if ( $entry_count == 1 ) {
359             $last = $entry;
360             }
361             }
362             }
363              
364             # if there's only one registered, that's the one to use
365             debug "singular_handler: count=$count last=$last";
366             return $count == 1 ? $last : undef;
367             }
368              
369              
370             =item fetch_main
371              
372             This function is exported into the main package.
373             For all modules which registered with an "input" capability for the requested
374             file format at the time this is called, it will call the run() function on
375             behalf of each of the packages.
376              
377             =cut
378              
379             # Find and run all the fetch_main functions in packages under WebFetch.
380             # This eliminates the need for the sub-packages to export their own
381             # fetch_main(), which users found conflicted with each other when
382             # loading more than one WebFetch-derived module.
383              
384             # fetch_main - eval wrapper for fetch_main2 to catch and display errors
385             sub main::fetch_main
386             {
387             # run fetch_main2 in an eval so we can catch exceptions
388             my $result = eval { &WebFetch::fetch_main2; };
389              
390             # process any error/exception that we may have gotten
391             if ( $@ ) {
392             my $ex = $@;
393              
394             # determine if there's an error message available to display
395             my $pkg = __PACKAGE__;
396             if ( ref $ex ) {
397             if ( my $ex_cap = Exception::Class->caught(
398             "WebFetch::Exception"))
399             {
400             if ( $ex_cap->isa( "WebFetch::TracedException" )) {
401             warn $ex_cap->trace->as_string, "\n";
402             }
403              
404             die "$pkg: ".$ex_cap->error."\n";
405             }
406             if ( $ex->can("stringify")) {
407             # Error.pm, possibly others
408             die "$pkg: ".$ex->stringify."\n";
409             } elsif ( $ex->can("as_string")) {
410             # generic - should work for many classes
411             die "$pkg: ".$ex->as_string."\n";
412             } else {
413             die "$pkg: unknown exception of type "
414             .(ref $ex)."\n";
415             }
416             } else {
417             die "pkg: $@\n";
418             }
419             }
420              
421             # success
422             exit 0;
423             }
424              
425              
426             sub fetch_main2
427             {
428             # search for modules which have registered "cmdline" capability
429             # collect their command line options
430             my ( $cli_mod, @mod_options, @mod_usage );
431             if (( exists $modules{cmdline} )
432             and ( ref $modules{cmdline} eq "ARRAY" ))
433             {
434             foreach $cli_mod ( @{$modules{cmdline}}) {
435             if ( eval "defined \@{".$cli_mod."::Options}" ) {
436             eval "push \@mod_options,"
437             ."\@{".$cli_mod."::Options}";
438             }
439             if ( eval "defined \@{".$cli_mod."::Usage}" ) {
440             eval "push \@mod_options, \@{"
441             .$cli_mod."::Usage}";
442             }
443             }
444             }
445              
446             # process command line
447             my ( $result, %options );
448             $result = eval { GetOptions ( \%options,
449             "dir:s",
450             "group:s",
451             "mode:s",
452             "source=s",
453             "source_format:s",
454             "dest=s",
455             "dest_format:s",
456             "fetch_urls",
457             "quiet",
458             "debug",
459             @mod_options ) };
460             if ( $@ ) {
461             throw_getopt_error ( "command line processing failed: $@" );
462             } elsif ( ! $result ) {
463             throw_cli_usage ( "usage: $0 --dir dirpath "
464             ."[--group group] [--mode mode] "
465             ."[--source file] [--source_format fmt-string] "
466             ."[--dest file] [--dest_format fmt-string] "
467             ."[--fetch_urls] [--quiet] "
468             .join( " ", @mod_usage ));
469             }
470              
471             # set debugging mode
472             if (( exists $options{debug}) and $options{debug}) {
473             $debug = 1;
474             }
475             debug "fetch_main";
476              
477              
478             # if either source/input or dest/output formats were not provided,
479             # check if only one handler is registered - if so that's the default
480             if ( !exists $options{source_format}) {
481             if ( my $fmt = singular_handler( "input" )) {
482             $options{source_format} = $fmt;
483             }
484             }
485             if ( !exists $options{dest_format}) {
486             if ( my $fmt = singular_handler( "output" )) {
487             $options{dest_format} = $fmt;
488             }
489             }
490              
491             # check for modules to handle the specified source_format
492             my ( @handlers, %handlers );
493             if (( exists $modules{input}{ $options{source_format}} )
494             and ( ref $modules{input}{ $options{source_format}}
495             eq "ARRAY" ))
496             {
497             my $handler;
498             foreach $handler (@{$modules{input}{$options{source_format}}})
499             {
500             if ( !exists $handlers{$handler}) {
501             push @handlers, $handler;
502             $handlers{$handler} = 1;
503             }
504             }
505             }
506             if ( exists $default_modules{ $options{source_format}} ) {
507             my $handler = $default_modules{ $options{source_format}};
508             if ( !exists $handlers{$handler}) {
509             push @handlers, $handler;
510             $handlers{$handler} = 1;
511             }
512             }
513            
514             # check if any handlers were found for this input format
515             if ( ! @handlers ) {
516             throw_no_handler( "input handler not found for "
517             .$options{source_format});
518             }
519              
520             # run the available handlers until one succeeds or none are left
521             my $pkgname;
522             my $run_count = 0;
523             foreach $pkgname ( @handlers ) {
524             debug "running for $pkgname";
525             eval { &WebFetch::run( $pkgname, \%options )};
526             if ( $@ ) {
527             print STDERR "WebFetch: run eval error: $@\n";
528             } else {
529             $run_count++;
530             last;
531             }
532             }
533             if ( $run_count == 0 ) {
534             throw_no_run( "no handlers were able or available to process "
535             ." source format" );
536             }
537             }
538              
539             =item $obj = WebFetch::new( param => "value", [...] )
540              
541             Generally, the new function should be inherited and used from a derived
542             class. However, WebFetch provides an AUTOLOAD function which will catch
543             wayward function calls from a subclass, and redirect it to the appropriate
544             function in the calling class, if it exists.
545              
546             The AUTOLOAD feature is needed because, for example, when an object is
547             instantiated in a WebFetch::Input::* class, it will later be passed to
548             a WebFetch::Output::* class, whose data method functions can be accessed
549             this way as if the WebFetch object had become a member of that class.
550              
551             =cut
552              
553             # allocate a new object
554             sub new
555             {
556             my $class = shift;
557             my $self = {};
558             bless $self, $class;
559              
560             # initialize the object parameters
561             $self->init(@_);
562              
563             # go fetch the data
564             # this function must be provided by a derived module
565             # non-fetching modules (i.e. data) must define $self->{no_fetch}=1
566             if (( ! exists $self->{no_fetch}) or ! $self->{no_fetch}) {
567             require WebFetch::Data::Store;
568             if ( exists $self->{data}) {
569             $self->{data}->isa( "WebFetch::Data::Store" )
570             or throw_data_wrongtype "object data must be "
571             ."a WebFetch::Data::Store";
572             } else {
573             $self->{data} = WebFetch::Data::Store->new();
574             }
575             $self->fetch();
576             }
577              
578             # the object has been created
579             return $self;
580             }
581              
582             =item $obj->init( ... )
583              
584             This is called from the C function that modules inherit from WebFetch.
585             If subclasses override it, they should still call it before completion.
586             It takes "name" => "value" pairs which are all placed verbatim as
587             attributes in C<$obj>.
588              
589             =cut
590              
591             # initialize attributes of new objects
592             sub init
593             {
594             my $self = shift;
595             if ( @_ ) {
596             my %params = @_;
597             @$self{keys %params} = values %params;
598             }
599             }
600              
601             =item WebFetch::mod_load ( $class )
602              
603             This specifies a WebFetch module (Perl class) which needs to be loaded.
604             In case of an error, it throws an exception.
605              
606             =cut
607              
608             sub mod_load
609             {
610             my $pkg = shift;
611              
612             # make sure we have the run package loaded
613             eval "require $pkg";
614             if ( $@ ) {
615             throw_mod_load_failure( "failed to load $pkg: $@" );
616             }
617             }
618              
619             =item WebFetch::run
620              
621             This function can be called by the C function
622             provided by WebFetch or by another user function.
623             This handles command-line processing for some standard options,
624             calling the module-specific fetch function and WebFetch's $obj->save
625             function to save the contents to one or more files.
626              
627             The command-line processing for some standard options are as follows:
628              
629             =over 4
630              
631             =item --dir I
632              
633             (required) the directory in which to write output files
634              
635             =item --group I
636              
637             (optional) the group ID to set the output file(s) to
638              
639             =item --mode I
640              
641             (optional) the file mode (permissions) to set the output file(s) to
642              
643             =item --save_file I
644              
645             (optional) save a copy of the fetched info
646             in the file named by this parameter.
647             The contents of the file are determined by the C<--dest_format> parameter.
648             If C<--dest_format> isn't defined but only one module has registered a
649             file format for saving, then that will be used by default.
650              
651             =item --quiet
652              
653             (optional) suppress printed warnings for HTTP errors
654             I<(applies only to modules which use the WebFetch::get() function)>
655             in case they are not desired for cron outputs
656              
657             =item --debug
658              
659             (optional) print verbose debugging outputs,
660             only useful for developers adding new WebFetch-based modules
661             or finding/reporting a bug in an existing module
662              
663             =back
664              
665             Modules derived from WebFetch may add their own command-line options
666             that WebFetch::run() will use by defining a variable called
667             B> in the calling module,
668             using the name/value pairs defined in Perl's Getopts::Long module.
669             Derived modules can also add to the command-line usage error message by
670             defining a variable called B> with a string of the additional
671             parameters, as they should appear in the usage message.
672              
673             =cut
674              
675             # command-line handling for WebFetch-derived classes
676             sub run
677             {
678             my $run_pkg = shift;
679             my $options_ref = shift;
680             my $obj;
681              
682             debug "entered run for $run_pkg";
683              
684             # make sure we have the run package loaded
685             mod_load $run_pkg;
686              
687             # Note: in order to add WebFetch-embedding capability, the fetch
688             # routine saves its raw data without any HTML/XML/etc formatting
689             # in @{$obj->{data}} and data-to-savable conversion routines in
690             # %{$obj->{actions}}, which contains several structures with key
691             # names matching software processing features. The purpose of
692             # this is to externalize the captured data so other software can
693             # use it too.
694              
695             # create the new object
696             # this also calls the $obj->fetch() routine for the module which
697             # has inherited from WebFetch to do this
698             debug "run before new";
699             $obj = eval $run_pkg."->new( \%\$options_ref )";
700             if ( $@ ) {
701             throw_mod_run_failure( "module run failure: ".$@ );
702             }
703              
704             # if the object had data for the WebFetch-embedding API,
705             # then data processing is external to the fetch routine
706             # (This externalizes the data for other software to capture it.)
707             debug "run before output";
708             my $dest_format = $obj->{dest_format};
709             if ( !exists $obj->{actions}) {
710             $obj->{actions} = {};
711             }
712             if (( exists $obj->{data})) {
713             if ( exists $obj->{dest}) {
714             if ( !exists $obj->{actions}{$dest_format}) {
715             $obj->{actions}{$dest_format} = [];
716             }
717             push @{$obj->{actions}{$dest_format}}, [ $obj->{dest} ];
718             }
719              
720             # perform requested actions on the data
721             $obj->do_actions();
722             } else {
723             throw_no_save( "save failed: no data or nowhere to save it" );
724             }
725              
726             debug "run before save";
727             my $result = $obj->save();
728              
729             # check for errors, throw exception to report errors per savable item
730             if ( ! $result ) {
731             my $savable;
732             my @errors;
733             foreach $savable ( @{$obj->{savable}}) {
734             (ref $savable eq "HASH") or next;
735             if ( exists $savable->{error}) {
736             push @errors, "file: ".$savable->{file}
737             ."error: " .$savable->{error};
738             }
739             }
740             if ( @errors ) {
741             throw_save_error( "error saving results in "
742             .$obj->{dir}
743             ."\n".join( "\n", @errors )."\n" );
744             }
745             }
746              
747             return $result ? 0 : 1;
748             }
749              
750             =item $obj->do_actions
751              
752             I was added in WebFetch 0.10 as part of the
753             WebFetch Embedding API.>
754             Upon entry to this function, $obj must contain the following attributes:
755              
756             =over 4
757              
758             =item data
759              
760             is a reference to a hash containing the following three (required)
761             keys:
762              
763             =over 4
764              
765             =item fields
766              
767             is a reference to an array containing the names of the fetched data fields
768             in the order they appear in the records of the I array.
769             This is necessary to define what each field is called
770             because any kind of data can be fetched from the web.
771              
772             =item wk_names
773              
774             is a reference to a hash which maps from
775             a key string with a "well-known" (to WebFetch) field type
776             to a field name used in this table.
777             The well-known names are defined as follows:
778              
779             =over 4
780              
781             =item title
782              
783             a one-liner banner or title text
784             (plain text, no HTML tags)
785              
786             =item url
787              
788             URL or file path (as appropriate) to the news source
789              
790             =item id
791              
792             unique identifier string for the entry
793              
794             =item date
795              
796             a date stamp,
797             which must be program-readable
798             by Perl's Date::Calc module in the Parse_Date() function
799             in order to support timestamp-related comparisons
800             and processing that some users have requested.
801             If the date cannot be parsed by Date::Calc,
802             either translate it when your module captures it,
803             or do not define this "well-known" field
804             because it wouldn't fit the definition.
805             (plain text, no HTML tags)
806              
807             =item summary
808              
809             a paragraph of summary text in HTML
810              
811             =item comments
812              
813             number of comments/replies at the news site
814             (plain text, no HTML tags)
815              
816             =item author
817              
818             a name, handle or login name representing the author of the news item
819             (plain text, no HTML tags)
820              
821             =item category
822              
823             a word or short phrase representing the category, topic or department
824             of the news item
825             (plain text, no HTML tags)
826              
827             =item location
828              
829             a location associated with the news item
830             (plain text, no HTML tags)
831              
832             =back
833              
834             The field names for this table are defined in the I array.
835              
836             The hash only maps for the fields available in the table.
837             If no field representing a given well-known name is present
838             in the data fields,
839             that well-known name key must not be defined in this hash.
840              
841             =item records
842              
843             an array containing the data records.
844             Each record is itself a reference to an array of strings which are
845             the data fields.
846             This is effectively a two-dimensional array or a table.
847              
848             Only one table-type set of data is permitted per fetch operation.
849             If more are needed, they should be arranged as separate fetches
850             with different parameters.
851              
852             =back
853              
854             =item actions
855              
856             is a reference to a hash.
857             The hash keys are names for handler functions.
858             The WebFetch core provides internal handler functions called
859             I (for HTML output),
860             I (for XML output),
861             I (for WebFetch::General format),
862             However, WebFetch modules may provide additional
863             format handler functions of their own by prepending
864             "fmt_handler_" to the key string used in the I array.
865              
866             The values are array references containing
867             I<"action specs">,
868             which are themselves arrays of parameters
869             that will be passed to the handler functions
870             for generating output in a specific format.
871             There may be more than one entry for a given format if multiple outputs
872             with different parameters are needed.
873              
874             The presence of values in this field mean that output is to be
875             generated in the specified format.
876             The presence of these would have been chosed by the WebFetch module that
877             created them - possibly by default settings or by a command-line argument
878             that directed a specific output format to be used.
879              
880             For each valid action spec,
881             a separate "savable" (contents to be placed in a file)
882             will be generated from the contents of the I variable.
883              
884             The valid (but all optional) keys are
885              
886             =over 4
887              
888             =item html
889              
890             the value must be a reference to an array which specifies all the
891             HTML generation (html_gen) operations that will take place upon the data.
892             Each entry in the array is itself an array reference,
893             containing the following parameters for a call to html_gen():
894              
895             =over 4
896              
897             =item filename
898              
899             a file name or path string
900             (relative to the WebFetch output directory unless a full path is given)
901             for output of HTML text.
902              
903             =item params
904              
905             a hash reference containing optional name/value parameters for the
906             HTML format handler.
907              
908             =over 4
909              
910             =item filter_func
911              
912             (optional)
913             a reference to code that, given a reference to an entry in
914             @{$self->{data}{records}},
915             returns true (1) or false (0) for whether it will be included in the
916             HTML output.
917             By default, all records are included.
918              
919             =item sort_func
920              
921             (optional)
922             a reference to code that, given references to two entries in
923             @{$self->{data}{records}},
924             returns the sort comparison value for the order they should be in.
925             By default, no sorting is done and all records (subject to filtering)
926             are accepted in order.
927              
928             =item format_func
929              
930             (optional)
931             a refernce to code that, given a reference to an entry in
932             @{$self->{data}{records}},
933             stores a savable representation of the string.
934              
935             =back
936              
937             =back
938              
939             =back
940              
941             =back
942              
943             Additional valid keys may be created by modules that inherit from WebFetch
944             by supplying a method/function named with "fmt_handler_" preceding the
945             string used for the key.
946             For example, for an "xyz" format, the handler function would be
947             I.
948             The value (the "action spec") of the hash entry
949             must be an array reference.
950             Within that array are "action spec entries",
951             each of which is a reference to an array containing the list of
952             parameters that will be passed verbatim to the I function.
953              
954             When the format handler function returns, it is expected to have
955             created entries in the $obj->{savables} array
956             (even if they only contain error messages explaining a failure),
957             which will be used by $obj->save() to save the files and print the
958             error messages.
959              
960             For coding examples, use the I functions in WebFetch.pm itself.
961              
962             =back
963              
964             =cut
965              
966             sub do_actions
967             {
968             my ( $self ) = @_;
969             debug "in WebFetch::do_actions";
970              
971             # we *really* need the data and actions to be set!
972             # otherwise assume we're in WebFetch 0.09 compatibility mode and
973             # $self->fetch() better have created its own savables already
974             if (( !exists $self->{data}) or ( !exists $self->{actions})) {
975              
976             return
977             }
978              
979             # loop through all the actions
980             my $action_spec;
981             foreach $action_spec ( keys %{$self->{actions}} ) {
982             my $handler_ref;
983              
984             # check for modules to handle the specified dest_format
985             my ( @handlers, %handlers );
986             my $action_handler = "fmt_handler_".$action_spec;
987             if ( exists $modules{output}{$action_spec}) {
988             my $class;
989             foreach $class ( @{$modules{output}{$action_spec}}) {
990             if ( $class->can( $action_handler )) {
991             $handler_ref = \&{$class."::".$action_handler};
992             last;
993             }
994             }
995             }
996              
997             if ( defined $handler_ref )
998             {
999             # loop through action spec entries (parameter lists)
1000             my $entry;
1001             foreach $entry ( @{$self->{actions}{$action_spec}}) {
1002             # parameters must be in an ARRAY ref
1003             if (ref $entry ne "ARRAY" ) {
1004             warn "warning: entry in action spec "
1005             ."\"".$action_spec."\""
1006             ."expected to be ARRAY, found "
1007             .(ref $entry)." instead "
1008             ."- ignored\n";
1009             next;
1010             }
1011              
1012             # everything looks OK - call the handler
1013             &$handler_ref($self, @$entry);
1014              
1015             # if there were errors, the handler should
1016             # have created a savable entry which
1017             # contains only the error entry so that
1018             # it will be reported by $self->save()
1019             }
1020             } else {
1021             warn "warning: action \"$action_spec\" specified but "
1022             ."\&{\$self->$action_handler}() "
1023             ."not defined in "
1024             .(ref $self)." - ignored\n";
1025             }
1026             }
1027             }
1028              
1029             =item $obj->fetch
1030              
1031             B
1032             fetch operaton specific to that module.>
1033             It will be called from C so you should not call it directly.
1034             Your fetch function should extract some data from somewhere
1035             and place of it in HTML or other meaningful form in the "savable" array.
1036              
1037             TODO: cleanup references to WebFetch 0.09 and 0.10 APIs.
1038              
1039             Upon entry to this function, $obj must contain the following attributes:
1040              
1041             =over 4
1042              
1043             =item dir
1044              
1045             The name of the directory to save in.
1046             (If called from the command-line, this will already have been provided
1047             by the required C<--dir> parameter.)
1048              
1049             =item savable
1050              
1051             a reference to an array where the "savable" items will be placed by
1052             the $obj->fetch function.
1053             (You only need to provide an array reference -
1054             other WebFetch functions can write to it.)
1055              
1056             In WebFetch 0.10 and later,
1057             this parameter should no longer be supplied by the I function
1058             (unless you wish to use 0.09 backward compatibility)
1059             because it is filled in by the I
1060             after the I function is completed
1061             based on the I and I variables
1062             that are set in the I function.
1063             (See below.)
1064              
1065             Each entry of the savable array is a hash reference with the following
1066             attributes:
1067              
1068             =over 4
1069              
1070             =item file
1071              
1072             file name to save in
1073              
1074             =item content
1075              
1076             scalar w/ entire text or raw content to write to the file
1077              
1078             =item group
1079              
1080             (optional) group setting to apply to file
1081              
1082             =item mode
1083              
1084             (optional) file permissions to apply to file
1085              
1086             =back
1087              
1088             Contents of savable items may be generated directly by derived modules
1089             or with WebFetch's C, C or C
1090             functions.
1091             These functions will set the group and mode parameters from the
1092             object's own settings, which in turn could have originated from
1093             the WebFetch command-line if this was called that way.
1094              
1095             =back
1096              
1097             Note that the fetch functions requirements changed in WebFetch 0.10.
1098             The old requirement (0.09 and earlier) is supported for backward compatibility.
1099              
1100             I,
1101             upon exit from this function, the $obj->savable array must contain
1102             one entry for each file to be saved.
1103             More than one array entry means more than one file to save.
1104             The WebFetch infrastructure will save them, retaining backup copies
1105             and setting file modes as needed.
1106              
1107             I, the "WebFetch embedding" capability was introduced.
1108             In order to do this, the captured data of the I function
1109             had to be externalized where other Perl routines could access it.
1110             So the fetch function now only populates data structures
1111             (including code references necessary to process the data.)
1112              
1113             Upon exit from the function,
1114             the following variables must be set in C<$obj>:
1115              
1116             =over 4
1117              
1118             =item data
1119              
1120             is a reference to a hash which will be used by the I function.
1121             (See above.)
1122              
1123             =item actions
1124              
1125             is a reference to a hash which will be used by the I function.
1126             (See above.)
1127              
1128             =back
1129              
1130             =cut
1131              
1132             # placeholder for fetch routines by derived classes
1133             sub fetch
1134             {
1135             throw_abstract "fetch is an abstract function and must be overridden";
1136             }
1137              
1138              
1139             =item $obj->get
1140              
1141             This WebFetch utility function will get a URL and return a reference
1142             to a scalar with the retrieved contents.
1143             Upon entry to this function, C<$obj> must contain the following attributes:
1144              
1145             =over 4
1146              
1147             =item source
1148              
1149             the URL to get
1150              
1151             =item quiet
1152              
1153             a flag which, when set to a non-zero (true) value,
1154             suppresses printing of HTTP request errors on STDERR
1155              
1156             =back
1157              
1158             =cut
1159              
1160             # utility function to get the contents of a URL
1161             sub get
1162             {
1163             my ( $self, $source ) = @_;
1164              
1165             if ( ! defined $source ) {
1166             $source = $self->{source};
1167             }
1168             if ( $self->{debug}) {
1169             print STDERR "debug: get(".$source.")\n";
1170             }
1171              
1172             # send request, capture response
1173             my $ua = LWP::UserAgent->new;
1174             $ua->agent("WebFetch/$VERSION ".$ua->agent);
1175             my $request = HTTP::Request->new(GET => $source);
1176             my $response = $ua->request($request);
1177              
1178             # abort on failure
1179             if ($response->is_error) {
1180             WebFetch::Exception::NetworkGet->throw(
1181             "The request received an error: "
1182             .$response->as_string );
1183             }
1184              
1185             # return the content
1186             my $content = $response->content;
1187             return \$content;
1188             }
1189              
1190             =item $obj->html_savable( $filename, $content )
1191              
1192             I
1193             format handler functions. See do_actions() for details.>
1194              
1195             This WebFetch utility function stores pre-generated HTML in a new entry in
1196             the $obj->{savable} array, for later writing to a file.
1197             It's basically a simple wrapper that puts HTML comments
1198             warning that it's machine-generated around the provided HTML text.
1199             This is generally a good idea so that neophyte webmasters
1200             (and you know there are a lot of them in the world :-)
1201             will see the warning before trying to manually modify
1202             your automatically-generated text.
1203              
1204             See $obj->fetch for details on the contents of the C parameter
1205              
1206             =cut
1207              
1208             # utility function to make a savable record for HTML text
1209             sub html_savable
1210             {
1211             my ( $self, $filename, $content ) = @_;
1212              
1213             $self->raw_savable( $filename,
1214             "\n"
1216             ."\n"
1218             .$content
1219             ."\n" );
1221             }
1222              
1223             =item $obj->raw_savable( $filename, $content )
1224              
1225             I
1226             format handler functions. See do_actions() for details.>
1227              
1228             This WebFetch utility function stores any raw content and a filename
1229             in the $obj->{savable} array,
1230             in preparation for writing to that file.
1231             (The actual save operation may also automatically include keeping
1232             backup files and setting the group and mode of the file.)
1233              
1234             See $obj->fetch for details on the contents of the C parameter
1235              
1236             =cut
1237              
1238             # utility function to make a savable record for raw text
1239             sub raw_savable
1240             {
1241             my ( $self, $filename, $content ) = @_;
1242              
1243             if ( !exists $self->{savable}) {
1244             $self->{savable} = [];
1245             }
1246             push ( @{$self->{savable}}, {
1247             'file' => $filename,
1248             'content' => $content,
1249             (( exists $self->{group}) ? ('group' => $self->{group}) : ()),
1250             (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ())
1251             });
1252             }
1253              
1254             =item $obj->direct_fetch_savable( $filename, $source )
1255              
1256             I
1257             See do_actions() for details.>
1258              
1259             This adds a task for the save function to fetch a URL and save it
1260             verbatim in a file. This can be used to download links contained
1261             in a news feed.
1262              
1263             =cut
1264              
1265             sub direct_fetch_savable
1266             {
1267             my ( $self, $url ) = @_;
1268              
1269             if ( !exists $self->{savable}) {
1270             $self->{savable} = [];
1271             }
1272             my $filename = $url;
1273             $filename =~ s=[;?].*==;
1274             $filename =~ s=^.*/==;
1275             push ( @{$self->{savable}}, {
1276             'url' => $url,
1277             'file' => $filename,
1278             'index' => 1,
1279             (( exists $self->{group}) ? ('group' => $self->{group}) : ()),
1280             (( exists $self->{mode}) ? ('mode' => $self->{mode}) : ())
1281             });
1282             }
1283              
1284             =item $obj->no_savables_ok
1285              
1286             This can be used by an output function which handles its own intricate output
1287             operation (such as WebFetch::Output::TWiki). If the savables array is empty,
1288             it would cause an error. Using this function drops a note in it which
1289             basically says that's OK.
1290              
1291             =cut
1292              
1293             sub no_savables_ok
1294             {
1295             my $self = shift;
1296              
1297             push ( @{$self->{savable}}, {
1298             'ok_empty' => 1,
1299             });
1300             }
1301              
1302             =item $obj->save
1303              
1304             This WebFetch utility function goes through all the entries in the
1305             $obj->{savable} array and saves their contents,
1306             providing several services such as keeping backup copies,
1307             and setting the group and mode of the file, if requested to do so.
1308              
1309             If you call a WebFetch-derived module from the command-line run()
1310             or fetch_main() functions, this will already be done for you.
1311             Otherwise you will need to call it after populating the
1312             C array with one entry per file to save.
1313              
1314             Upon entry to this function, C<$obj> must contain the following attributes:
1315              
1316             =over 4
1317              
1318             =item dir
1319              
1320             directory to save files in
1321              
1322             =item savable
1323              
1324             names and contents for files to save
1325              
1326             =back
1327              
1328             See $obj->fetch for details on the contents of the C parameter
1329              
1330             =cut
1331              
1332             # file-save routines for all WebFetch-derived classes
1333             sub save
1334             {
1335             my $self = shift;
1336              
1337             if ( $self->{debug} ) {
1338             print STDERR "entering save()\n";
1339             }
1340              
1341             # check if we have attributes needed to proceed
1342             if ( !exists $self->{"dir"}) {
1343             die "WebFetch: directory path missing - "
1344             ."required for save\n";
1345             }
1346             if ( !exists $self->{savable}) {
1347             die "WebFetch: nothing to save\n";
1348             }
1349             if ( ref($self->{savable}) ne "ARRAY" ) {
1350             die "WebFetch: cannot save - savable is not an array\n";
1351             }
1352              
1353             # if fetch_urls is defined, turn link fields in the data to savables
1354             if (( exists $self->{fetch_urls}) and $self->{fetch_urls}) {
1355             my $url_fnum = $self->wk2fnum( "url" );
1356             my $entry;
1357             $self->data->reset_pos;
1358             while ( $entry = $self->data->next_record()) {
1359             my $url = $entry->url;
1360             if ( defined $url ) {
1361             $self->direct_fetch_savable( $entry->url );
1362             }
1363             }
1364             }
1365              
1366             # loop through "savable" (grouped content and filename destination)
1367             my $savable;
1368             foreach $savable ( @{$self->{savable}}) {
1369              
1370             if ( exists $savable->{file}) {
1371             debug "saving ".$savable->{file}."\n";
1372             }
1373              
1374             # an output module may have handled a more intricate operation
1375             if ( exists $savable->{ok_empty}) {
1376             last;
1377             }
1378              
1379             # verify contents of savable record
1380             if ( !exists $savable->{file}) {
1381             $savable->{error} = "missing file name - skipped";
1382             next;
1383             }
1384             if (( !exists $savable->{content})
1385             and ( !exists $savable->{url}))
1386             {
1387             $savable->{error} = "missing content or URL - skipped";
1388             next;
1389             }
1390              
1391             # generate file names
1392             my $new_content = $self->{"dir"}."/N".$savable->{file};
1393             my $main_content = $self->{"dir"}."/".$savable->{file};
1394             my $old_content = $self->{"dir"}."/O".$savable->{file};
1395              
1396             # make sure the Nxx "new content" file does not exist yet
1397             if ( -f $new_content ) {
1398             if ( !unlink $new_content ) {
1399             $savable->{error} = "cannot unlink "
1400             .$new_content.": $!";
1401             next;
1402             }
1403             }
1404              
1405             # if a URL was provided and index flag is set, use index file
1406             my %id_index;
1407             my ( $timestamp, $filename );
1408             my $was_in_index = 0;
1409             if (( exists $savable->{url}) and ( exists $savable->{index}))
1410             {
1411             require DB_File;
1412             tie %id_index, 'DB_File',
1413             $self->{dir}."/id_index.db",
1414             &DB_File::O_CREAT|&DB_File::O_RDWR, 0640;
1415             if ( exists $id_index{$savable->{url}}) {
1416             ( $timestamp, $filename ) =
1417             split /#/, $id_index{$savable->{url}};
1418             $was_in_index = 1;
1419             } else {
1420             $timestamp = time;
1421             $id_index{$savable->{url}} =
1422             $timestamp."#".$savable->{file};
1423             }
1424             untie %id_index ;
1425             }
1426              
1427             # For now, we consider it done if the file was in the index.
1428             # Future options would be to check if URL was modified.
1429             if ( $was_in_index ) {
1430             next;
1431             }
1432              
1433             # if a URL was provided and no content, get content from URL
1434             if (( ! exists $savable->{content})
1435             and ( exists $savable->{url}))
1436             {
1437             $savable->{content} =
1438             eval { ${$self->get($savable->{url})} };
1439             if ( $@ ) {
1440             next;
1441             }
1442             }
1443              
1444             # write content to the "new content" file
1445             if ( ! open ( new_content, ">:utf8", "$new_content" )) {
1446             $savable->{error} = "cannot open $new_content: $!";
1447             next;
1448             }
1449             if ( !print new_content $savable->{content}) {
1450             $savable->{error} = "failed to write to "
1451             .$new_content.": $!";
1452             close new_content;
1453             next;
1454             }
1455             if ( !close new_content ) {
1456             # this can happen with NFS errors
1457             $savable->{error} = "failed to close "
1458             .$new_content.": $!";
1459             next;
1460             }
1461              
1462             # remove the "old content" file to get it out of the way
1463             if ( -f $old_content ) {
1464             if ( !unlink $old_content ) {
1465             $savable->{error} = "cannot unlink "
1466             .$old_content.": $!";
1467             next;
1468             }
1469             }
1470              
1471             # move the main content to the old content - now it's a backup
1472             if ( -f $main_content ) {
1473             if ( !rename $main_content, $old_content ) {
1474             $savable->{error} = "cannot rename "
1475             .$main_content." to "
1476             .$old_content.": $!";
1477             next;
1478             }
1479             }
1480              
1481             # chgrp the "new content" before final installation
1482             if ( exists $savable->{group}) {
1483             my $gid = $savable->{group};
1484             if ( $gid !~ /^[0-9]+$/o ) {
1485             $gid = (getgrnam($gid))[2];
1486             if ( ! defined $gid ) {
1487             $savable->{error} = "cannot chgrp "
1488             .$new_content.": "
1489             .$savable->{group}
1490             ." does not exist";
1491             next;
1492             }
1493             }
1494             if ( ! chown $>, $gid, $new_content ) {
1495             $savable->{error} = "cannot chgrp "
1496             .$new_content." to "
1497             .$savable->{group}.": $!";
1498             next;
1499             }
1500             }
1501              
1502             # chmod the "new content" before final installation
1503             if ( exists $savable->{mode}) {
1504             if ( ! chmod oct($savable->{mode}), $new_content ) {
1505             $savable->{error} = "cannot chmod "
1506             .$new_content." to "
1507             .$savable->{mode}.": $!";
1508             next;
1509             }
1510             }
1511              
1512             # move the new content to the main content - final install
1513             if ( -f $new_content ) {
1514             if ( !rename $new_content, $main_content ) {
1515             $savable->{error} = "cannot rename "
1516             .$new_content." to "
1517             .$main_content.": $!";
1518             next;
1519             }
1520             }
1521             }
1522              
1523             # loop through savable to report any errors
1524             my $err_count = 0;
1525             foreach $savable ( @{$self->{savable}}) {
1526             if ( exists $savable->{error}) {
1527             print STDERR "WebFetch: failed to save "
1528             .$savable->{file}.": "
1529             .$savable->{error}."\n";
1530             $err_count++;
1531             }
1532             }
1533             if ( $err_count ) {
1534             die "WebFetch: $err_count errors - fetch/save failed\n";
1535             }
1536              
1537             # success if we got here
1538             return 1;
1539             }
1540              
1541             #
1542             # shortcuts to data object functions
1543             #
1544              
1545             sub data { my $self = shift; return $self->{data}; }
1546             sub wk2fname { my $self = shift; return $self->{data}->wk2fname( @_ )};
1547             sub fname2fnum { my $self = shift; return $self->{data}->fname2fnum( @_ )};
1548             sub wk2fnum { my $self = shift; return $self->{data}->wk2fnum( @_ )};
1549              
1550             =item AUTOLOAD functionality
1551              
1552             When a WebFetch input object is passed to an output class, operations
1553             on $self would not usually work. WebFetch subclasses are considered to be
1554             cooperating with each other. So WebFetch provides AUTOLOAD functionality
1555             to catch undefined function calls for its subclasses. If the calling
1556             class provides a function by the name that was attempted, then it will
1557             be redirected there.
1558              
1559             =cut
1560              
1561             # autoloader catches calls to unknown functions
1562             # redirect to the class which made the call, if the function exists
1563             sub AUTOLOAD
1564             {
1565             my $self = shift;
1566             my $type = ref($self) or throw_autoload_fail "self is not an object";
1567              
1568             my $name = $AUTOLOAD;
1569             $name =~ s/.*://; # strip fully-qualified portion, just want function
1570              
1571             # decline all-caps names - reserved for special Perl functions
1572             my ( $package, $filename, $line ) = caller;
1573             ( $name =~ /^[A-Z]+$/ ) and return;
1574             debug __PACKAGE__."::AUTOLOAD $name";
1575              
1576             # check for function in caller package
1577             # (WebFetch may hand an input module's object to an output module)
1578             if ( $package->can( $name )) {
1579             # make an alias of the sub
1580             {
1581             no strict 'refs';
1582             *{__PACKAGE__."::".$name} = \&{$package."::".$name};
1583             }
1584             #my $retval = eval $package."::".$name."( \$self, \@_ )";
1585             my $retval = eval { $self->$name( @_ ); };
1586             if ( $@ ) {
1587             my $e = Exception::Class->caught();
1588             ref $e ? $e->rethrow
1589             : throw_autoload_fail "failure in "
1590             ."autoloaded function: ".$e;
1591             }
1592             return $retval;
1593             }
1594              
1595             # if we got here, we failed
1596             throw_autoload_fail "function $name not found - "
1597             ."called by $package ($filename line $line)";
1598             }
1599              
1600             1;
1601             __END__