File Coverage

blib/lib/AI/MicroStructure/RemoteList.pm
Criterion Covered Total %
statement 18 72 25.0
branch 0 22 0.0
condition 0 24 0.0
subroutine 6 16 37.5
pod 8 8 100.0
total 32 142 22.5


line stmt bran cond sub pod time code
1             =encoding iso-8859-1
2              
3             =cut
4              
5             package AI::MicroStructure::RemoteList;
6 2     2   9 use strict;
  2         3  
  2         46  
7 2     2   46 use warnings;
  2         3  
  2         52  
8 2     2   10 use Carp;
  2         2  
  2         193  
9              
10             our $VERSION = '0.20';
11              
12             # method that extracts the items from the remote content and returns them
13             sub extract {
14 0   0 0 1   my $class = ref $_[0] || $_[0];
15 2     2   9 no strict 'refs';
  2         4  
  2         388  
16 0           my $func = ${"$class\::Remote"}{extract};
  0            
17              
18             # provide a very basic default
19             my $meth = ref $func eq 'CODE'
20 0     0     ? sub { my %seen; return grep { !$seen{$_}++ } $func->( $_[1], $_[2] ); }
  0            
  0            
21 0 0   0     : sub { return $_[1] }; # very basic default
  0            
22              
23             # put the method in the subclass symbol table (at runtime)
24 0           *{"$class\::extract"} = $meth;
  0            
25              
26             # now run the function^Wmethod
27 0           goto &$meth;
28             }
29              
30             # methods related to the source URL
31             sub source {
32 0   0 0 1   my $class = ref $_[0] || $_[0];
33 2     2   9 no strict 'refs';
  2         3  
  2         170  
34              
35 0           return ${"$class\::Remote"}{source};
  0            
36             }
37              
38             sub sources {
39 0   0 0 1   my $class = ref $_[0] || $_[0];
40 2     2   9 no strict 'refs';
  2         4  
  2         1430  
41              
42 0           my $src = ${"$class\::Remote"}{source};
  0            
43 0 0         if ( ref $src eq 'ARRAY' ) {
    0          
44 0           return @$src;
45             }
46             elsif ( ref $src eq 'HASH' ) {
47             return grep $_,
48             defined $_[1] && $_[1] ne ':all'
49 0           ? ref $_[1] ? @$src{ @{ $_[1] } }
50 0 0 0       : $src->{ $_[1] }
    0          
51             : values %$src;
52             }
53 0           return $src;
54             }
55              
56 0     0 1   sub has_remotelist { return defined $_[0]->source(); }
57              
58             # main method: return the list from the remote source
59             sub remote_list {
60 0   0 0 1   my $class = ref $_[0] || $_[0];
61 0 0         return unless $class->has_remotelist();
62              
63             # check that we can access the network
64 0           eval {
65 0           require LWP::UserAgent;
66 0 0         die "version 5.802 required ($LWP::VERSION installed)\n"
67             if $LWP::VERSION < 5.802;
68             };
69 0 0         if ($@) {
70 0           carp "LWP::UserAgent not available: $@";
71 0           return;
72             }
73              
74             # figure out the default category (for an instance)
75 0 0 0       my $category = ref $_[0] ? $_[1] || $_[0]->{category} : $_[1];
76              
77             # fetch the content
78 0           my @items;
79 0           my @srcs = $class->sources($category);
80 0           my $ua = LWP::UserAgent->new( env_proxy => 1 );
81 0           foreach my $src (@srcs) {
82 0 0         my $request = HTTP::Request->new(
83             ref $src
84             ? ( POST => $src->[0],
85             [ content_type => 'application/x-www-form-urlencoded' ],
86             $src->[1]
87             )
88             : ( GET => $src )
89             );
90              
91 0           my $res = $ua->request( $request );
92 0 0         if ( ! $res->is_success() ) {
93 0           carp "Failed to get content at $src (" . $res->status_line();
94 0           return;
95             }
96              
97             # extract, cleanup and return the data
98             # if decoding the content fails, we just deal with the raw content
99 0   0       push @items =>
      0        
100             $class->extract( $res->decoded_content() || $res->content(),
101             $category || () );
102              
103             }
104              
105             # return unique items
106 0           my %seen;
107 0           return grep { !$seen{$_}++ } @items;
  0            
108             }
109              
110             #
111             # transformation subroutines
112             #
113             sub tr_nonword {
114 0     0 1   my $str = shift;
115 0           $str =~ tr/a-zA-Z0-9_/_/c;
116 0           $str;
117             }
118              
119             sub tr_accent {
120 0     0 1   my $str = shift;
121 0           $str =~ tr{ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ}
122             {AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy};
123 0           return $str;
124             }
125              
126             my %utf2asc = (
127             "æ" => 'ae',
128             "Æ" => 'AE',
129             "\xc5\xa0" => 'S',
130             "\x{0160}" => 'S',
131             # for pokemons
132             "\x{0101}" => 'a',
133             "\x{012b}" => 'i',
134             "\x{014d}" => 'o',
135             "\x{016b}" => 'u',
136             "\xe2\x99\x80" => 'female',
137             "\xe2\x99\x82" => 'male',
138             "\x{2640}" => 'female',
139             "\x{2642}" => 'male',
140             );
141             my $utf_re = qr/(@{[join( '|', sort keys %utf2asc )]})/;
142              
143             sub tr_utf8_basic {
144 0     0 1   my $str = shift;
145 0           $str =~ s/$utf_re/$utf2asc{$1}/go;
146 0           return $str;
147             }
148              
149             1;
150              
151             __END__
152              
153             =head1 NAME
154              
155             AI::MicroStructure::RemoteList - Retrieval of a remote source for a structure
156              
157             =head1 SYNOPSIS
158              
159             package AI::MicroStructure::contributors;
160             use strict;
161             use AI::MicroStructure::List;
162             our @ISA = qw( AI::MicroStructure::List );
163              
164             # data regarding the remote source
165             our %Remote = (
166             source =>
167             'http://search.cpan.org/dist/AI-MicroStructure/CONTRIBUTORS',
168             extract => sub {
169             my $content = shift;
170             my @items =
171             map { AI::MicroStructure::RemoteList::tr_nonword($_) }
172             map { AI::MicroStructure::RemoteList::tr_accent($_) }
173             $content =~ /^\* (.*?)\s*$/gm;
174             return @items;
175             },
176             );
177              
178             __PACKAGE__->init();
179              
180             1;
181              
182             # and the usual documentation and list definition
183              
184             =head1 DESCRIPTION
185              
186             This base class adds the capability to fetch a fresh list of items from a
187             remote source to any structure that requires it.
188              
189             To be able to fetch remote items, an C<AI::MicroStructure> structure must
190             define the package hash variable C<%Remote> with the appropriate keys.
191              
192             The keys are:
193              
194             =over 4
195              
196             =item C<source>
197              
198             The URL where the data is available. The content will be passed to the
199             C<extract> subroutine.
200              
201             Because of the various way the data can be made available on the web
202             and can be used in L<AI::MicroStructure>, this scheme has evolved to
203             support several cases:
204              
205             Single source URL:
206              
207             source => $url
208              
209             Multiple source URL:
210              
211             source => [ $url1, $url2, ... ]
212              
213             For structures with categories, it's possible to attach a URL for each
214             category:
215              
216             source => {
217             category1 => $url1,
218             category2 => $url2,
219             ...
220             }
221              
222             In the case where the C<source> is an array or a hash reference, an
223             extra case is supported, in case the source data can only be obtained
224             via a C<POST> request. In that case, the source should be provided as
225             either:
226              
227             source => [
228             [ $url1 => $data1 ],
229             [ $url2 => $data2 ],
230             ...
231             ]
232              
233             or
234              
235             source => {
236             category1 => [ $url1 => $data1 ],
237             category2 => [ $url2 => $data2 ],
238             ...
239             }
240              
241             It is possible to mix C<POST> and C<GET> URL:
242              
243             source => [
244             $url1, # GET
245             [ $url2 => $data2 ], # POST
246             ...
247             ]
248              
249             or
250              
251             source => {
252             category1 => $url1, # GET
253             category2 => [ $url2 => $data2 ], # POST
254             ...
255             }
256              
257             This means that even if there is only one source and a C<POST> request
258             must be used, then it must be provided as a list of a single item:
259              
260             source => [ [ $url => $data ] ]
261              
262             =item C<extract>
263              
264             A reference to a subroutine that extracts a list of items from a string.
265             The string is meant to be the content available at the URL stored in
266             the C<source> key.
267              
268             The coderef may receive an optional parameter corresponding to the name of
269             the category (useful if the coderef must behave differently depending on
270             the category).
271              
272             =back
273              
274             C<LWP::Simple> is used to download the remote data.
275              
276             All existing C<AI::MicroStructure> behaviours
277             (C<AI::MicroStructure::List> and C<AI::MicroStructure::Locale> are
278             subclasses of C<AI::MicroStructure::RemoteList>.
279              
280             =head1 METHODS
281              
282             As an ancestor, this class adds the following methods to an
283             C<AI::MicroStructure> structure:
284              
285             =over 4
286              
287             =item remote_list()
288              
289             Returns the list of items available at the remote source, or an empty
290             list in case of error.
291              
292             =item has_remotelist()
293              
294             Return a boolean indicating if the C<source> key is defined (and therefore
295             if the structure actually has a remote list).
296              
297             =item source()
298              
299             Return the data structure containing the source URLs. This can be quite
300             different depending on the class: a single scalar (URL), an array
301             reference (list of URLs) or a hash reference (each value being either
302             a scalar or an array reference) for structures that are subclasses of
303             C<AI::MicroStructure::MultiList>.
304              
305             =item sources( [ $category ] )
306              
307             Return the list of source URL. The C<$category> parameter can be used
308             to select the sources for a sub-category of the structure (in the case of
309             C<AI::MicroStructure::MultiList>).
310              
311             C<$category> can be an array reference containing a list of categories.
312              
313             =item extract( $content )
314              
315             Return a list of items from the C<$content> string. C<$content> is
316             expected to be the content available at the URL given by C<source()>.
317              
318             =back
319              
320             =head1 TRANSFORMATION SUBROUTINES
321              
322             The C<AI::MicroStructure::RemoteList> class also provides a few helper
323             subroutines that simplify the normalisation of items:
324              
325             =over 4
326              
327             =item tr_nonword( $str )
328              
329             Return a copy of C<$str> with all non-word characters turned into
330             underscores (C<_>).
331              
332             =item tr_accent( $str )
333              
334             Return a copy of C<$str> will all iso-8859-1 accented characters turned
335             into basic ASCII characters.
336              
337             =item tr_utf8_basic( $str )
338              
339             Return a copy of C<$str> with some of the utf-8 accented characters turned
340             into basic ASCII characters. This is very crude, but I didn't to bother
341             and depend on the proper module to do that.
342              
343             =back
344              
345             =head1 AUTHOR
346              
347             'santex' << <santex@cpan.org> >>.
348              
349             =head1 SEE ALSO
350              
351             L<AI::MicroStructure>, L<AI::MicroStructure::List>,
352             L<AI::MicroStructure::Locale>.
353              
354             =head1 COPYRIGHT
355              
356             Copyright 2009-2016 Hagen Geissler, All Rights Reserved.
357              
358             =head1 LICENSE
359             This program is free software; you can redistribute it and/or modify it
360             under the same terms as Perl itself.
361              
362             =cut