File Coverage

blib/lib/CPAN/Visitor.pm
Criterion Covered Total %
statement 39 139 28.0
branch 0 54 0.0
condition 0 15 0.0
subroutine 13 27 48.1
pod 2 3 66.6
total 54 238 22.6


line stmt bran cond sub pod time code
1 1     1   744 use 5.006;
  1         1  
  1         28  
2 1     1   3 use strict;
  1         1  
  1         24  
3 1     1   4 use warnings;
  1         1  
  1         48  
4             package CPAN::Visitor;
5             # ABSTRACT: Generic traversal of distributions in a CPAN repository
6              
7             our $VERSION = '0.005';
8              
9 1     1   432 use autodie;
  1         10989  
  1         6  
10              
11 1     1   4304 use Archive::Extract 0.34 ();
  1         110365  
  1         26  
12 1     1   7 use File::Find ();
  1         1  
  1         14  
13 1     1   388 use File::pushd 1.00 ();
  1         8152  
  1         27  
14 1     1   5 use File::Temp 0.20 ();
  1         15  
  1         14  
15 1     1   324 use Path::Class 0.17 ();
  1         13494  
  1         23  
16 1     1   472 use Parallel::ForkManager 0.007005 ();
  1         4328  
  1         22  
17              
18 1     1   444 use Moose 0.93 ;
  1         288414  
  1         8  
19 1     1   5178 use MooseX::Params::Validate 0.13;
  1         48392  
  1         5  
20 1     1   678 use namespace::autoclean 0.09 ;
  1         864  
  1         4  
21              
22             has 'cpan' => ( is => 'ro', required => 1 );
23             has 'quiet' => ( is => 'ro', default => 0 );
24             has 'stash' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
25             has 'files' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
26              
27             sub BUILD {
28 0     0 0   my $self = shift;
29 0 0 0       unless (
30             -d $self->cpan &&
31             -d Path::Class::dir($self->cpan, 'authors', 'id')
32             ) {
33 0           die "'cpan' parameter must be the root of a CPAN repository";
34             }
35             }
36              
37             #--------------------------------------------------------------------------#
38             # selection methods
39             #--------------------------------------------------------------------------#
40              
41             my $archive_re = qr{\.(?:tar\.(?:bz2|gz|Z)|t(?:gz|bz)|zip)$}i;
42              
43             sub select {
44 0     0 1   my ($self, %params) = validated_hash( \@_,
45             match => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [qr/./] },
46             exclude => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [] },
47             subtrees => { isa => 'Str | ArrayRef[Str]', default => [] },
48             all_files => { isa => 'Bool', default => 0 },
49             append => { isa => 'Bool', default => 0 },
50             );
51              
52             # normalize to arrayrefs
53 0           for my $k ( qw/match exclude subtrees/ ) {
54 0 0         next unless exists $params{$k};
55 0 0 0       next if ref $params{$k} && ref $params{$k} eq 'ARRAY';
56 0           $params{$k} = [ $params{$k} ];
57             }
58              
59             # determine search dirs
60 0           my $id_dir = Path::Class::dir($self->cpan, qw/authors id/);
61 0           my @search_dirs = map { $id_dir->subdir($_)->stringify } @{$params{subtrees}};
  0            
  0            
62 0 0         @search_dirs = $id_dir->stringify if ! @search_dirs;
63              
64             # perform search
65 0           my @found;
66             File::Find::find(
67             {
68             no_chdir => 1,
69             follow => 0,
70 0     0     preprocess => sub { my @files = sort @_; return @files },
  0            
71             wanted => sub {
72 0 0   0     return unless -f;
73 0 0 0       return unless $params{all_files} || /$archive_re/;
74 0           for my $re ( @{$params{exclude}} ) {
  0            
75 0 0         return if /$re/;
76             }
77 0           for my $re ( @{$params{match}} ) {
  0            
78 0 0         return if ! /$re/;
79             }
80 0           (my $f = Path::Class::file($_)->relative($id_dir)) =~ s{./../}{};
81 0           push @found, $f;
82             }
83             },
84 0           @search_dirs,
85             );
86              
87 0 0         if ( $params{append} ) {
88 0           push @{$self->files}, @found;
  0            
89             }
90             else {
91 0           @{$self->files} = @found;
  0            
92             }
93 0           return scalar @found;
94             }
95              
96             #--------------------------------------------------------------------------#
97             # default actions
98             #
99             # These are passed a "job" hashref. It is initialized with the following
100             # fields:
101             #
102             # distfile -- e.g. DAGOLDEN/CPAN-Visitor-0.001.tar.gz
103             # distpath -- e.g. /my/cpan/authors/id/D/DA/DAGOLDEN/CPAN-Visitor-0.001.tar.gz
104             # tempdir -- File::Temp directory object for extraction or other things
105             # stash -- the 'stash' hashref from the Visitor object
106             # quiet -- the 'quiet' flag from the Visitor object
107             # result -- an empty hashref to start; the return values from each
108             # action are added and may be referenced by subsequent actions
109             #
110             # E.g. the return value from 'extract' is the directory:
111             #
112             # $job->{result}{extract} = $unpacked_directory;
113             #
114             #--------------------------------------------------------------------------#
115              
116 0     0     sub _check { 1 } # always proceed
117              
118 0     0     sub _start { 1 } # no special start action
119              
120             # _extract returns the proper directory to chdir into
121             # if the $job->{stash}{prefer_bin} is true, it will tell Archive::Extract
122             # to use binaries
123             sub _extract {
124 0     0     my $job = shift;
125 0           local $Archive::Extract::DEBUG = 0;
126 0 0         local $Archive::Extract::PREFER_BIN = $job->{stash}{prefer_bin} ? 1 : 0;
127 0 0         local $Archive::Extract::WARN = $job->{quiet} ? 0 : 1;
128              
129             # cd to tmpdir for duration of this sub
130 0           my $pushd = File::pushd::pushd( $job->{tempdir} );
131              
132 0           my $ae = Archive::Extract->new( archive => $job->{distpath} );
133              
134 0           my $olderr;
135              
136             # stderr > /dev/null if quiet
137 0 0         if ( ! $Archive::Extract::WARN ) {
138 0           open $olderr, ">&STDERR";
139 0           open STDERR, ">", File::Spec->devnull;
140             }
141              
142 0           my $extract_ok = $ae->extract;
143              
144             # restore stderr if quiet
145 0 0         if ( ! $Archive::Extract::WARN ) {
146 0           open STDERR, ">&", $olderr;
147 0           close $olderr;
148             }
149              
150 0 0         if ( ! $extract_ok ) {
151 0 0         warn "Couldn't extract '$job->{distpath}'\n" if $Archive::Extract::WARN;
152 0           return;
153             }
154              
155             # most distributions unpack a single directory that we must enter
156             # but some behave poorly and unpack to the current directory
157 0           my @children = Path::Class::dir()->children;
158 0 0 0       if ( @children == 1 && -d $children[0] ) {
159 0           return Path::Class::dir($job->{tempdir}, $children[0])->absolute->stringify;
160             }
161             else {
162 0           return Path::Class::dir($job->{tempdir})->absolute->stringify;
163             }
164             }
165              
166             sub _enter {
167 0     0     my $job = shift;
168 0           my $curdir = Path::Class::dir()->absolute;
169 0 0         my $target_dir = $job->{result}{extract} or return;
170 0 0         if ( -d $target_dir ) {
171 0 0         unless ( -x $target_dir ) {
172 0 0         warn "Directory '$target_dir' missing +x; trying to fix it\n"
173             unless $job->{quiet};
174 0           chmod 0755, $target_dir;
175             }
176 0           chdir $target_dir;
177             }
178             else {
179 0 0         warn "Can't chdir to directory '$target_dir'\n"
180             unless $job->{quiet};
181 0           return;
182             }
183 0           return $curdir;
184             }
185              
186 0     0     sub _visit { 1 } # do nothing
187              
188             # chdir out and clean up
189             sub _leave {
190 0     0     my $job = shift;
191 0           chdir $job->{result}{enter};
192 0           return 1;
193             }
194              
195 0     0     sub _finish { 1 } # no special finish action
196              
197             #--------------------------------------------------------------------------#
198             # iteration methods
199             #--------------------------------------------------------------------------#
200              
201             # iterate()
202             #
203             # Arguments:
204             #
205             # jobs -- if greater than 1, distributions are processed in parallel
206             # via Parallel::ForkManager
207             #
208             # iterate() takes several optional callbacks which are run in the following
209             # order. Callbacks get a single hashref argument as described above under
210             # default actions.
211             #
212             # check -- whether the distribution should be processed; goes to next file
213             # if false; default is always true
214             #
215             # start -- used for any setup, logging, etc; default does nothing
216             #
217             # extract -- extracts a distribution into a temp directory or otherwise
218             # prepares for visiting; skips to finish action if it returns
219             # a false value; default returns the path to the extracted
220             # directory
221             #
222             # enter -- skips to the finish action if it returns false; default takes
223             # the result of extract, chdir's into it, and returns the
224             # original directory
225             #
226             # visit -- examine the distribution or otherwise do stuff; the default
227             # does nothing;
228             #
229             # leave -- default returns to the original directory (the result of enter)
230             #
231             # finish -- any teardown processing, logging, etc.
232              
233             sub iterate {
234 0     0 1   my ($self, %params) = validated_hash( \@_,
235             jobs => { isa => 'Int', default => 0 },
236             check => { isa => 'CodeRef', default => \&_check },
237             start => { isa => 'CodeRef', default => \&_start },
238             extract => { isa => 'CodeRef', default => \&_extract },
239             enter => { isa => 'CodeRef', default => \&_enter },
240             visit => { isa => 'CodeRef', default => \&_visit },
241             leave => { isa => 'CodeRef', default => \&_leave },
242             finish => { isa => 'CodeRef', default => \&_finish },
243             );
244              
245 0 0         my $pm = Parallel::ForkManager->new( $params{jobs} > 1 ? $params{jobs} : 0 );
246 0           for my $distfile ( @{ $self->files } ) {
  0            
247 0 0         $pm->start and next;
248 0           $self->_iterate($distfile, \%params);
249 0           $pm->finish;
250             }
251 0           $pm->wait_all_children;
252 0           return 1;
253             }
254              
255             sub _iterate {
256 0     0     my ($self, $distfile, $params) = @_;
257 0           my $curdir = Path::Class::dir()->absolute;
258              
259             # $job outside eval so that later chdir to original directory
260             # happens before $job is destroyed and any tempdirs deleted
261 0           my $job;
262 0           eval {
263 0           $job = {
264             distfile => $distfile,
265             distpath => $self->_fullpath($distfile),
266             tempdir => File::Temp->newdir(),
267             stash => $self->stash,
268             quiet => $self->quiet,
269             result => {},
270             };
271 0 0         $job->{result}{check} = $params->{check}->($job) or return;
272 0           $job->{result}{start} = $params->{start}->($job);
273 0 0         ACTION: {
274 0           $job->{result}{extract} = $params->{extract}->($job) or last ACTION;
275 0 0         $job->{result}{enter} = $params->{enter}->($job) or last ACTION;
276 0           $job->{result}{visit} = $params->{visit}->($job);
277 0           $job->{result}{leave} = $params->{leave}->($job);
278             }
279 0           $params->{finish}->($job);
280             };
281 0 0 0       warn "Error visiting $distfile: $@\n" if $@ && ! $self->quiet;
282 0           chdir $curdir;
283 0           return;
284             }
285              
286             sub _fullpath {
287 0     0     my ($self, $distfile) = @_;
288 0           my ($two, $one) = $distfile =~ /\A((.).)/;
289 0           return Path::Class::file(
290             $self->cpan, 'authors', 'id', $one, $two, $distfile
291             )->absolute->stringify;
292             }
293              
294             __PACKAGE__->meta->make_immutable;
295              
296             1;
297              
298             __END__
299              
300             =pod
301              
302             =encoding UTF-8
303              
304             =head1 NAME
305              
306             CPAN::Visitor - Generic traversal of distributions in a CPAN repository
307              
308             =head1 VERSION
309              
310             version 0.005
311              
312             =head1 SYNOPSIS
313              
314             use CPAN::Visitor;
315             my $visitor = CPAN::Visitor->new( cpan => "/path/to/cpan" );
316              
317             # Prepare to visit all distributions
318             $visitor->select();
319              
320             # Or a subset of distributions
321             $visitor->select(
322             subtrees => [ 'D/DA', 'A/AD' ], # relative to authors/id/
323             exclude => qr{/Acme-}, # No Acme- dists
324             match => qr{/Test-} # Only Test- dists
325             );
326              
327             # Action is specified via a callback
328             $visitor->iterate(
329             visit => sub {
330             my $job = shift;
331             print $job->{distfile} if -f 'Build.PL'
332             }
333             );
334              
335             # Or start with a list of files
336             $visitor = CPAN::Visitor->new(
337             cpan => "/path/to/cpan",
338             files => \@distfiles, # e.g. ANDK/CPAN-1.94.tar.gz
339             );
340             $visitor->iterate( visit => \&callback );
341              
342             # Iterate in parallel
343             $visitor->iterate( visit => \&callback, jobs => 5 );
344              
345             =head1 DESCRIPTION
346              
347             A very generic, callback-driven program to iterate over a CPAN repository.
348              
349             Needs better documentation and tests, but is provided for others to examine,
350             use or contribute to.
351              
352             =head1 USAGE
353              
354             =head2 new
355              
356             my $visitor = CPAN::Visitor->new( @args );
357              
358             Object attributes include:
359              
360             =over 4
361              
362             =item *
363              
364             C<cpan> — path to CPAN or mini CPAN repository. Required.
365              
366             =item *
367              
368             C<quiet> — whether warnings should be silenced (e.g. from extraction). Optional.
369              
370             =item *
371              
372             C<stash> — hash-ref of user-data to be made available during iteration. Optional.
373              
374             =item *
375              
376             C<files> — array-ref with a pre-selection of of distribution files. These must be in AUTHOR/NAME.suffix format. Optional.
377              
378             =back
379              
380             =head2 select
381              
382             $visitor->select( @args );
383              
384             Valid arguments include:
385              
386             =over 4
387              
388             =item *
389              
390             C<subtrees> — path or array-ref of paths to search. These must be relative to the 'authors/id/' directory within a CPAN repo. If given, only files within those subtrees will be considered. If not specified, the entire 'authors/id' tree is searched.
391              
392             =item *
393              
394             C<exclude> — qr() or array-ref of qr() patterns. If a path matches *any* pattern, it is excluded
395              
396             =item *
397              
398             C<match> — qr() or array-ref of qr() patterns. If an array-ref is provided, only paths that match *all* patterns are included
399              
400             =item *
401              
402             all_files — boolean that determines whether all files or only files that have a distribution archive suffix are selected. Default is false.
403              
404             =item *
405              
406             append — boolean that determines whether the selected files should be appended to previously selected files. The default is false, which replaces any previous selection
407              
408             =back
409              
410             The C<select> method returns a count of files selected.
411              
412             =head2 iterate
413              
414             $visitor->iterate( @args );
415              
416             Valid arguments include:
417              
418             =over 4
419              
420             =item *
421              
422             C<jobs> — non-negative integer specifying the maximum number of forked processes. Defaults to none.
423              
424             =item *
425              
426             C<check> — code reference callback
427              
428             =item *
429              
430             C<start> — code reference callback
431              
432             =item *
433              
434             C<extract> — code reference callback
435              
436             =item *
437              
438             C<enter> — code reference callback
439              
440             =item *
441              
442             C<visit> — code reference callback
443              
444             =item *
445              
446             C<leave> — code reference callback
447              
448             =item *
449              
450             C<finish> — code reference callback
451              
452             =back
453              
454             See L</ACTION CALLBACKS> for more. Generally, you only need to provide the
455             C<visit> callback, which is called from inside the unpacked distribution
456             directory.
457              
458             The C<iterate> method always returns true.
459              
460             =for Pod::Coverage BUILD
461              
462             =head1 ACTION CALLBACKS
463              
464             Each selected distribution is processed with a series of callback
465             functions. These are each passed a hash-ref with information about
466             the particular distribution being processed.
467              
468             sub _my_visit {
469             my $job = shift;
470             # do stuff
471             }
472              
473             The job hash-ref is initialized with the following fields:
474              
475             =over 4
476              
477             =item *
478              
479             C<distfile> — the unique, short CPAN distfile name, e.g. DAGOLDEN/CPAN-Visitor-0.001.tar.gz
480              
481             =item *
482              
483             C<distpath> — the absolute path the distribution archive, e.g. /my/cpan/authors/id/D/DA/DAGOLDEN/CPAN-Visitor-0.001.tar.gz
484              
485             =item *
486              
487             C<tempdir> — a File::Temp directory object for extraction or other things
488              
489             =item *
490              
491             C<stash> — the 'stash' hashref from the Visitor object
492              
493             =item *
494              
495             C<quiet> — the 'quiet' flag from the Visitor object
496              
497             =item *
498              
499             C<result> — an empty hashref to start; the return values from each action are added and may be referenced by subsequent actions
500              
501             =back
502              
503             The C<result> field is used to accumulate the return values from action
504             callbacks. For example, the return value from the default 'extract' action is
505             the unpacked distribution directory:
506              
507             $job->{result}{extract} # distribution directory path
508              
509             You do not need to store the results yourself — the C<iterate> method
510             takes care of it for you.
511              
512             Callbacks occur in the following order. Some callbacks skip further
513             processing if the return value is false.
514              
515             =over 4
516              
517             =item *
518              
519             C<check> — determines whether the distribution should be processed; goes to next file if false; default is always true
520              
521             =item *
522              
523             C<start> — used for any setup, logging, etc; default does nothing
524              
525             =item *
526              
527             C<extract> — operate on the tarball to prepare for visiting; skips to finish action if it returns a false value; the default extracts a distribution into a temp directory and returns the path to the extracted directory; if the C<stash> has a true value for C<prefer_bin>, binary tar, etc. will be preferred. This is faster, but less portable.
528              
529             =item *
530              
531             C<enter> — skips to the finish action if it returns false; default takes the result of extract, chdir's into it, and returns the original directory; if the extract result is missing the +x permissions, this will attempt to add it before calling chdir.
532              
533             =item *
534              
535             C<visit> — examine the distribution or otherwise do stuff; the default does nothing;
536              
537             =item *
538              
539             C<leave> — default returns to the original directory (the result of enter)
540              
541             =item *
542              
543             C<finish> — any teardown processing, logging, etc.
544              
545             =back
546              
547             These allow complete customization of the iteration process. For example,
548             one could do something like this:
549              
550             =over 4
551              
552             =item *
553              
554             replace the default C<extract> callback with one that returns an arrayref of distribution files without actually unpacking it into a physical directory
555              
556             =item *
557              
558             replace the default C<enter> callback with one that does nothing but return a true value; replace the default C<leave> callback likewise
559              
560             =item *
561              
562             have the C<visit> callback get the C<< $job->{result}{extract} >> listing and examine it for the presence of certain files
563              
564             =back
565              
566             This could potentially speed up iteration if only the file names within
567             the distribution are of interest and not the contents of the actual files.
568              
569             =head1 SEE ALSO
570              
571             =over 4
572              
573             =item *
574              
575             L<App::CPAN::Mini::Visit>
576              
577             =item *
578              
579             L<CPAN::Mini::Visit>
580              
581             =back
582              
583             =for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
584              
585             =head1 SUPPORT
586              
587             =head2 Bugs / Feature Requests
588              
589             Please report any bugs or feature requests through the issue tracker
590             at L<https://github.com/dagolden/CPAN-Visitor/issues>.
591             You will be notified automatically of any progress on your issue.
592              
593             =head2 Source Code
594              
595             This is open source software. The code repository is available for
596             public review and contribution under the terms of the license.
597              
598             L<https://github.com/dagolden/CPAN-Visitor>
599              
600             git clone https://github.com/dagolden/CPAN-Visitor.git
601              
602             =head1 AUTHOR
603              
604             David Golden <dagolden@cpan.org>
605              
606             =head1 COPYRIGHT AND LICENSE
607              
608             This software is Copyright (c) 2010 by David Golden.
609              
610             This is free software, licensed under:
611              
612             The Apache License, Version 2.0, January 2004
613              
614             =cut