File Coverage

blib/lib/Treex/Block/Read/BaseReader.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseReader;
2             $Treex::Block::Read::BaseReader::VERSION = '2.20160630';
3 3     3   41165 use Moose;
  3         451780  
  3         22  
4 3     3   22011 use Treex::Core::Common;
  3         9  
  3         20  
5 3     3   17333 use File::Slurp;
  3         9  
  3         248  
6             with 'Treex::Core::DocumentReader';    # compose the DocumentReader role into this class
7 3     3   1999 use Treex::Core::Document;
  0            
  0            
8              
9             sub next_document {
              # Abstract hook: this base implementation only calls log_fatal with a
              # message that names the class of the invocant (ref($self)), so every
              # concrete reader has to provide its own next_document.
10             my ($self) = @_;
11             return log_fatal "method next_document must be overridden in " . ref($self);
12             }
13              
14             # Read-only attributes. Defaults: language is 'und' (undetermined/unknown), selector is the empty string.
15             has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => q{} );
16             has language => ( isa => 'Treex::Type::LangCode', is => 'ro', default => 'und' );
17              
18              
19             has from => (
              # Required read-write attribute of type Treex::Core::Files describing
              # the input files; coerce => 1 lets Moose convert other values to that type.
              # The methods current_filename, file_number and _set_file_number are
              # delegated to the Files object through `handles`.
20             isa => 'Treex::Core::Files',
21             is => 'rw',
22             coerce => 1,
23             required => 1,
24             handles => [qw(current_filename file_number _set_file_number)],
25             documentation => 'arrayref of filenames to be loaded, '
26             . 'coerced from a space or comma separated list of filenames, '
27             . 'see POD for details',
28             );
29              
30             has file_stem => (
              # Optional read-only string. When it holds a true value, new_document uses
              # it as the document's file_stem instead of the stem derived from the
              # input path, and documents get a running file_number.
31             isa => 'Str',
32             is => 'ro',
33             documentation => 'how to name the loaded documents',
34             );
35              
36             has is_one_doc_per_file => (
              # Read-write boolean, true by default. While it is true,
              # number_of_documents reports the number of input files, and
              # new_document leaves file_number empty unless file_stem is set.
37             is => 'rw',
38             isa => 'Bool',
39             default => 1,
40             );
41              
42             has _file_numbers => ( is => 'rw', default => sub { {} } );    # per-instance hashref of counters, keyed by path-derived file stem; new_document uses it to number documents
43              
44             has _file_number_width => (
              # Read-write integer (default 3). new_document inserts it into the
              # zero-padded sprintf format "%0<width>d" used for file_number.
45             is => 'rw',
46             isa => 'Int',
47             default => 3,
48             documentation => 'The number of digits for numbered filenames. '
49             . 'The default (3) will create filenames with three digits as "001.treex.gz".'
50             );
51              
52             has skip_finished => (
              # Optional read-only string. BUILD prefixes it with 's' to form a
              # substitution, applies that to each input filename through a string eval,
              # and removes from `from` every input whose derived filename exists with
              # non-zero size.
              # NOTE(review): the value is executed as Perl code, so it must come from
              # a trusted source.
53             isa => 'Str',
54             is => 'ro',
55             documentation => 'Skip input files for which a matching non-empty output file exists '
56             . '(presumably created by a previous unfinished Treex run). '
57             . 'This parameter specifies a regex substitution how to derive the output filename from the input filename. '
58             . 'It is parallel to the parameter substitute={indir}{outdir} in writers. '
59             . 'However, you need to take care of filename extensions too, '
60             . 'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
61             );
62              
63              
64             sub BUILD {
              # Moose BUILD hook, run when the object is constructed.
              # When skip_finished is set, removes from `from` every input file whose
              # derived output file already exists with non-zero size, then logs how
              # many files were skipped. Calls log_fatal if the substitution built from
              # skip_finished cannot be evaluated. Returns nothing.
65             my ( $self, $args ) = @_;
66             if (my $regex = $self->skip_finished){
67             my $filenames_ref = $self->from->filenames;
68             my @filtered_filenames;
              # skip_finished holds the tail of a substitution, e.g. {in/(.+)}{out/$1};
              # prefixing 's' yields the complete s{...}{...} operator. The trailing
              # "1;" keeps a successful eval true even when nothing was substituted.
              # NOTE(review): this runs the parameter as Perl code, so skip_finished
              # must only ever come from a trusted source.
69             my $eval_string = '$filename =~ s' . $regex . '; 1;';
70              
71             for my $input_filename (@$filenames_ref){
72             my $filename = $input_filename;
73              
74             # see r14228 for an alternative implementation (without stringy eval) which cannot handle $1 in regex
              # Report $@ as well, otherwise a malformed skip_finished gives no hint about what is wrong.
75             eval $eval_string or log_fatal "Failed to eval $eval_string: $@"; ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
76              
              # -s is false for a missing or empty file, i.e. this input is not finished yet.
77             if (! -s $filename){
78             push @filtered_filenames, $input_filename;
79             #say "not finished: $input_filename -> $filename";
80             } #else {say "finished: $input_filename -> $filename";}
81             }
82             $self->from->_set_filenames(\@filtered_filenames);
83             my $input_number = @$filenames_ref;
84             my $filtered_number = @filtered_filenames;
85             my $finished_number = $input_number - $filtered_number;
86             log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";
87             }
88             return;
89             }
90              
91             sub next_filename {
              # Moves on to the next input file and returns its name (current_filename).
              # Returns nothing (undef in scalar context) and leaves file_number
              # unchanged once file_number has reached from->number_of_files.
92             my ($self) = @_;
93              
94             # return undef, but do not move further if we are at the end of document list (we might need the current file name)
95             return if ( $self->file_number >= $self->from->number_of_files );
96              
97             $self->_set_file_number( $self->file_number + 1 );
98             return $self->current_filename();
99             }
100              
101             use File::Spec;
102              
103             sub new_document {
               # Creates the Treex::Core::Document for the current input file.
               #   $load_from (optional) - passed to the constructor as `filename`;
               #     a name ending in .streex is loaded with retrieve_storable instead
               #     of new(), and a name ending in .gz switches on set_compress.
               # Calls log_fatal when current_filename is undefined.
               # Side effects: increments doc_number and, for numbered documents, the
               # per-stem counter in _file_numbers.
               # (doc_number/_set_doc_number are not defined in this file - presumably
               # they come from the DocumentReader role; confirm.)
               # Returns the document object.
104             my ( $self, $load_from ) = @_;
105             my $path = $self->current_filename();
106             log_fatal "next_filename() must be called before new_document()" if !defined $path;
107             my ( $volume, $dirs, $file ) = File::Spec->splitpath($path);
108              
109             # Delete file extension, e.g.
110             # file.01.conll -> file.01
111             # cs42.treex.gz -> cs42
112             $file =~ s/\.[^.]+(\.gz)?$//;
113              
114             # Substitute standard input for noname.
115             $file =~ s/^-$/noname/;
116              
117             my %args = ( file_stem => $file, loaded_from => $path );
               # NOTE(review): File::Spec->splitpath appears to return an empty string
               # rather than undef for a missing directory part, so this test may always
               # be true - confirm.
118             if ( defined $dirs ) {
119             $args{path} = $volume . $dirs;
120             }
121              
122             # Override the naming heuristics above, if file_stem was specified.
123             if ( $self->file_stem ) {
124             $args{file_stem} = $self->file_stem;
125             }
126              
               # One document per file and no explicit stem: no number is needed.
127             if ( $self->is_one_doc_per_file && !$self->file_stem ) {
128             $args{file_number} = q{};
129             }
130             else {
               # The counter is keyed by the stem derived from the path ($file), even
               # when file_stem overrides the document name; ++ on undef starts at 1.
131             my $num = $self->_file_numbers->{$file};
132             $self->_file_numbers->{$file} = ++$num;
133             my $fmt = "%0".$self->_file_number_width."d";
134             $args{file_number} = sprintf $fmt, $num;
135             }
136              
137             if ( defined $load_from ) {
138             $args{filename} = $load_from;
139             }
140              
141             $self->_set_doc_number( $self->doc_number + 1 );
142              
143             my $document;
               # A .streex file is restored from storage; %args is not used in that case.
144             if ( defined $load_from and $load_from =~ /\.streex$/ ) {
145             $document = Treex::Core::Document->retrieve_storable($load_from);
146             $document->set_storable(1);
147             }
148             else {
149             $document = Treex::Core::Document->new( \%args );
150             }
151              
152             if ( defined $load_from && $load_from =~ /\.gz$/ ) {
153             $document->set_compress(1);
154             }
155              
156             return $document;
157             }
158              
159             sub number_of_documents {
               # Returns from->number_of_files when is_one_doc_per_file is true,
               # otherwise undef.
160             my $self = shift;
161             return $self->is_one_doc_per_file ? $self->from->number_of_files : undef;
162             }
163              
164             after 'restart' => sub {
               # Method modifier: once `restart` (not defined in this file) has run,
               # set the file counter back to 0, so the following next_filename call
               # moves to file number 1 again.
165             my $self = shift;
166             $self->_set_file_number(0);
167             };
168              
169             1;
170              
171             __END__
172              
173             =pod
174              
175             =encoding utf-8
176              
177             =head1 NAME
178              
179             Treex::Block::Read::BaseReader - abstract ancestor for document readers
180              
181             =head1 VERSION
182              
183             version 2.20160630
184              
185             =head1 DESCRIPTION
186              
187             This class serves as a common ancestor for document readers
188             that have the parameter C<from> with a space or comma separated list of filenames
189             to be loaded.
190             It is designed to implement the L<Treex::Core::DocumentReader> interface.
191              
192             In derived classes you need to define the C<next_document> method,
193             and you can use C<next_filename> and C<new_document> methods.
194              
195             =head1 ATTRIBUTES
196              
197             =over
198              
199             =item from (required)
200              
201             space or comma separated list of filenames, or C<-> for STDIN
202              
203             An '@' directly in front of a file name causes this file to be interpreted as a file
204             list, with one file name per line, e.g. '@filelist.txt' causes the reader to open
205             'filelist.txt' and read a list of files from it. File lists may be arbitrarily
206             mixed with regular files in the parameter.
207              
208             Similarly, you can use I<!> for wildcard expansion, e.g.
209             C<treex -Len Read::Treex from='!dir??/file*.txt'>.
210             The single quotes are needed for two reasons.
211             First, to prevent bash from interpreting the wildcard characters.
212             Second, to prevent bash from interpreting the exclamation mark as history expansion.
213              
214             The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac.
215              
216             (If you set this attribute via the API, you can pass a reference to an array of filenames or a
217             L<Treex::Core::Files> object.)
218              
219             =item file_stem (optional)
220              
221             How to name the loaded documents.
222             This attribute will be saved to the same-named
223             attribute in documents and it will be used in document writers
224             to decide where to save the files.
225              
226             =back
227              
228             =head1 METHODS
229              
230             =over
231              
232             =item next_document
233              
234             This method must be overridden in derived classes.
235             (The implementation in this class just issues a fatal error.)
236              
237             =item next_filename
238              
239             returns the next filename (full path) to be loaded
240             (from the list specified in the attribute C<from>)
241              
242             =item new_document($load_from?)
243              
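244             Returns a new empty document with pre-filled attributes
245             C<loaded_from>, C<file_stem>, C<file_number> and C<path>,
246             which are guessed based on C<current_filename>. If the optional C<$load_from> is given, it is passed to the document constructor as C<filename>; a C<$load_from> ending in C<.streex> is instead loaded with C<retrieve_storable>.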
247              
248             =item current_filename
249              
250             returns the last filename returned by C<next_filename>
251              
252             =item is_next_document_for_this_job
253              
254             Is the document that will be returned by C<next_document>
255             supposed to be processed by this job?
256             This is relevant only in parallel processing,
257             where each job has a different C<$jobnumber> assigned.
258              
259             =item number_of_documents
260              
261             Returns the number of documents that will be read by this reader.
262             If C<is_one_doc_per_file> returns C<true>, then the number of documents
263             equals the number of files given in C<from>.
264             Otherwise, this method returns C<undef>.
265              
266             =back
267              
268             =head1 SEE
269              
270             L<Treex::Block::Read::BaseTextReader>
271             L<Treex::Block::Read::Text>
272              
273             =head1 AUTHOR
274              
275             Martin Popel <popel@ufal.mff.cuni.cz>
276              
277             =head1 COPYRIGHT AND LICENSE
278              
279             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
280              
281             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.