File Coverage

blib/lib/Treex/Block/Read/BaseReader.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Treex::Block::Read::BaseReader;
2				$Treex::Block::Read::BaseReader::VERSION = '2.20150928';
3	3	3	71468	use Moose;
	3		427356
	3		20
4	3	3	18735	use Treex::Core::Common;
	3		8
	3		26
5	3	3	15250	use File::Slurp;
	3		7
	3		306
6				with 'Treex::Core::DocumentReader';
7	3	3	2310	use Treex::Core::Document;
	0
	0
8
9				sub next_document {
				# Abstract placeholder: this base implementation only calls log_fatal,
				# naming the concrete class (ref($self)) that did not override the method.
10				my ($self) = @_;
11				return log_fatal "method next_document must be overridden in " . ref($self);
12				}
13
14				# Default language is und (undetermined/unknown) and selector empty.
15				has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => q{} );
16				has language => ( isa => 'Treex::Type::LangCode', is => 'ro', default => 'und' );
17
18
				# Required list of input files. With coerce enabled, values other than a
				# Treex::Core::Files object are converted by the type's coercion.
				# current_filename, file_number and _set_file_number are delegated to it.
19				has from => (
20				isa => 'Treex::Core::Files',
21				is => 'rw',
22				coerce => 1,
23				required => 1,
24				handles => [qw(current_filename file_number _set_file_number)],
25				documentation => 'arrayref of filenames to be loaded, '
26				. 'coerced from a space or comma separated list of filenames, '
27				. 'see POD for details',
28				);
29
				# Optional explicit document name. When set to a true value, new_document
				# uses it instead of the stem derived from the input file name.
30				has file_stem => (
31				isa => 'Str',
32				is => 'ro',
33				documentation => 'how to name the loaded documents',
34				);
35
				# True by default. number_of_documents then reports the number of input
				# files, and new_document leaves file_number empty unless file_stem is set.
36				has is_one_doc_per_file => (
37				is => 'rw',
38				isa => 'Bool',
39				default => 1,
40				);
41
				# Counters used by new_document to number documents, keyed by the
				# extension-stripped input file name. Each instance starts with a fresh empty hash.
42				has _file_numbers => ( is => 'rw', default => sub { {} } );
43
				# Width used by new_document when zero-padding the document number.
44				has _file_number_width => (
45				is => 'rw',
46				isa => 'Int',
47				default => 3,
48				documentation => 'The number of digits for numbered filenames. '
49				. 'The default (3) will create filenames with three digits as "001.treex.gz".'
50				);
51
				# Read once in BUILD, where it is used to filter the input file list.
52				has skip_finished => (
53				isa => 'Str',
54				is => 'ro',
55				documentation => 'Skip input files for which a matching non-empty output file exists '
56				. '(presumably created by a previous unfinished Treex run). '
57				. 'This parameter specifies a regex substitution how to derive the output filename from the input filename. '
58				. 'It is parallel to the parameter substitute={indir}{outdir} in writers. '
59				. 'However, you need to take care of filename extensions too, '
60				. 'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
61				);
63
64				sub BUILD {
				# Moose construction hook. When skip_finished is set, removes from the
				# input list every file whose derived output file already exists and is
				# non-empty, then logs how many files were skipped.
				# NOTE(review): the skip_finished value is spliced into code executed by a
				# string eval below, so it must only ever come from a trusted source.
65				my ( $self, $args ) = @_;
66				if (my $regex = $self->skip_finished){
67				my $filenames_ref = $self->from->filenames;
68				my @filtered_filenames;
				# $regex supplies the "{pattern}{replacement}" part of an s{}{} substitution.
				# The trailing "1;" makes the eval return true whenever the code compiles
				# and runs, even if the substitution matched nothing.
69				my $eval_string = '$filename =~ s' . $regex . '; 1;';
70
71				for my $input_filename (@$filenames_ref){
				# Work on a copy: the eval rewrites $filename into the expected output name.
72				my $filename = $input_filename;
73
74				# see r14228 for an alternative implementation (without stringy eval) which cannot handle $1 in regex
75				eval $eval_string or log_fatal "Failed to eval $eval_string"; ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
76
				# -s is false for a missing or empty file, i.e. this input is not finished yet.
77				if (! -s $filename){
78				push @filtered_filenames, $input_filename;
79				#say "not finished: $input_filename -> $filename";
80				} #else {say "finished: $input_filename -> $filename";}
81				}
				# Replace the input list with the unfinished files only, then report the counts.
82				$self->from->_set_filenames(\@filtered_filenames);
83				my $input_number = @$filenames_ref;
84				my $filtered_number = @filtered_filenames;
85				my $finished_number = $input_number - $filtered_number;
86				log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";
87				}
88				return;
89				}
90
91				sub next_filename {
				# Advances the file counter by one and returns the file name that is then
				# current. Returns nothing (without advancing) once file_number has reached
				# the number of files in "from".
92				my ($self) = @_;
93
94				# return undef, but do not move further if we are at the end of document list (we might need the current file name)
95				return if ( $self->file_number >= $self->from->number_of_files );
96
97				$self->_set_file_number( $self->file_number + 1 );
98				return $self->current_filename();
99				}
100
101				use File::Spec;
102
103				sub new_document {
				# Creates a Treex::Core::Document for the current input file.
				# $load_from (optional) is passed to the document constructor as "filename";
				# a name ending in .streex is instead loaded with retrieve_storable, and a
				# name ending in .gz marks the resulting document as compressed.
				# Dies via log_fatal when there is no current filename yet.
104				my ( $self, $load_from ) = @_;
105				my $path = $self->current_filename();
106				log_fatal "next_filename() must be called before new_document()" if !defined $path;
107				my ( $volume, $dirs, $file ) = File::Spec->splitpath($path);
108
109				# Delete file extension, e.g.
110				# file.01.conll -> file.01
111				# cs42.treex.gz -> cs42
112				$file =~ s/\.[^.]+(\.gz)?$//;
113
114				# Substitute standard input for noname.
115				$file =~ s/^-$/noname/;
116
117				my %args = ( file_stem => $file, loaded_from => $path );
				# NOTE(review): File::Spec->splitpath appears to return an empty string
				# rather than undef when there is no directory part, so this check is
				# presumably always true - confirm.
118				if ( defined $dirs ) {
119				$args{path} = $volume . $dirs;
120				}
121
122				# Override the naming heuristics above, if file_stem was specified.
123				if ( $self->file_stem ) {
124				$args{file_stem} = $self->file_stem;
125				}
126
				# One document per file and no explicit stem: no numbering is needed.
127				if ( $self->is_one_doc_per_file && !$self->file_stem ) {
128				$args{file_number} = q{};
129				}
130				else {
				# Otherwise number documents per input file name: the counter starts
				# undefined, so the first document gets 1; the number is zero-padded
				# to _file_number_width digits.
131				my $num = $self->_file_numbers->{$file};
132				$self->_file_numbers->{$file} = ++$num;
133				my $fmt = "%0".$self->_file_number_width."d";
134				$args{file_number} = sprintf $fmt, $num;
135				}
136
137				if ( defined $load_from ) {
138				$args{filename} = $load_from;
139				}
140
				# doc_number and _set_doc_number are not defined in this file; presumably
				# they come from the Treex::Core::DocumentReader role - confirm.
141				$self->_set_doc_number( $self->doc_number + 1 );
142
143				my $document;
				# Note that %args is not used in the .streex branch.
144				if ( defined $load_from and $load_from =~ /\.streex$/ ) {
145				$document = Treex::Core::Document->retrieve_storable($load_from);
146				$document->set_storable(1);
147				}
148				else {
149				$document = Treex::Core::Document->new( \%args );
150				}
151
152				if ( defined $load_from && $load_from =~ /\.gz$/ ) {
153				$document->set_compress(1);
154				}
155
156				return $document;
157				}
158
159				sub number_of_documents {
				# Returns the number of input files when each file holds exactly one
				# document; otherwise the count cannot be known in advance and undef is returned.
160				my $self = shift;
161				return $self->is_one_doc_per_file ? $self->from->number_of_files : undef;
162				}
163
164				after 'restart' => sub {
				# Method modifier: once restart() has run, reset the file counter to 0.
				# restart itself is not defined in this file (presumably provided by the
				# Treex::Core::DocumentReader role - confirm).
165				my $self = shift;
166				$self->_set_file_number(0);
167				};
168
169				1;
170
171				__END__
172
173				=pod
174
175				=encoding utf-8
176
177				=head1 NAME
178
179				Treex::Block::Read::BaseReader - abstract ancestor for document readers
180
181				=head1 VERSION
182
183				version 2.20150928
184
185				=head1 DESCRIPTION
186
187				This class serves as a common ancestor for document readers
188				that have the parameter C<from> with a space or comma separated list of filenames
189				to be loaded.
190				It is designed to implement the L<Treex::Core::DocumentReader> interface.
191
192				In derived classes you need to define the C<next_document> method,
193				and you can use C<next_filename> and C<new_document> methods.
194
195				=head1 ATTRIBUTES
196
197				=over
198
199				=item from (required)
200
201				space or comma separated list of filenames, or C<-> for STDIN
202
203				An '@' directly in front of a file name causes this file to be interpreted as a file
204				list, with one file name per line, e.g. '@filelist.txt' causes the reader to open
205				'filelist.txt' and read a list of files from it. File lists may be arbitrarily
206				mixed with regular files in the parameter.
207
208				Similarly, you can use I<!> for wildcard expansion, e.g.
209				C<treex -Len Read::Treex from='!dir??/file*.txt'>.
210				The single quotes are needed for two reasons.
211				First, to prevent bash from interpreting the wildcard characters.
212				Second, to prevent bash from interpreting the exclamation mark as history expansion.
213
214				The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac.
215
216				(If you set this attribute via the API, you can pass a reference to an array of strings or a
217				L<Treex::Core::Files> object.)
218
219				=item file_stem (optional)
220
221				How to name the loaded documents.
222				This attribute will be saved to the same-named
223				attribute in documents and it will be used in document writers
224				to decide where to save the files.
225
226				=back
227
228				=head1 METHODS
229
230				=over
231
232				=item next_document
233
234				This method must be overridden in derived classes.
235				(The implementation in this class just issues a fatal error.)
236
237				=item next_filename
238
239				returns the next filename (full path) to be loaded
240				(from the list specified in the attribute C<from>)
241
242				=item new_document($load_from?)
243
244				Returns a new empty document with pre-filled attributes
245				C<loaded_from>, C<file_stem>, C<file_number> and C<path>
246				which are guessed based on C<current_filename>.
247
248				=item current_filename
249
250				returns the last filename returned by C<next_filename>
251
252				=item is_next_document_for_this_job
253
254				Is the document that will be returned by C<next_document>
255				supposed to be processed by this job?
256				This is relevant only in parallel processing,
257				where each job has a different C<$jobnumber> assigned.
258
259				=item number_of_documents
260
261				Returns the number of documents that will be read by this reader.
262				If C<is_one_doc_per_file> returns C<true>, then the number of documents
263				equals the number of files given in C<from>.
264				Otherwise, this method returns C<undef>.
265
266				=back
267
268				=head1 SEE
269
270				L<Treex::Block::Read::BaseTextReader>
271				L<Treex::Block::Read::Text>
272
273				=head1 AUTHOR
274
275				Martin Popel <popel@ufal.mff.cuni.cz>
276
277				=head1 COPYRIGHT AND LICENSE
278
279				Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
280
281				This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.