line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Block::Read::BaseReader; |
2
|
|
|
|
|
|
|
$Treex::Block::Read::BaseReader::VERSION = '2.20160630'; |
3
|
3
|
|
|
3
|
|
41165
|
use Moose; |
|
3
|
|
|
|
|
451780
|
|
|
3
|
|
|
|
|
22
|
|
4
|
3
|
|
|
3
|
|
22011
|
use Treex::Core::Common; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
20
|
|
5
|
3
|
|
|
3
|
|
17333
|
use File::Slurp; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
248
|
|
6
|
|
|
|
|
|
|
with 'Treex::Core::DocumentReader'; |
7
|
3
|
|
|
3
|
|
1999
|
use Treex::Core::Document; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Abstract method: every concrete reader has to provide its own
# next_document(). Reaching this implementation is a programming error,
# so it only reports the offending class via log_fatal.
sub next_document {
    my $self  = shift;
    my $class = ref $self;
    return log_fatal "method next_document must be overridden in " . $class;
}
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Zone defaults: the selector is empty and the language code is "und"
# (undetermined/unknown) unless the caller says otherwise.
has selector => (
    is      => 'ro',
    isa     => 'Treex::Type::Selector',
    default => '',
);

has language => (
    is      => 'ro',
    isa     => 'Treex::Type::LangCode',
    default => 'und',
);

# The input files. The listed methods are delegated to the
# Treex::Core::Files object, so they can be called directly on the reader.
has from => (
    is            => 'rw',
    isa           => 'Treex::Core::Files',
    required      => 1,
    coerce        => 1,
    handles       => [ 'current_filename', 'file_number', '_set_file_number' ],
    documentation => join(
        q{},
        'arrayref of filenames to be loaded, ',
        'coerced from a space or comma separated list of filenames, ',
        'see POD for details',
    ),
);

# Optional override of the file stem that new_document() would otherwise
# derive from the input filename.
has file_stem => (
    is            => 'ro',
    isa           => 'Str',
    documentation => 'how to name the loaded documents',
);

# True (the default) when each input file yields exactly one document;
# consulted by new_document() and number_of_documents().
has is_one_doc_per_file => (
    isa     => 'Bool',
    is      => 'rw',
    default => 1,
);

# Per-stem counters used by new_document() to number the documents.
has _file_numbers => (
    is      => 'rw',
    default => sub { return {} },
);

has _file_number_width => (
    isa           => 'Int',
    is            => 'rw',
    default       => 3,
    documentation => join(
        q{},
        'The number of digits for numbered filenames. ',
        'The default (3) will create filenames with three digits as "001.treex.gz".',
    ),
);

# Substitution (the part following "s" in s{...}{...}) mapping an input
# filename to the expected output filename; evaluated in BUILD.
has skip_finished => (
    is            => 'ro',
    isa           => 'Str',
    documentation => join(
        q{},
        'Skip input files for which a matching non-empty output file exists ',
        '(presumably created by a previous unfinished Treex run). ',
        'This parameter specifies a regex substitution how to derive the output filename from the input filename. ',
        'It is parallel to the parameter substitute={indir}{outdir} in writers. ',
        'However, you need to take care of filename extensions too, ',
        'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
    ),
);
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Moose construction hook.
#
# When the skip_finished parameter is set, it is interpreted as the tail of
# a substitution (e.g. "{indir/(.+).conll$}{outdir/$1.treex.gz}") that maps
# each input filename to the filename of its expected output. Input files
# whose derived output file already exists and is non-empty are removed
# from the list of files to read, and a summary is logged.
#
# Dies (via log_fatal) if the substitution cannot be compiled or applied;
# the message includes Perl's own error text so the user can see what is
# wrong with the supplied pattern.
sub BUILD {
    my ( $self, $args ) = @_;

    my $regex = $self->skip_finished;
    return if !$regex;

    # A stringy eval is required so that the replacement part may use
    # capture variables; see r14228 for an alternative implementation
    # (without stringy eval) which cannot handle $1 in regex.
    # NOTE(review): this executes the skip_finished parameter as Perl code,
    # so the parameter must never come from an untrusted source.
    # The code is compiled only once (into a closure) instead of being
    # re-parsed for every input file.
    my $eval_string = 'sub { my ($filename) = @_; $filename =~ s' . $regex . '; return $filename; }';
    my $derive_output_filename = eval $eval_string;    ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
    if ( !$derive_output_filename ) {
        log_fatal "Failed to eval $eval_string: $@";
    }

    my $filenames_ref = $self->from->filenames;
    my @filtered_filenames;

    for my $input_filename (@$filenames_ref) {

        # Run-time failures (e.g. a dying /e replacement) are reported
        # in the same way as compile-time ones.
        my $output_filename = eval { $derive_output_filename->($input_filename) };
        if ( !defined $output_filename ) {
            log_fatal "Failed to apply skip_finished=$regex to $input_filename: $@";
        }

        # Keep only the inputs whose output is missing or empty.
        if ( !-s $output_filename ) {
            push @filtered_filenames, $input_filename;
        }
    }

    $self->from->_set_filenames( \@filtered_filenames );

    my $input_number    = @$filenames_ref;
    my $filtered_number = @filtered_filenames;
    my $finished_number = $input_number - $filtered_number;
    log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";

    return;
}
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# Advance to the next input file and return its name.
# Once the list of files is exhausted, nothing is returned and the file
# counter is left untouched, so that current_filename() keeps reporting
# the last file (callers might still need it).
sub next_filename {
    my $self = shift;

    my $position = $self->file_number;
    my $total    = $self->from->number_of_files;

    if ( $position < $total ) {
        $self->_set_file_number( $position + 1 );
        return $self->current_filename();
    }

    return;
}
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
use File::Spec; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Create the document object for the file last selected by next_filename().
#
# Parameters:
#   $load_from - optional filename; if it ends with ".streex" the document
#                is retrieved via Treex::Core::Document->retrieve_storable,
#                otherwise it is passed to the constructor as "filename".
# Returns the Treex::Core::Document instance.
# Dies (via log_fatal) if there is no current filename yet.
# Side effect: increments doc_number (even before the document is built).
sub new_document {
    my ( $self, $load_from ) = @_;

    my $source_path = $self->current_filename();
    if ( !defined $source_path ) {
        log_fatal "next_filename() must be called before new_document()";
    }

    my ( $volume, $directories, $basename ) = File::Spec->splitpath($source_path);

    # Drop the last extension (plus a trailing .gz, if any), e.g.
    #   file.01.conll -> file.01
    #   cs42.treex.gz -> cs42
    $basename =~ s/\.[^.]+(\.gz)?$//;

    # A lone "-" stands for standard input; call such documents "noname".
    $basename =~ s/^-$/noname/;

    # An explicitly given file_stem wins over the name guessed above.
    my $explicit_stem = $self->file_stem;

    my %doc_args = (
        loaded_from => $source_path,
        file_stem   => ( $explicit_stem ? $explicit_stem : $basename ),
    );

    # NOTE(review): File::Spec->splitpath seems to always return a defined
    # (possibly empty) directory part, so this guard may always hold - confirm.
    $doc_args{path} = $volume . $directories if defined $directories;

    if ( $self->is_one_doc_per_file && !$explicit_stem ) {

        # One document per file and no forced stem: no numbering needed.
        $doc_args{file_number} = q{};
    }
    else {

        # Number the documents; the counters are kept per guessed basename.
        # NOTE(review): the key is the basename derived from the input file
        # even when file_stem overrides the stem - confirm this is intended.
        my $sequence = ++$self->_file_numbers->{$basename};
        $doc_args{file_number} = sprintf '%0' . $self->_file_number_width . 'd', $sequence;
    }

    $doc_args{filename} = $load_from if defined $load_from;

    $self->_set_doc_number( $self->doc_number + 1 );

    my $is_storable = defined $load_from && $load_from =~ /\.streex$/;
    my $document;
    if ($is_storable) {

        # Storable dumps carry their own attributes; %doc_args is not used.
        $document = Treex::Core::Document->retrieve_storable($load_from);
        $document->set_storable(1);
    }
    else {
        $document = Treex::Core::Document->new( \%doc_args );
    }

    # Remember that the source was gzipped.
    my $is_gzipped = defined $load_from && $load_from =~ /\.gz$/;
    $document->set_compress(1) if $is_gzipped;

    return $document;
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Number of documents this reader is going to produce: the number of input
# files when each file holds exactly one document, undef otherwise.
sub number_of_documents {
    my ($self) = @_;

    if ( $self->is_one_doc_per_file ) {
        return $self->from->number_of_files;
    }

    # An explicit undef is returned on purpose (also in list context).
    return undef;    ## no critic qw(Subroutines::ProhibitExplicitReturnUndef)
}
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# Whenever the reader is restarted, rewind the file counter as well so
# that reading begins again with the first input file.
after 'restart' => sub {
    my ($self) = @_;
    $self->_set_file_number(0);
};
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
1; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
__END__ |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=pod |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=encoding utf-8 |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 NAME |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Treex::Block::Read::BaseReader - abstract ancestor for document readers |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 VERSION |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
version 2.20160630 |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head1 DESCRIPTION |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
This class serves as a common ancestor for document readers |
188
|
|
|
|
|
|
|
that have the parameter C<from> with a space or comma separated list of filenames |
189
|
|
|
|
|
|
|
to be loaded. |
190
|
|
|
|
|
|
|
It is designed to implement the L<Treex::Core::DocumentReader> interface. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
In derived classes you need to define the C<next_document> method, |
193
|
|
|
|
|
|
|
and you can use C<next_filename> and C<new_document> methods. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=over |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=item from (required) |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
space or comma separated list of filenames, or C<-> for STDIN |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
An '@' directly in front of a file name causes this file to be interpreted as a file |
204
|
|
|
|
|
|
|
list, with one file name per line, e.g. '@filelist.txt' causes the reader to open |
205
|
|
|
|
|
|
|
'filelist.txt' and read a list of files from it. File lists may be arbitrarily |
206
|
|
|
|
|
|
|
mixed with regular files in the parameter. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Similarly, you can use I<!> for wildcard expansion, e.g. |
209
|
|
|
|
|
|
|
C<treex -Len Read::Treex from='!dir??/file*.txt'>. |
210
|
|
|
|
|
|
|
The single quotes are needed for two reasons. |
211
|
|
|
|
|
|
|
First, to prevent bash from interpreting the wildcard characters. |
212
|
|
|
|
|
|
|
Second, to prevent bash from interpreting the exclamation mark as history expansion. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
The I<@filelist> and I<!wildcard> conventions are used in several tools, e.g. 7z or javac. |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
(If you use this method via API you can specify a string array reference or a |
217
|
|
|
|
|
|
|
L<Treex::Core::Files> object.) |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=item file_stem (optional) |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
How to name the loaded documents. |
222
|
|
|
|
|
|
|
This attribute will be saved to the same-named |
223
|
|
|
|
|
|
|
attribute in documents and it will be used in document writers |
224
|
|
|
|
|
|
|
to decide where to save the files. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=back |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head1 METHODS |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=over |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=item next_document |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
This method must be overridden in derived classes. |
235
|
|
|
|
|
|
|
(The implementation in this class just issues a fatal error.) |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=item next_filename |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
returns the next filename (full path) to be loaded |
240
|
|
|
|
|
|
|
(from the list specified in the attribute C<from>) |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=item new_document($load_from?) |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
Returns a new empty document with pre-filled attributes |
245
|
|
|
|
|
|
|
C<loaded_from>, C<file_stem>, C<file_number> and C<path> |
246
|
|
|
|
|
|
|
which are guessed based on C<current_filename>. |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=item current_filename |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
returns the last filename returned by C<next_filename> |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item is_next_document_for_this_job |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
Is the document that will be returned by C<next_document> |
255
|
|
|
|
|
|
|
supposed to be processed by this job? |
256
|
|
|
|
|
|
|
This is relevant only in parallel processing, |
257
|
|
|
|
|
|
|
where each job has a different C<$jobnumber> assigned. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=item number_of_documents |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Returns the number of documents that will be read by this reader. |
262
|
|
|
|
|
|
|
If C<is_one_doc_per_file> returns C<true>, then the number of documents |
263
|
|
|
|
|
|
|
equals the number of files given in C<from>. |
264
|
|
|
|
|
|
|
Otherwise, this method returns C<undef>. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=back |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
=head1 SEE |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
L<Treex::Block::Read::BaseTextReader> |
271
|
|
|
|
|
|
|
L<Treex::Block::Read::Text> |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head1 AUTHOR |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |