line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Block::Read::BaseAlignedReader; |
2
|
|
|
|
|
|
|
$Treex::Block::Read::BaseAlignedReader::VERSION = '2.20151102'; |
3
|
3
|
|
|
3
|
|
23424
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
85
|
|
4
|
3
|
|
|
3
|
|
13
|
use warnings; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
86
|
|
5
|
3
|
|
|
3
|
|
846
|
use Moose; |
|
3
|
|
|
|
|
469642
|
|
|
3
|
|
|
|
|
19
|
|
6
|
3
|
|
|
3
|
|
20545
|
use Treex::Core::Common; |
|
3
|
|
|
|
|
591326
|
|
|
3
|
|
|
|
|
21
|
|
7
|
|
|
|
|
|
|
with 'Treex::Core::DocumentReader'; |
8
|
3
|
|
|
3
|
|
19597
|
use Treex::Core::Document; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Placeholder for the method every concrete reader has to supply.
# This base implementation does no reading at all: it only raises a fatal
# error whose message names the class of the object it was called on.
sub next_document {
    my $self  = shift;
    my $class = ref $self;
    return log_fatal "method next_document must be overriden in $class";
}
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Read-only selector (type Treex::Type::Selector); the empty string by default.
has selector => (
    is      => 'ro',
    isa     => 'Treex::Type::Selector',
    default => '',
);

# Optional, read-only name (stem) to give to the loaded documents.
# When it is not set, new_document() derives a stem from the input filenames.
has file_stem => (
    is            => 'ro',
    isa           => 'Str',
    documentation => 'how to name the loaded documents',
);
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# private attributes

# Per-zone file lists, keyed by the constructor argument name (zone label).
# It cannot be passed to the constructor (init_arg => undef); BUILD fills it.
# NOTE(review): BUILD stores Treex::Core::Files objects as the hash values,
# which does not agree with the declared 'HashRef[Str]' -- the constraint is
# never triggered because the hash is only modified in place. Confirm the
# intended type.
has _filenames => (
    is            => 'rw',
    isa           => 'HashRef[Str]',
    init_arg      => undef,
    default       => sub { return {}; },
    documentation => 'mapping zone_label->filenames to be loaded;'
        . ' automatically initialized from constructor arguments',
);

# Number of files listed for each zone; stays 0 until BUILD sees a zone.
has _files_per_zone => (
    is      => 'rw',
    default => 0,
);

# 1-based index of the current tuple of input files (0 = nothing read yet).
has _file_number => (
    is            => 'rw',
    isa           => 'Int',
    init_arg      => undef,
    default       => 0,
    documentation => 'Number of n-tuples of input files loaded so far.',
);
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Optional substitution, written as {pattern}{replacement}, that maps an input
# filename to the output filename expected for it. BUILD uses it to drop input
# files whose derived output file already exists and is non-empty.
has skip_finished => (
    is            => 'ro',
    isa           => 'Str',
    documentation => join(
        q{},
        'Skip input files for which a matching non-empty output file exists ',
        '(presumably created by a previous unfinished Treex run). ',
        'This parameter specifies a regex substitution how to derive the output filename from the input filename. ',
        'It is parallel to the parameter substitute={indir}{outdir} in writers. ',
        'However, you need to take care of filename extensions too, ',
        'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
    ),
);
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# BUILD processes the generic constructor arguments -- currently only the
# shortcuts named after a zone label (language code, optionally followed by
# "_selector", e.g. en or en_src). The value of such an argument is the list
# of files to load into that zone.
#
# Steps:
#   1. Each zone-label argument is turned into a Treex::Core::Files object and
#      stored in _filenames. All zones must list the same number of files;
#      that number is remembered in _files_per_zone.
#   2. Arguments that are neither zone labels nor known parameters produce a
#      warning. Attributes of the reader itself (e.g. file_stem or
#      skip_finished) are recognised through the metaclass and are not
#      reported as bad zone labels.
#   3. If skip_finished is set, every input file whose derived output file
#      already exists and is non-empty is removed from its zone, and
#      _files_per_zone is updated so that number_of_documents and
#      current_filenames agree with the shortened lists.
#
# Parameters: $self, and $args -- the hashref of constructor arguments.
# Returns nothing. Calls log_fatal when the zones disagree on the number of
# files (before or after filtering) or when the substitution cannot be
# evaluated.
sub BUILD {
    my ( $self, $args ) = @_;

    foreach my $arg ( keys %{$args} ) {

        # Only the part before the first underscore decides whether the
        # argument names a zone.
        my $lang = $arg;
        if ( $arg =~ /_/ ) {
            ($lang) = split /_/, $arg;
        }

        if ( is_lang_code($lang) ) {
            my $files = Treex::Core::Files->new( { string => $args->{$arg} } );
            if ( !$self->_files_per_zone ) {
                $self->_set_files_per_zone( $files->number_of_files );
            }
            elsif ( $files->number_of_files != $self->_files_per_zone ) {
                log_fatal( 'All zones must have the same number of files: ' . $files->number_of_files . ' != ' . $self->_files_per_zone );
            }
            $self->_filenames->{$arg} = $files;
        }
        elsif ( $arg =~ /selector|language|scenario/ || $self->meta->find_attribute_by_name($arg) ) {

            # A known non-zone parameter: nothing to do and nothing to report.
        }
        else {
            log_warn "$arg is not a zone label (e.g. en_src)";
        }
    }

    my $regex = $self->skip_finished;
    return if !$regex;

    # SECURITY NOTE(review): the user-supplied {pattern}{replacement} is spliced
    # into Perl code and run with a string eval, so it can execute arbitrary
    # code. That is how this parameter is designed to work, but it must never
    # be fed from an untrusted source.
    # The code string does not depend on the zone, so it is built only once.
    my $eval_string = '$filename =~ s' . $regex . '; 1;';

    my %remaining_in;    # zone label => number of files still to be read
    foreach my $zone ( keys %{ $self->_filenames } ) {
        my $filenames_ref = $self->_filenames->{$zone}->filenames;
        my @filtered_filenames;
        for my $input_filename ( @{$filenames_ref} ) {

            # The evaluated code rewrites the lexical $filename in place.
            my $filename = $input_filename;
            eval $eval_string or log_fatal "Failed to eval $eval_string";    ## no critic qw(BuiltinFunctions::ProhibitStringyEval)

            # Keep the input unless its output exists with non-zero size.
            if ( !-s $filename ) {
                push @filtered_filenames, $input_filename;
            }
        }
        $self->_filenames->{$zone} = Treex::Core::Files->new( { filenames => \@filtered_filenames } );

        my $input_number    = @{$filenames_ref};
        my $filtered_number = @filtered_filenames;
        my $finished_number = $input_number - $filtered_number;
        log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";
        $remaining_in{$zone} = $filtered_number;
    }

    # The zones are read in parallel by position, so after filtering they
    # must still have equally long lists; otherwise files would be paired
    # wrongly or be missing for some zone.
    my ( $first_count, @other_counts ) = values %remaining_in;
    if ( grep { $_ != $first_count } @other_counts ) {
        log_fatal( 'After applying skip_finished, all zones must have the same number of files: '
                . join( ', ', map {"$_=$remaining_in{$_}"} sort keys %remaining_in ) );
    }
    if ( defined $first_count ) {
        $self->_set_files_per_zone($first_count);
    }
    return;
}
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# Returns a hashref { zone_label => filename } for the tuple of files selected
# by _file_number (a 1-based position in each zone's list).
# Returns nothing (undef in scalar context) while _file_number is still 0 or
# once it exceeds _files_per_zone.
sub current_filenames {
    my $self  = shift;
    my $index = $self->_file_number;
    if ( $index == 0 || $index > $self->_files_per_zone ) {
        return;
    }

    my %current;
    for my $zone ( keys %{ $self->_filenames } ) {
        $current{$zone} = $self->_filenames->{$zone}->filenames->[ $index - 1 ];
    }
    return \%current;
}
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Moves on to the next tuple of input files: increments _file_number by one
# and returns whatever current_filenames returns for the new position.
sub next_filenames {
    my $self     = shift;
    my $advanced = $self->_file_number + 1;
    $self->_set_file_number($advanced);
    return $self->current_filenames;
}
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# Creates a new Treex::Core::Document for the current tuple of input files.
#
# Parameters:
#   $load_from - optional; when defined it is passed to the document
#                constructor as "filename".
#
# The document is given:
#   file_stem   - the reader's file_stem attribute if set, otherwise a name
#                 derived heuristically from the input filenames (see below);
#   loaded_from - the input filenames joined by commas;
#   file_number - passed only when defined (it is undef when file_stem is set
#                 or when the fallback name "noname" had to be used);
#   path        - volume and directory of the last processed input file
#                 (only when the stem was derived from the filenames);
#   filename    - $load_from, if defined.
#
# Zones are processed in sorted order of their labels so that the derived
# stem, loaded_from and path do not depend on Perl's hash ordering.
#
# Calls log_fatal if no current tuple of files exists, i.e. next_filenames()
# has not been called yet or the input is exhausted. Also increments the
# reader's doc_number.
sub new_document {
    my ( $self, $load_from ) = @_;

    # current_filenames() returns nothing when there is no current tuple;
    # check that before dereferencing so the intended message is reported
    # instead of a "not a HASH reference" error.
    my $current = $self->current_filenames();
    log_fatal "next_filenames() must be called before new_document()" if !$current || !%{$current};
    my %filenames   = %{$current};
    my @zone_labels = sort keys %filenames;

    my ( $stem, $file_number ) = ( '', '' );
    my ( $volume, $dirs, $file );
    if ( $self->file_stem ) {
        ( $stem, $file_number ) = ( $self->file_stem, undef );
    }
    else {    # Magical heuristics how to choose default name for a document loaded from several files
        foreach my $zone_label (@zone_labels) {
            my $filename = $filenames{$zone_label};
            ( $volume, $dirs, $file ) = File::Spec->splitpath($filename);

            # Delete file extension, e.g.
            # file.01.conll -> file.01
            # cs42.treex.gz -> cs42
            $file =~ s/\.[^.]+(\.gz)?$//;

            # Substitute standard input for noname.
            $file =~ s/^-$/noname/;

            # Heuristically delete indication of language&selector from the filename.
            # NOTE(review): when the zone label has no selector, $sele is empty
            # and the alternation below contains an empty branch, so every "_"
            # and "-" is removed from the name as well. Kept as is because
            # existing document names depend on it -- confirm whether intended.
            my ( $lang, $sele ) = ( $zone_label, '' );
            if ( $zone_label =~ /_/ ) {
                ( $lang, $sele ) = split /_/, $zone_label;
            }
            $file =~ s/[_-]?($lang|$sele|$zone_label)[_-]?//gi;
            if ( !$file && !$stem ) {
                $file        = 'noname';
                $file_number = undef;
            }

            # Append this zone's name unless the stem already contains it.
            # A literal substring test is used on purpose: the name may hold
            # regex metacharacters, and an empty name used as a pattern would
            # make Perl reuse the last successfully matched regex.
            if ( index( $stem, $file ) < 0 ) {
                if ( $stem ne '' ) {
                    $stem .= '_';
                }
                $stem .= $file;
            }
        }
    }

    $self->_set_doc_number( $self->doc_number + 1 );
    return Treex::Core::Document->new(
        {
            file_stem   => $stem,
            loaded_from => join( ',', @filenames{@zone_labels} ),
            defined $file_number ? ( file_number => $file_number )   : (),
            defined $dirs        ? ( path        => $volume . $dirs ) : (),
            defined $load_from   ? ( filename    => $load_from )      : (),
        }
    );
}
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Reports how many documents this reader yields, which is the value of
# _files_per_zone (one document per tuple of aligned input files).
sub number_of_documents {
    my ($self) = @_;
    return $self->_files_per_zone;
}
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Method modifier: once restart() has run, rewind the tuple counter to 0 so
# that the next call of next_filenames() starts again with the first files.
# (restart itself is not defined in this file; presumably it comes from the
# consumed Treex::Core::DocumentReader role -- confirm.)
after restart => sub {
    my ($self) = @_;
    $self->_set_file_number(0);
};
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
1; |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
__END__ |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=for Pod::Coverage BUILD |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head1 NAME |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Treex::Block::Read::BaseAlignedReader - abstract ancestor for parallel-corpora document readers |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 VERSION |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
version 2.20151102 |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head1 SYNOPSIS |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# in scenarios |
193
|
|
|
|
|
|
|
Read::MyAlignedFormat en=english.txt de=german.txt |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Zones can differ also in selectors, any number of zones can be read |
196
|
|
|
|
|
|
|
Read::MyAlignedFormat en_ref=ref1,ref2 en_moses=mos1,mos2 en_tectomt=tmt1,tmt2 |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head1 DESCRIPTION |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
This class serves as a common ancestor for document readers |
201
|
|
|
|
|
|
|
that read more zones at once -- usually parallel sentences in two (or more) languages. |
202
|
|
|
|
|
|
|
The readers take parameters named as the zones and values of the parameters |
203
|
|
|
|
|
|
|
are space- or comma-separated lists of filenames to be loaded into the given zone. |
204
|
|
|
|
|
|
|
The class is designed to implement the L<Treex::Core::DocumentReader> interface. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
In derived classes you need to define the C<next_document> method, |
207
|
|
|
|
|
|
|
and you can use C<next_filenames> and C<new_document> methods. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=over |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=item any parameter in a form of a valid I<zone_label> |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
space or comma separated list of filenames, or C<-> for STDIN. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=item file_stem (optional) |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
How to name the loaded documents. |
220
|
|
|
|
|
|
|
This attribute will be saved to the same-named |
221
|
|
|
|
|
|
|
attribute in documents and it will be used in document writers |
222
|
|
|
|
|
|
|
to decide where to save the files. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=back |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=head1 METHODS |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=over |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=item next_document |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
This method must be overridden in derived classes. |
233
|
|
|
|
|
|
|
(The implementation in this class just issues a fatal error.) |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item next_filenames |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
Returns a hashref of filenames (full paths) to be loaded. |
238
|
|
|
|
|
|
|
The keys of the hash are zone labels, the values are the filenames. |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=item new_document($load_from?) |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
Returns a new empty document with pre-filled attributes |
243
|
|
|
|
|
|
|
C<loaded_from>, C<file_stem>, C<file_number> and C<path> |
244
|
|
|
|
|
|
|
which are guessed based on C<current_filenames>. |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item current_filenames |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Returns the filenames most recently returned by C<next_filenames>. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item number_of_documents |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Returns the number of documents that will be read by this reader. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=back |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head1 SEE ALSO |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
L<Treex::Block::Read::BaseReader> |
259
|
|
|
|
|
|
|
L<Treex::Block::Read::BaseAlignedTextReader> |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head1 AUTHOR |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Martin Popel |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |