File Coverage

blib/lib/Treex/Block/Read/BaseAlignedReader.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseAlignedReader;
2             $Treex::Block::Read::BaseAlignedReader::VERSION = '2.20151102';
3 3     3   23424 use strict;
  3         6  
  3         85  
4 3     3   13 use warnings;
  3         7  
  3         86  
5 3     3   846 use Moose;
  3         469642  
  3         19  
6 3     3   20545 use Treex::Core::Common;
  3         591326  
  3         21  
7             with 'Treex::Core::DocumentReader';
8 3     3   19597 use Treex::Core::Document;
  0            
  0            
9              
10             sub next_document {
               # Placeholder for the reader-specific implementation: this base version
               # only reports a fatal error that names the class of the invoking object.
11             my ($self) = @_;
12             return log_fatal "method next_document must be overriden in " . ref($self);
13             }
14              
15             has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => '' );
               # selector (above): read-only, defaults to the empty string.
16              
               # file_stem: optional read-only string. When it is set, new_document()
               # uses it as the document's file_stem instead of deriving a name
               # from the input filenames.
17             has file_stem => (
18             isa => 'Str',
19             is => 'ro',
20             documentation => 'how to name the loaded documents',
21             );
22              
23             # private attributes
               # _filenames: hash keyed by zone label (the constructor argument name).
               # init_arg => undef means it cannot be passed to the constructor directly;
               # BUILD fills it in.
               # NOTE(review): the declared type is HashRef[Str], but BUILD stores
               # Treex::Core::Files objects as the values -- confirm whether the
               # constraint should be changed.
24             has _filenames => (
25             isa => 'HashRef[Str]',
26             is => 'rw',
27             init_arg => undef,
28             default => sub { {} },
29             documentation => 'mapping zone_label->filenames to be loaded;'
30             . ' automatically initialized from constructor arguments',
31             );
32              
               # _files_per_zone: number of files each zone must supply. It stays 0 until
               # BUILD sets it from the first zone it processes; number_of_documents()
               # returns this value.
33             has _files_per_zone => ( is => 'rw', default => 0 );
34              
               # _file_number: 1-based index of the current tuple of input files.
               # 0 means next_filenames() has not been called yet (or restart was run).
35             has _file_number => (
36             isa => 'Int',
37             is => 'rw',
38             default => 0,
39             init_arg => undef,
40             documentation => 'Number of n-tuples of input files loaded so far.',
41             );
42              
               # skip_finished: BUILD appends this string to 's' to form a substitution
               # and string-evals it, so the value must be in the {pattern}{replacement}
               # form shown in the documentation text below.
43             has skip_finished => (
44             isa => 'Str',
45             is => 'ro',
46             documentation => 'Skip input files for which a matching non-empty output file exists '
47             . '(presumably created by a previous unfinished Treex run). '
48             . 'This parameter specifies a regex substitution how to derive the output filename from the input filename. '
49             . 'It is parallel to the parameter substitute={indir}{outdir} in writers. '
50             . 'However, you need to take care of filename extensions too, '
51             . 'e.g. if converting conll to treex, you should use skip_finished={indir/(.+).conll$}{outdir/$1.treex.gz}',
52             );
53              
54             #BUILD is needed for processing generic arguments - now only shortcuts of type langcode_selector
55             sub BUILD {
               # Scans all constructor arguments, registers those whose name is a zone
               # label (language code, optionally followed by _selector) in _filenames,
               # and then optionally drops input files that already have a non-empty
               # output file (skip_finished). Returns nothing.
56             my ( $self, $args ) = @_;
57             foreach my $arg ( keys %{$args} ) {
               # Split the argument name into language and selector; only the first
               # two underscore-separated pieces are kept.
58             my ( $lang, $sele ) = ( $arg, '' );
59             if ( $arg =~ /_/ ) {
60             ( $lang, $sele ) = split /_/, $arg;
61             }
62             if ( is_lang_code($lang) ) {
63             my $files = Treex::Core::Files->new({string => $args->{$arg}});
               # The first zone seen fixes the expected number of files;
               # every later zone must supply exactly the same number.
64             if ( !$self->_files_per_zone ) {
65             $self->_set_files_per_zone( $files->number_of_files );
66             }
67             elsif ( $files->number_of_files != $self->_files_per_zone ) {
68             log_fatal('All zones must have the same number of files: ' . $files->number_of_files . ' != ' . $self->_files_per_zone);
69             }
70             $self->_filenames->{$arg} = $files;
71             }
               # Argument names containing selector, language or scenario are
               # skipped silently; anything else only triggers a warning.
72             elsif ( $arg =~ /selector|language|scenario/ ) { }
73             else { log_warn "$arg is not a zone label (e.g. en_src)"; }
74             }
75             if (my $regex = $self->skip_finished){
76             foreach my $zone (keys %{$self->_filenames}) {
77             my $filenames_ref = $self->_filenames->{$zone}->filenames;
78             my @filtered_filenames;
               # The substitution is assembled as source text ('s' followed by the
               # skip_finished value) and string-evaled once per file; the trailing
               # '1;' makes a successful eval return true even if nothing matched.
               # NOTE(review): this executes the parameter as Perl code, so it must
               # only ever come from a trusted scenario/command line.
79             my $eval_string = '$filename =~ s' . $regex . '; 1;';
80             for my $input_filename (@$filenames_ref){
               # $filename is the lexical that the evaled substitution rewrites
               # into the expected output filename.
81             my $filename = $input_filename;
82             eval $eval_string or log_fatal "Failed to eval $eval_string"; ## no critic qw(BuiltinFunctions::ProhibitStringyEval)
               # -s is false for a missing or zero-size file: such inputs are
               # treated as not finished and are kept.
83             if (! -s $filename){
84             push @filtered_filenames, $input_filename;
85             #say "not finished: $input_filename -> $filename";
86             } #else {say "finished: $input_filename -> $filename";}
87             }
               # NOTE(review): _files_per_zone is not updated after this filtering,
               # while current_filenames() still bounds its index by _files_per_zone.
               # Each zone is also filtered independently, so zones may end up with
               # different numbers of files -- confirm that this is handled.
88             $self->_filenames->{$zone} = Treex::Core::Files->new({filenames => \@filtered_filenames});
89             my $input_number = @$filenames_ref;
90             my $filtered_number = @filtered_filenames;
91             my $finished_number = $input_number - $filtered_number;
92             log_info "$finished_number files out of $input_number were finished, reading only the remaining $filtered_number.";
93             }
94             }
95             return;
96             }
97              
98             sub current_filenames {
               # Returns a hashref mapping each zone label to the filename at the current
               # position (_file_number is 1-based, hence the "- 1" below).
               # Returns nothing (bare return) when _file_number is 0 or greater than
               # _files_per_zone.
99             my ($self) = @_;
100             my $n = $self->_file_number;
101             return if $n == 0 || $n > $self->_files_per_zone;
102             my %result = map { $_ => $self->_filenames->{$_}->filenames->[ $n - 1 ] } keys %{ $self->_filenames };
103             return \%result;
104             }
105              
106             sub next_filenames {
                # Advances _file_number by one and returns whatever current_filenames()
                # yields for the new position (nothing once the position is past
                # _files_per_zone).
107             my ($self) = @_;
108             $self->_set_file_number( $self->_file_number + 1 );
109             return $self->current_filenames;
110             }
111              
112             sub new_document {
                # Creates a Treex::Core::Document for the current tuple of input files.
                # $load_from is optional; when defined it is passed on as 'filename'.
                # The document gets file_stem (the file_stem attribute if set, otherwise
                # a name guessed from the input filenames), loaded_from (comma-joined
                # input filenames) and, when defined, file_number and path.
                # Dies via log_fatal when there are no current filenames.
113             my ( $self, $load_from ) = @_;
                # current_filenames() returns nothing before next_filenames() is called;
                # fall back to an empty hashref so that the log_fatal below is reached
                # instead of a strict-refs error on dereferencing undef.
114             my %filenames = %{ $self->current_filenames() || {} };
115             log_fatal "next_filenames() must be called before new_document()" if !%filenames;
116              
117             my ( $stem, $file_number ) = ( '', '' );
118             my ( $volume, $dirs, $file );
119             if ( $self->file_stem ) {
120             ( $stem, $file_number ) = ( $self->file_stem, undef );
121             }
122             else { # Magical heuristics how to choose default name for a document loaded from several files
                # $volume and $dirs are overwritten on every iteration, so the path
                # passed to the document comes from whichever zone is iterated last.
123             foreach my $zone_label ( keys %filenames ) {
124             my $filename = $filenames{$zone_label};
125             ( $volume, $dirs, $file ) = File::Spec->splitpath($filename);
126              
127             # Delete file extension, e.g.
128             # file.01.conll -> file.01
129             # cs42.treex.gz -> cs42
130             $file =~ s/\.[^.]+(\.gz)?$//;
131              
132             # Substitute standard input for noname.
133             $file =~ s/^-$/noname/;
134              
135             # Heuristically delete indication of language&selector from the filename.
136             my ( $lang, $sele ) = ( $zone_label, '' );
137             if ( $zone_label =~ /_/ ) {
138             ( $lang, $sele ) = split /_/, $zone_label;
139             }
                # NOTE(review): when $sele is empty the alternation contains an empty
                # branch, so bare '_' and '-' characters are removed as well --
                # confirm that this is intended.
140             $file =~ s/[_-]?($lang|$sele|$zone_label)[_-]?//gi;
141             if ( !$file && !$stem ) {
142             $file = 'noname';
143             $file_number = undef;
144             }
                # Append this zone's name part only if the stem does not contain it yet.
                # \Q...\E makes this a literal substring test, so filenames containing
                # regex metacharacters can neither mis-match nor break the pattern.
145             if ( $stem !~ /\Q$file\E/ ) {
146             if ( $stem ne '' ) {
147             $stem .= '_';
148             }
149             $stem .= $file;
150             }
151             }
152             }
153              
                # doc_number and _set_doc_number are not defined in this file
                # (presumably provided by the Treex::Core::DocumentReader role).
154             $self->_set_doc_number( $self->doc_number + 1 );
155             return Treex::Core::Document->new(
156             {
157             file_stem => $stem,
158             loaded_from => join( ',', values %filenames ),
159             defined $file_number ? ( file_number => $file_number ) : (),
160             defined $dirs ? ( path => $volume . $dirs ) : (),
161             defined $load_from ? ( filename => $load_from ) : (),
162             }
163             );
164             }
165              
166             sub number_of_documents {
                # Returns _files_per_zone, i.e. the per-zone file count recorded by BUILD.
167             my $self = shift;
168             return $self->_files_per_zone;
169             }
170              
171             after 'restart' => sub {
                # Method modifier run after 'restart' (not defined in this file;
                # presumably supplied by the consumed role): rewinds the file position
                # so that next_filenames() starts again from the first tuple.
172             my $self = shift;
173             $self->_set_file_number(0);
174             };
175              
176             1;
177              
178             __END__
179              
180             =for Pod::Coverage BUILD
181              
182             =head1 NAME
183              
184             Treex::Block::Read::BaseAlignedReader - abstract ancestor for parallel-corpora document readers
185              
186             =head1 VERSION
187              
188             version 2.20151102
189              
190             =head1 SYNOPSIS
191              
192             # in scenarios
193             Read::MyAlignedFormat en=english.txt de=german.txt
194              
195             # Zones can differ also in selectors, any number of zones can be read
196             Read::MyAlignedFormat en_ref=ref1,ref2 en_moses=mos1,mos2 en_tectomt=tmt1,tmt2
197              
198             =head1 DESCRIPTION
199              
200             This class serves as a common ancestor for document readers
201             that read several zones at once -- usually parallel sentences in two (or more) languages.
202             The readers take parameters named after the zones; the value of each parameter
203             is a space- or comma-separated list of filenames to be loaded into the given zone.
204             The class is designed to implement the L<Treex::Core::DocumentReader> interface.
205              
206             In derived classes you need to define the C<next_document> method,
207             and you can use C<next_filenames> and C<new_document> methods.
208              
209             =head1 ATTRIBUTES
210              
211             =over
212              
213             =item any parameter in a form of a valid I<zone_label>
214              
215             Space- or comma-separated list of filenames, or C<-> for STDIN.
216              
217             =item file_stem (optional)
218              
219             How to name the loaded documents.
220             This attribute will be saved to the same-named
221             attribute in documents and it will be used in document writers
222             to decide where to save the files.
223              
224             =back
225              
226             =head1 METHODS
227              
228             =over
229              
230             =item next_document
231              
232             This method must be overridden in derived classes.
233             (The implementation in this class just issues a fatal error.)
234              
235             =item next_filenames
236              
237             Returns a hashref of filenames (full paths) to be loaded.
238             The keys of the hash are zone labels, the values are the filenames.
239              
240             =item new_document($load_from?)
241              
242             Returns a new empty document with pre-filled attributes
243             C<loaded_from>, C<file_stem>, C<file_number> and C<path>
244             which are guessed based on C<current_filenames>.
245              
246             =item current_filenames
247              
248             Returns the filenames most recently returned by C<next_filenames>.
249              
250             =item number_of_documents
251              
252             Returns the number of documents that will be read by this reader.
253              
254             =back
255              
256             =head1 SEE ALSO
257              
258             L<Treex::Block::Read::BaseReader>
259             L<Treex::Block::Read::BaseAlignedTextReader>
260              
261             =head1 AUTHOR
262              
263             Martin Popel
264              
265             =head1 COPYRIGHT AND LICENSE
266              
267             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
268              
269             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.