File Coverage

blib/lib/Treex/Block/Read/BaseAlignedReader.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseAlignedReader;
2             $Treex::Block::Read::BaseAlignedReader::VERSION = '0.13095';
3 1     1   15957 use strict;
  1         2  
  1         37  
4 1     1   3 use warnings;
  1         2  
  1         24  
5 1     1   990 use Moose;
  0            
  0            
6             use Treex::Core::Common;
7             with 'Treex::Core::DocumentReader';
8             use Treex::Core::Document;
9              
10             sub next_document {
11             my ($self) = @_;
12             return log_fatal "method next_document must be overriden in " . ref($self);
13             }
14              
15             has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => '' );
16              
17             has file_stem => (
18             isa => 'Str',
19             is => 'ro',
20             documentation => 'how to name the loaded documents',
21             );
22              
23             # private attributes
24             has _filenames => (
25             isa => 'HashRef[Str]',
26             is => 'rw',
27             init_arg => undef,
28             default => sub { {} },
29             documentation => 'mapping zone_label->filenames to be loaded;'
30             . ' automatically initialized from constructor arguments',
31             );
32              
33             has _files_per_zone => ( is => 'rw', default => 0 );
34              
35             has _file_number => (
36             isa => 'Int',
37             is => 'rw',
38             default => 0,
39             init_arg => undef,
40             documentation => 'Number of n-tuples of input files loaded so far.',
41             );
42              
43             #BUILD is needed for processing generic arguments - now only shortcuts of type langcode_selector
44             sub BUILD {
45             my ( $self, $args ) = @_;
46             foreach my $arg ( keys %{$args} ) {
47             my ( $lang, $sele ) = ( $arg, '' );
48             if ( $arg =~ /_/ ) {
49             ( $lang, $sele ) = split /_/, $arg;
50             }
51             if ( is_lang_code($lang) ) {
52             my $files = Treex::Core::Files->new({string => $args->{$arg}});
53             if ( !$self->_files_per_zone ) {
54             $self->_set_files_per_zone( $files->number_of_files );
55             }
56             elsif ( $files->number_of_files != $self->_files_per_zone ) {
57             log_fatal('All zones must have the same number of files: ' . $files->number_of_files . ' != ' . $self->_files_per_zone);
58             }
59             $self->_filenames->{$arg} = $files;
60             }
61             elsif ( $arg =~ /selector|language|scenario/ ) { }
62             else { log_warn "$arg is not a zone label (e.g. en_src)"; }
63             }
64             return;
65             }
66              
67             sub current_filenames {
68             my ($self) = @_;
69             my $n = $self->_file_number;
70             return if $n == 0 || $n > $self->_files_per_zone;
71             my %result = map { $_ => $self->_filenames->{$_}->filenames->[ $n - 1 ] } keys %{ $self->_filenames };
72             return \%result;
73             }
74              
75             sub next_filenames {
76             my ($self) = @_;
77             $self->_set_file_number( $self->_file_number + 1 );
78             return $self->current_filenames;
79             }
80              
81             sub new_document {
82             my ( $self, $load_from ) = @_;
83             my %filenames = %{$self->current_filenames()};
84             log_fatal "next_filenames() must be called before new_document()" if !%filenames;
85              
86             my ( $stem, $file_number ) = ( '', '' );
87             my ( $volume, $dirs, $file );
88             if ( $self->file_stem ) {
89             ( $stem, $file_number ) = ( $self->file_stem, undef );
90             }
91             else { # Magical heuristics how to choose default name for a document loaded from several files
92             foreach my $zone_label ( keys %filenames ) {
93             my $filename = $filenames{$zone_label};
94             ( $volume, $dirs, $file ) = File::Spec->splitpath($filename);
95              
96             # Delete file extension, e.g.
97             # file.01.conll -> file.01
98             # cs42.treex.gz -> cs42
99             $file =~ s/\.[^.]+(\.gz)?$//;
100              
101             # Substitute standard input for noname.
102             $file =~ s/^-$/noname/;
103              
104             # Heuristically delete indication of language&selector from the filename.
105             my ( $lang, $sele ) = ( $zone_label, '' );
106             if ( $zone_label =~ /_/ ) {
107             ( $lang, $sele ) = split /_/, $zone_label;
108             }
109             $file =~ s/[_-]?($lang|$sele|$zone_label)[_-]?//gi;
110             if ( !$file && !$stem ) {
111             $file = 'noname';
112             $file_number = undef;
113             }
114             if ( $stem !~ /$file/ ) {
115             if ( $stem ne '' ) {
116             $stem .= '_';
117             }
118             $stem .= $file;
119             }
120             }
121             }
122              
123             $self->_set_doc_number( $self->doc_number + 1 );
124             return Treex::Core::Document->new(
125             {
126             file_stem => $stem,
127             loaded_from => join( ',', values %filenames ),
128             defined $file_number ? ( file_number => $file_number ) : (),
129             defined $dirs ? ( path => $volume . $dirs ) : (),
130             defined $load_from ? ( filename => $load_from ) : (),
131             }
132             );
133             }
134              
135             sub number_of_documents {
136             my $self = shift;
137             return $self->_files_per_zone;
138             }
139              
140             after 'restart' => sub {
141             my $self = shift;
142             $self->_set_file_number(0);
143             };
144              
145             1;
146              
147             __END__