| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Treex::Block::Read::BaseAlignedReader; |
|
2
|
|
|
|
|
|
|
$Treex::Block::Read::BaseAlignedReader::VERSION = '0.13095'; |
|
3
|
1
|
|
|
1
|
|
15957
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
4
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
5
|
1
|
|
|
1
|
|
990
|
use Moose; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
use Treex::Core::Common; |
|
7
|
|
|
|
|
|
|
with 'Treex::Core::DocumentReader'; |
|
8
|
|
|
|
|
|
|
use Treex::Core::Document; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
sub next_document { |
|
11
|
|
|
|
|
|
|
my ($self) = @_; |
|
12
|
|
|
|
|
|
|
return log_fatal "method next_document must be overriden in " . ref($self); |
|
13
|
|
|
|
|
|
|
} |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
has selector => ( isa => 'Treex::Type::Selector', is => 'ro', default => '' ); |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
has file_stem => ( |
|
18
|
|
|
|
|
|
|
isa => 'Str', |
|
19
|
|
|
|
|
|
|
is => 'ro', |
|
20
|
|
|
|
|
|
|
documentation => 'how to name the loaded documents', |
|
21
|
|
|
|
|
|
|
); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# private attributes |
|
24
|
|
|
|
|
|
|
has _filenames => ( |
|
25
|
|
|
|
|
|
|
isa => 'HashRef[Str]', |
|
26
|
|
|
|
|
|
|
is => 'rw', |
|
27
|
|
|
|
|
|
|
init_arg => undef, |
|
28
|
|
|
|
|
|
|
default => sub { {} }, |
|
29
|
|
|
|
|
|
|
documentation => 'mapping zone_label->filenames to be loaded;' |
|
30
|
|
|
|
|
|
|
. ' automatically initialized from constructor arguments', |
|
31
|
|
|
|
|
|
|
); |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
has _files_per_zone => ( is => 'rw', default => 0 ); |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
has _file_number => ( |
|
36
|
|
|
|
|
|
|
isa => 'Int', |
|
37
|
|
|
|
|
|
|
is => 'rw', |
|
38
|
|
|
|
|
|
|
default => 0, |
|
39
|
|
|
|
|
|
|
init_arg => undef, |
|
40
|
|
|
|
|
|
|
documentation => 'Number of n-tuples of input files loaded so far.', |
|
41
|
|
|
|
|
|
|
); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
#BUILD is needed for processing generic arguments - now only shortcuts of type langcode_selector |
|
44
|
|
|
|
|
|
|
sub BUILD { |
|
45
|
|
|
|
|
|
|
my ( $self, $args ) = @_; |
|
46
|
|
|
|
|
|
|
foreach my $arg ( keys %{$args} ) { |
|
47
|
|
|
|
|
|
|
my ( $lang, $sele ) = ( $arg, '' ); |
|
48
|
|
|
|
|
|
|
if ( $arg =~ /_/ ) { |
|
49
|
|
|
|
|
|
|
( $lang, $sele ) = split /_/, $arg; |
|
50
|
|
|
|
|
|
|
} |
|
51
|
|
|
|
|
|
|
if ( is_lang_code($lang) ) { |
|
52
|
|
|
|
|
|
|
my $files = Treex::Core::Files->new({string => $args->{$arg}}); |
|
53
|
|
|
|
|
|
|
if ( !$self->_files_per_zone ) { |
|
54
|
|
|
|
|
|
|
$self->_set_files_per_zone( $files->number_of_files ); |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
elsif ( $files->number_of_files != $self->_files_per_zone ) { |
|
57
|
|
|
|
|
|
|
log_fatal('All zones must have the same number of files: ' . $files->number_of_files . ' != ' . $self->_files_per_zone); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
$self->_filenames->{$arg} = $files; |
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
elsif ( $arg =~ /selector|language|scenario/ ) { } |
|
62
|
|
|
|
|
|
|
else { log_warn "$arg is not a zone label (e.g. en_src)"; } |
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
return; |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
sub current_filenames { |
|
68
|
|
|
|
|
|
|
my ($self) = @_; |
|
69
|
|
|
|
|
|
|
my $n = $self->_file_number; |
|
70
|
|
|
|
|
|
|
return if $n == 0 || $n > $self->_files_per_zone; |
|
71
|
|
|
|
|
|
|
my %result = map { $_ => $self->_filenames->{$_}->filenames->[ $n - 1 ] } keys %{ $self->_filenames }; |
|
72
|
|
|
|
|
|
|
return \%result; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub next_filenames { |
|
76
|
|
|
|
|
|
|
my ($self) = @_; |
|
77
|
|
|
|
|
|
|
$self->_set_file_number( $self->_file_number + 1 ); |
|
78
|
|
|
|
|
|
|
return $self->current_filenames; |
|
79
|
|
|
|
|
|
|
} |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub new_document { |
|
82
|
|
|
|
|
|
|
my ( $self, $load_from ) = @_; |
|
83
|
|
|
|
|
|
|
my %filenames = %{$self->current_filenames()}; |
|
84
|
|
|
|
|
|
|
log_fatal "next_filenames() must be called before new_document()" if !%filenames; |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
my ( $stem, $file_number ) = ( '', '' ); |
|
87
|
|
|
|
|
|
|
my ( $volume, $dirs, $file ); |
|
88
|
|
|
|
|
|
|
if ( $self->file_stem ) { |
|
89
|
|
|
|
|
|
|
( $stem, $file_number ) = ( $self->file_stem, undef ); |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
else { # Magical heuristics how to choose default name for a document loaded from several files |
|
92
|
|
|
|
|
|
|
foreach my $zone_label ( keys %filenames ) { |
|
93
|
|
|
|
|
|
|
my $filename = $filenames{$zone_label}; |
|
94
|
|
|
|
|
|
|
( $volume, $dirs, $file ) = File::Spec->splitpath($filename); |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Delete file extension, e.g. |
|
97
|
|
|
|
|
|
|
# file.01.conll -> file.01 |
|
98
|
|
|
|
|
|
|
# cs42.treex.gz -> cs42 |
|
99
|
|
|
|
|
|
|
$file =~ s/\.[^.]+(\.gz)?$//; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Substitute standard input for noname. |
|
102
|
|
|
|
|
|
|
$file =~ s/^-$/noname/; |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# Heuristically delete indication of language&selector from the filename. |
|
105
|
|
|
|
|
|
|
my ( $lang, $sele ) = ( $zone_label, '' ); |
|
106
|
|
|
|
|
|
|
if ( $zone_label =~ /_/ ) { |
|
107
|
|
|
|
|
|
|
( $lang, $sele ) = split /_/, $zone_label; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
$file =~ s/[_-]?($lang|$sele|$zone_label)[_-]?//gi; |
|
110
|
|
|
|
|
|
|
if ( !$file && !$stem ) { |
|
111
|
|
|
|
|
|
|
$file = 'noname'; |
|
112
|
|
|
|
|
|
|
$file_number = undef; |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
if ( $stem !~ /$file/ ) { |
|
115
|
|
|
|
|
|
|
if ( $stem ne '' ) { |
|
116
|
|
|
|
|
|
|
$stem .= '_'; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
$stem .= $file; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
$self->_set_doc_number( $self->doc_number + 1 ); |
|
124
|
|
|
|
|
|
|
return Treex::Core::Document->new( |
|
125
|
|
|
|
|
|
|
{ |
|
126
|
|
|
|
|
|
|
file_stem => $stem, |
|
127
|
|
|
|
|
|
|
loaded_from => join( ',', values %filenames ), |
|
128
|
|
|
|
|
|
|
defined $file_number ? ( file_number => $file_number ) : (), |
|
129
|
|
|
|
|
|
|
defined $dirs ? ( path => $volume . $dirs ) : (), |
|
130
|
|
|
|
|
|
|
defined $load_from ? ( filename => $load_from ) : (), |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
); |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub number_of_documents { |
|
136
|
|
|
|
|
|
|
my $self = shift; |
|
137
|
|
|
|
|
|
|
return $self->_files_per_zone; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
after 'restart' => sub { |
|
141
|
|
|
|
|
|
|
my $self = shift; |
|
142
|
|
|
|
|
|
|
$self->_set_file_number(0); |
|
143
|
|
|
|
|
|
|
}; |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
1; |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
__END__ |