line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Core::DocumentReader; |
2
|
|
|
|
|
|
|
$Treex::Core::DocumentReader::VERSION = '2.20210102'; |
3
|
3
|
|
|
3
|
|
2459
|
use Moose::Role; |
|
3
|
|
|
|
|
11
|
|
|
3
|
|
|
|
|
33
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# with Moose >= 2.00, this must be present also in roles |
6
|
3
|
|
|
3
|
|
13790
|
use MooseX::SemiAffordanceAccessor 0.09; |
|
3
|
|
|
|
|
98
|
|
|
3
|
|
|
|
|
32
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
requires 'next_document'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
requires 'number_of_documents'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Attributes used for distributed (parallel) processing.
# TODO: check jobs >= jobindex > 0

# Total number of parallel jobs; unset when not running in parallel.
has jobs => (
    isa           => 'Int',
    is            => 'rw',
    documentation => 'number of jobs for parallel processing',
);

# Ordinal number of this job among the parallel jobs.
has jobindex => (
    isa           => 'Int',
    is            => 'rw',
    documentation => 'ordinal number of the current job in parallel processing',
);

# Directory passed to _redirect_output in next_document_for_this_job.
# TODO: this should not be needed in future
has outdir => (
    isa => 'Str',
    is  => 'rw',
);

# Counter of loaded documents. It cannot be passed to the constructor
# (init_arg => undef) and is changed only through the private writer.
has doc_number => (
    is            => 'ro',
    isa           => 'Int',
    init_arg      => undef,
    default       => 0,
    writer        => '_set_doc_number',
    documentation => 'Number of documents loaded so far, i.e.'
        . ' the ordinal number of the current (most recently loaded) document.',
);

# When set, next_document_for_this_job asks this object for the next file name.
has consumer => (
    is  => 'rw',
    isa => 'Treex::Block::Read::ConsumerReader',
);
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Load and return the next document that this job should process.
#
# When a consumer is set, the next file name is requested from it first and
# the reader's file list is overridden with it; an empty answer from the
# consumer ends the reading (nothing is returned).
# When both a document and a true jobindex are available, the output is
# redirected via Treex::Core::Parallel::Node::_redirect_output.
sub next_document_for_this_job {
    my ($self) = @_;

    # In parallel execution, the file name is sent from the head to the
    # workers via TCP and only one doc per file is allowed. We can therefore
    # override the file list to contain just the file to be processed and set
    # the file_number counter to 0, i.e. just before that file (another file
    # name arrives and the counter is reset again on the next call).
    #
    # doc_number is set to the number of processed files minus 1 because it
    # will be increased in next_document().
    #
    # This is an ugly hack (next_filename and _set_file_number are defined
    # only in BaseReader and BaseAlignedReader), but the code must live here
    # in next_document_for_this_job because the method next_filename may be
    # overridden or may not be used at all (e.g., BaseTextReader delegates
    # its functionality to Treex::Core::Files).
    if ( my $consumer = $self->consumer ) {
        my $res = $consumer->call("next_filename");

        # Martin Majliš had the following for BaseAlignedReader
        # but I see no reason for it.
        # elsif ($self->_files_per_zone){
        #     $self->_set_file_number($self->_files_per_zone + 2);
        # }
        return if !$res;

        $self->_set_file_number(0);
        $self->_set_doc_number( $res->{file_number} - 1 );

        # $res->{result} contains the next file name for plain readers,
        # a hashref (zone -> file name) for aligned readers.
        my $next = $res->{result};
        if ( ref($next) ne 'HASH' ) {
            $self->from->_set_filenames( [$next] );
        }
        else {

            # Here we assume that all zones exist in _filenames
            # (they should since all arguments are passed on to jobs).
            while ( my ( $zone, $filename ) = each %{$next} ) {
                $self->_filenames->{$zone}->_set_filenames( [$filename] );
            }
        }
    }

    my $doc = $self->next_document();

    # TODO this is not very elegant
    # and it is also wrong, because if next_document issues some warnings,
    # these are printed into a wrong file.
    # However, I don't know how to get the correct doc_number before
    # executing next_document.
    # Regarding perlcritic ProtectPrivateSubs:
    # I consider _redirect_output as internal for Treex::Core modules.
    # print STDERR "DOC: " . $doc . " : " . $self->doc_number . ", JOB: " . $self->jobindex . "\n";
    return $doc if !( $doc && $self->jobindex );

    Treex::Core::Parallel::Node::_redirect_output( $self->outdir, $self->doc_number, $self->jobindex );    ## no critic (ProtectPrivateSubs)
    return $doc;
}
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Number of documents this job is expected to produce.
#
# Returns nothing when number_of_documents() is false (unknown or zero),
# the full total when "jobs" is not set, and otherwise the share of this
# job: total divided by jobs, rounded down, plus one extra document when
# the remainder is greater than or equal to jobindex.
sub number_of_documents_per_this_job {
    my ($self) = @_;

    my $total = $self->number_of_documents();
    return if !$total;

    my $jobs = $self->jobs;
    return $total if !$jobs;

    my $rest = $total % $jobs;
    my $div  = ( $total - $rest ) / $jobs;

    # The leftover documents are counted for the jobs whose index does not
    # exceed the remainder.
    return $rest >= $self->jobindex ? $div + 1 : $div;
}
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# Start reading again from the first document: resets doc_number to zero.
# Returns nothing.
sub restart {
    my $self = shift;

    $self->_set_doc_number(0);

    return;
}
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# All blocks are expected to implement this method, but readers do not
# extend Treex::Core::Block and usually need no share files,
# so this default reports an empty list.
sub get_required_share_files {
    my $self = shift;

    return ();
}
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
1; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
__END__ |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=encoding utf-8 |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 NAME |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Treex::Core::DocumentReader - interface for all document readers |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head1 VERSION |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
version 2.20210102 |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head1 DESCRIPTION |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Document readers are the Treex mechanism for loading documents to be processed by Treex. |
149
|
|
|
|
|
|
|
The documents can be stored in files (in various formats) or read from C<STDIN> |
150
|
|
|
|
|
|
|
or retrieved from a socket etc. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=head1 METHODS |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=head2 To be implemented |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
These methods must be implemented in classes that consume this role. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=over |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item next_document |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Return next document (L<Treex::Core::Document>). |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item number_of_documents |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Total number of documents that will be produced by this reader. |
167
|
|
|
|
|
|
|
If the number is unknown in advance, C<undef> should be returned. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=back |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head2 Already implemented |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=over |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item is_current_document_for_this_job |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Is the document that was most recently returned by C<< $self->next_document() >> |
178
|
|
|
|
|
|
|
supposed to be processed by this job? |
179
|
|
|
|
|
|
|
Job indices and document numbers are 1-based, so e.g. for |
180
|
|
|
|
|
|
|
C<jobs = 5, jobindex = 3> we want to load documents with numbers 3,8,13,18,... |
181
|
|
|
|
|
|
|
C<jobs = 5, jobindex = 5> we want to load documents with numbers 5,10,15,20,... |
182
|
|
|
|
|
|
|
i.e. those documents where C<(doc_number-1) % jobs == (jobindex-1)>. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item next_document_for_this_job |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Returns the next document that should be processed by this job. |
187
|
|
|
|
|
|
|
If C<jobindex> is set, returns "modulo number of jobs". |
188
|
|
|
|
|
|
|
See C<is_current_document_for_this_job>. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item number_of_documents_per_this_job |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Total number of documents that will be produced by this reader for this job. |
193
|
|
|
|
|
|
|
It's computed based on C<number_of_documents>, C<jobindex> and C<jobs>. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item restart |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Start reading again from the first document. |
198
|
|
|
|
|
|
|
This implementation just sets the attribute C<doc_number> to zero. |
199
|
|
|
|
|
|
|
You can add additional behavior using the Moose C<after 'restart'> construct. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=back |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 SEE ALSO |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
L<Treex::Block::Read::Sentences> |
206
|
|
|
|
|
|
|
L<Treex::Block::Read::Text> |
207
|
|
|
|
|
|
|
L<Treex::Block::Read::Treex> |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 AUTHOR |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Martin Popel <popel@ufal.mff.cuni.cz> |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |