File Coverage

blib/lib/Treex/Block/Read/BaseTextReader.pm
Criterion Covered Total %
statement 9 24 37.5
branch 0 8 0.0
condition 0 3 0.0
subroutine 3 5 60.0
pod n/a
total 12 40 30.0


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseTextReader;
2             $Treex::Block::Read::BaseTextReader::VERSION = '2.20150928';
3 2     2   1573 use Moose;
  2         4  
  2         19  
4 2     2   11563 use Treex::Core::Common;
  2         4  
  2         18  
5             extends 'Treex::Block::Read::BaseReader';
6             #use File::Slurp 9999;
7 2     2   9808 use PerlIO::via::gzip;
  2         4  
  2         761  
8              
9             # By default read from STDIN
              # ('-' becomes the default of the 'from' attribute inherited from the parent class).
              # The methods listed in 'handles' are delegated to the 'from' object,
              # so they can be called directly on the reader.
10             has '+from' => (
11             default => '-',
12             handles => [qw(current_filename current_filehandle file_number _set_file_number next_filehandle)],
13             );
14              
               # lines_per_doc: upper bound on the number of lines next_document_text puts
               # into one document. Any true value switches off is_one_doc_per_file in BUILD;
               # the default 0 leaves that flag untouched.
15             has lines_per_doc => ( isa => 'Int', is => 'ro', default => 0 );
               # merge_files: not read anywhere in this file.
               # NOTE(review): presumably consumed by the parent class or the 'from' object - confirm.
16             has merge_files => ( isa => 'Bool', is => 'ro', default => 0 );
               # encoding: not read anywhere in this file.
               # NOTE(review): presumably applied where the input files are opened - confirm.
17             has encoding => ( isa => 'Str', is => 'ro', default => 'utf8' );
18              
19             sub BUILD {
               # Moose construction hook. When lines_per_doc is true (non-zero), the
               # is_one_doc_per_file flag is cleared, which makes next_document_text
               # collect lines instead of returning a whole file per call.
20 0     0     my ($self) = @_;
21 0 0         if ( $self->lines_per_doc ) {
22 0           $self->set_is_one_doc_per_file(0);
23             }
               # Bare return: the sub has no meaningful result of its own.
24 0           return;
25             }
26              
27             sub next_document_text {
               # Returns the text of the next document as a string.
               #  - If is_one_doc_per_file is true: whatever from->next_file_text() returns.
               #  - Otherwise: the concatenation of up to lines_per_doc values obtained
               #    from from->next_line().
               # Returns nothing (bare return) when no line could be read and
               # from->has_next_file() is false.
28 0     0     my ($self) = @_;
29 0 0         if ( $self->is_one_doc_per_file ) {
30 0           return $self->from->next_file_text();
31             }
32              
33 0           my $text = '';
34             LINE:
               # The range only counts iterations; $line is overwritten with the line
               # that was read as soon as each iteration starts.
               # NOTE(review): reusing the loop variable for the line content works but is
               # easy to misread - a separate lexical would be clearer.
35 0           for my $line ( 1 .. $self->lines_per_doc ) {
36 0           $line = $self->from->next_line();
37 0 0         if (!defined $line){
               # next_line() gave undef - presumably no more lines are available (confirm
               # against the 'from' implementation). Signal "no more documents" only if
               # nothing has been collected and no further file is pending; otherwise
               # stop and return what has been collected so far.
               # NOTE(review): when $text is still '' but has_next_file() is true, the
               # empty string is returned - confirm callers expect that.
38 0 0 0       return if $text eq '' && !$self->from->has_next_file();
39 0           last LINE;
40             }
41            
42 0           $text .= $line;
43             }
44 0           return $text;
45             }
46              
47             1;
48              
49             __END__
50              
51             =pod
52              
53             =encoding utf-8
54              
55             =for Pod::Coverage BUILD
56              
57             =head1 NAME
58              
59             Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
60              
61             =head1 VERSION
62              
63             version 2.20150928
64              
65             =head1 DESCRIPTION
66              
67             This class serves as a common ancestor for document readers
68             that have a C<from> parameter with a space- or comma-separated list of filenames
69             to be loaded and that load the documents from plain text files.
70             It is designed to implement the L<Treex::Core::DocumentReader> interface.
71              
72             In derived classes you need to define the C<next_document> method,
73             and you can use C<next_document_text> and C<new_document> methods.
74              
75             =head1 ATTRIBUTES
76              
77             =over
78              
79             =item language (required)
80              
81             =item lines_per_doc
82              
83             Set this to a positive number of lines to split one file into several documents.
84             The default is 0, which means don't split.
85              
86             =item merge_files
87              
88             Merge the content of all files (specified in the C<from> attribute) into one stream.
89             Useful in combination with C<lines_per_doc> to get equally-sized documents
90             even from non-equally-sized files.
91              
92             =item encoding
93              
94             The encoding of the input files, e.g. C<utf8> (the default), C<cp1250> etc.
95              
96             =back
97              
98             =head1 METHODS
99              
100             =over
101              
102             =item next_document_text
103              
104             Returns the text of the next document: the content of the next file (specified in the C<from> attribute) as a string or, if C<lines_per_doc> is set, at most that many lines.
105              
106             =item next_filehandle
107              
108             Helper method - you can use this instead of C<next_document_text>
109             if you don't want to load the whole text into memory
110             (but do e.g. SAX-like parsing).
111              
112             =back
113              
114             =head1 SEE
115              
116             L<Treex::Block::Read::BaseReader>
117             L<Treex::Block::Read::Text>
118              
119             =head1 AUTHOR
120              
121             Martin Popel
122              
123             =head1 COPYRIGHT AND LICENSE
124              
125             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
126              
127             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.