File Coverage

blib/lib/Treex/Block/Read/BaseTextReader.pm

Criterion	Covered	Total	%
statement	9	24	37.5
branch	0	8	0.0
condition	0	3	0.0
subroutine	3	5	60.0
pod			n/a
total	12	40	30.0

line	stmt	bran	cond	sub	time	code
1						package Treex::Block::Read::BaseTextReader;
2						$Treex::Block::Read::BaseTextReader::VERSION = '2.20150928';
3	2			2	1573	use Moose;
	2				4
	2				19
4	2			2	11563	use Treex::Core::Common;
	2				4
	2				18
5						extends 'Treex::Block::Read::BaseReader';
6						#use File::Slurp 9999;
7	2			2	9808	use PerlIO::via::gzip;
	2				4
	2				761
8
9						# By default read from STDIN
10						has '+from' => (
11						default => '-',
12						handles => [qw(current_filename current_filehandle file_number _set_file_number next_filehandle)],
13						);
14
15						has lines_per_doc => ( isa => 'Int', is => 'ro', default => 0 );
16						has merge_files => ( isa => 'Bool', is => 'ro', default => 0 );
17						has encoding => ( isa => 'Str', is => 'ro', default => 'utf8' );
18
19						sub BUILD {
20	0			0		my ($self) = @_;
21	0	0				if ( $self->lines_per_doc ) {
22	0					$self->set_is_one_doc_per_file(0);
23						}
24	0					return;
25						}
26
27						sub next_document_text {
28	0			0		my ($self) = @_;
29	0	0				if ( $self->is_one_doc_per_file ) {
30	0					return $self->from->next_file_text();
31						}
32
33	0					my $text = '';
34						LINE:
35	0					for my $line ( 1 .. $self->lines_per_doc ) {
36	0					$line = $self->from->next_line();
37	0	0				if (!defined $line){
38	0	0	0			return if $text eq '' && !$self->from->has_next_file();
39	0					last LINE;
40						}
41
42	0					$text .= $line;
43						}
44	0					return $text;
45						}
46
47						1;
48
49						__END__
50
51						=pod
52
53						=encoding utf-8
54
55						=for Pod::Coverage BUILD
56
57						=head1 NAME
58
59						Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
60
61						=head1 VERSION
62
63						version 2.20150928
64
65						=head1 DESCRIPTION
66
67						This class serves as a common ancestor for document readers
68						that have a C<from> parameter with a space- or comma-separated list of filenames
69						to be loaded and that load the documents from plain text files.
70						It is designed to implement the L<Treex::Core::DocumentReader> interface.
71
72						In derived classes you need to define the C<next_document> method,
73						and you can use C<next_document_text> and C<new_document> methods.
74
75						=head1 ATTRIBUTES
76
77						=over
78
79						=item language (required)
80
81						=item lines_per_doc
82
83						Use this to split one file into several documents, each with at most this many lines.
84						The default is 0, which means don't split.
85
86						=item merge_files
87
88						Merge the content of all files (specified in the C<from> attribute) into one stream.
89						Useful in combination with C<lines_per_doc> to get equally-sized documents
90						even from non-equally-sized files.
91
92						=item encoding
93
94						The encoding of the input files, e.g. C<utf8> (the default), C<cp1250> etc.
95
96						=back
97
98						=head1 METHODS
99
100						=over
101
102						=item next_document_text
103
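104						Returns the text of the next document as a string: the whole content of the next file (specified in the C<from> attribute), or up to C<lines_per_doc> lines when documents are not one per file.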
105
106						=item next_filehandle
107
108						Helper method - you can use this instead of C<next_document_text>
109						if you don't want to load the whole text into memory
110						(but do e.g. SAX-like parsing).
111
112						=back
113
114						=head1 SEE
115
116						L<Treex::Block::Read::BaseReader>
117						L<Treex::Block::Read::Text>
118
119						=head1 AUTHOR
120
121						Martin Popel
122
123						=head1 COPYRIGHT AND LICENSE
124
125						Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
126
127						This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.