File Coverage

blib/lib/Treex/Block/Read/BaseTextReader.pm

Criterion	Covered	Total	%
statement	16	26	61.5
branch	3	10	30.0
condition	0	3	0.0
subroutine	5	5	100.0
pod	1	2	50.0
total	25	46	54.3

line	stmt	bran	cond	sub	pod	time	code
1							package Treex::Block::Read::BaseTextReader;
2							$Treex::Block::Read::BaseTextReader::VERSION = '2.20210102';
3	2			2		1275	use Moose;
	2					5
	2					14
4	2			2		13562	use Treex::Core::Common;
	2					13
	2					15
5							extends 'Treex::Block::Read::BaseReader';
6							#use File::Slurp 9999;
7	2			2		11612	use PerlIO::via::gzip;
	2					7
	2					859
8
9							# By default read from STDIN
10							has '+from' => (
11							default => '-',
12							handles => [qw(current_filename current_filehandle file_number _set_file_number next_filehandle)],
13							);
14
15							has lines_per_doc => ( isa => 'Int', is => 'ro', default => 0 );
16							has merge_files => ( isa => 'Bool', is => 'ro', default => 0 );
17							has encoding => ( isa => 'Str', is => 'ro', default => 'utf8' );
18
19							sub BUILD {
20	8			8	0	96	my ($self) = @_;
21	8	50				288	if ( $self->lines_per_doc ) {
22	0					0	$self->set_is_one_doc_per_file(0);
23							}
24	8	50				298	if ($self->encoding ne 'utf8'){
25	0					0	$self->from->set_encoding($self->encoding);
26							}
27	8					29	return;
28							}
29
30							sub next_document_text {
31	2			2	1	5	my ($self) = @_;
32	2	50				107	if ( $self->is_one_doc_per_file ) {
33	2					67	return $self->from->next_file_text();
34							}
35
36	0						my $text = '';
37							LINE:
38	0						for my $line ( 1 .. $self->lines_per_doc ) {
39	0						$line = $self->from->next_line();
40	0	0					if (!defined $line){
41	0	0	0				return if $text eq '' && !$self->from->has_next_file();
42	0						last LINE;
43							}
44
45	0						$text .= $line;
46							}
47	0						return $text;
48							}
49
50							1;
51
52							__END__
53
54							=pod
55
56							=encoding utf-8
57
58							=for Pod::Coverage BUILD
59
60							=head1 NAME
61
62							Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
63
64							=head1 VERSION
65
66							version 2.20210102
67
68							=head1 DESCRIPTION
69
70							This class serves as a common ancestor for document readers
71							that have a C<from> parameter with a space- or comma-separated list of filenames
72							to be loaded, and that load the documents from plain text files.
73							It is designed to implement the L<Treex::Core::DocumentReader> interface.
74
75							In derived classes you need to define the C<next_document> method,
76							and you can use C<next_document_text> and C<new_document> methods.
77
78							=head1 ATTRIBUTES
79
80							=over
81
82							=item language (required)
83
84							=item lines_per_doc
85
86							Set this to the number of lines per document if you want to split one file into several documents.
87							The default is 0, which means don't split.
88
89							=item merge_files
90
91							Merge the content of all files (specified in the C<from> attribute) into one stream.
92							Useful in combination with C<lines_per_doc> to get equally-sized documents
93							even from non-equally-sized files.
94
95							=item encoding
96
97							The encoding of the input files, e.g. C<utf8> (the default) or C<cp1250>.
98
99							=back
100
101							=head1 METHODS
102
103							=over
104
105							=item next_document_text
106
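107							Returns the content of the next file (specified in the C<from> attribute) as a text string. When the input is not read one document per file (e.g. C<lines_per_doc> is set), returns up to C<lines_per_doc> lines instead.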
108
109							=item next_filehandle
110
111							Helper method - you can use this instead of C<next_document_text>
112							if you don't want to load the whole text into memory
113							(but do e.g. SAX-like parsing).
114
115							=back
116
117							=head1 SEE
118
119							L<Treex::Block::Read::BaseReader>
120							L<Treex::Block::Read::Text>
121
122							=head1 AUTHOR
123
124							Martin Popel
125
126							=head1 COPYRIGHT AND LICENSE
127
128							Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
129
130							This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.