File Coverage

blib/lib/File/Sip.pm

Criterion	Covered	Total	%
statement	47	48	97.9
branch	9	10	90.0
condition	5	8	62.5
subroutine	8	8	100.0
pod	1	1	100.0
total	70	75	93.3

line	stmt	bran	cond	sub	pod	time	code
1							package File::Sip;
2
3							#ABSTRACT: file parser intended for big files that don't fit into main memory.
4
5
6	1			1		24437	use Moo;
	1					16714
	1					6
7	1			1		1647	use Carp 'croak';
	1					1
	1					64
8	1			1		780	use IO::File;
	1					9923
	1					117
9	1			1		6377	use Encode qw(decode);
	1					16599
	1					133
10	1			1		15	use feature ':5.10';
	1					3
	1					1649
11
12
13							has path => (
14							is => 'ro',
15							required => 1,
16							);
17
18
19							has line_separator => (
20							is => 'ro',
21							default => sub {qw/(\015\012\|\015\|\012)/},
22							);
23
24
25							has is_utf8 => (
26							is => 'ro',
27							default => sub {1},
28							);
29
30
31							# internal cursor for iterations
32							has _read_line_position => (
33							is => 'rw',
34							default => sub {0},
35							);
36
37							sub read_line {
38	15			15	1	10109	my ( $self, $line_number ) = @_;
39
40	15		100			75	$line_number //= $self->_read_line_position;
41	15					425	my $fh = $self->_fh;
42	15					547	my $line_index = $self->index->[$line_number];
43	15	100				152	return if !defined $line_index;
44
45	14	100				291	my $previous_line_index =
46							( $line_number == 0 ) ? 0 : $self->index->[ $line_number - 1 ];
47
48	14					75	my $line;
49	14					127	seek( $fh, $previous_line_index, 0 );
50	14					109	read( $fh, $line, $line_index - $previous_line_index );
51
52	14	100				44	$self->_read_line_position( $line_number + 1 ) if @_ == 1;
53
54	14	50	33			122	return decode( "utf8", $line ) if defined $line && $self->is_utf8;
55	0					0	return $line;
56							}
57
58							# file handle return by IO::File
59							has _fh => (
60							is => 'ro',
61							lazy => 1,
62							default => sub {
63							my ($self) = @_;
64							my $open_file_param = "<:crlf";
65							IO::File->new( $self->path, $open_file_param )
66							or croak "Failed to open file '" . $self->path . "' : '$!'";
67							}
68							);
69
70							# File stat array
71							has _stat => (
72							is => 'lazy',
73							);
74
75							sub _build__stat {
76	1			1		403	my ($self) = @_;
77	1					25	my @stat = stat( $self->_fh );
78	1					30	return \@stat;
79							}
80
81
82							has index => (
83							is => 'rw',
84							lazy => 1,
85							builder => 1,
86							);
87
88							sub _build_index {
89	1			1		432	my ($self) = @_;
90	1					2	my $index = [];
91
92	1					3	my ($blocksize) = @{ $self->_stat }[11];
	1					4
93	1		50			4	$blocksize \|\|= 8192;
94
95	1					2	my $buffer = '';
96	1					3	my $offset = 0;
97	1					1	my $line_number = 0;
98
99							# make sure we jump to the begining of the file
100	1					25	seek( $self->_fh, 0, SEEK_SET );
101
102							# build the index, char by char, splitting on the line separator
103	1					18	my $line_sep = $self->line_separator;
104	1					25	while ( my $count = read( $self->_fh, $buffer, $blocksize ) ) {
105	1					51	for my $i ( 0 .. $count ) {
106	80					96	my $char = substr $buffer, $i, 1;
107	80	100				284	if ( $char =~ /$line_sep/ ) {
108	7					23	$index->[ $line_number++ ] = $offset + $i + 1;
109							}
110							}
111	1					29	$offset += $count;
112							}
113
114							# reset the cursor at the begining of the file and return the index
115	1					35	seek( $self->_fh, 0, SEEK_SET );
116	1					15	return $index;
117							}
118
119							1;
120
121
122							=pod
123
124							=head1 NAME
125
126							File::Sip - file parser intended for big files that don't fit into main memory.
127
128							=head1 VERSION
129
130							version 0.003
131
132							=head1 DESCRIPTION
133
134							In most cases, you don't want to use this, but L<File::Slurp> instead.
135
136							This class is able to read a line from a file without loading the whole file in
137							memory. When you want to deal with files of millions of lines, on a limited
138							environment, brute force isn't an option.
139
140							An index of all the lines in the file is built in order to be able to access
141							their starting position depending on their line number.
142
143							The memory used is then limited to the size of the index plus the size of the
144							line that is read (until the line separator character is reached).
145
146							It also provides a way to nicely iterate over all the lines of the file, using
147							only the amount of memory needed to store one line at a time, not the whole file.
148
149							=head1 ATTRIBUTES
150
151							=head2 path
152
153							Required, file path as a string.
154
155							=head2 line_separator
156
157							Optional, regular expression of the newline separator, default is
158							C<(\015\012|\015|\012)>.
159
160							=head2 is_utf8
161
162							Optional, flag to tell if the file is utf8-encoded, default is true.
163
164							If true, the line returned by C<read_line> will be decoded.
165
166							=head2 index
167
168							Index that contains positions of all lines of the file, usage:
169
170							$sip->index->[ $line_number ] = $seek_position;
171
172							=head1 METHODS
173
174							=head2 read_line
175
176							Return the line content at the given position (terminated by C<line_separator>).
177
178							my $line = $sip->read_line( $line_number );
179
180							It's also possible to read the entire file, line by line without providing a
181							line number to the method, until C<undef> is returned:
182
183							while (my $line = $sip->read_line()) {
184							# do something with $line
185							}
186
187							=head1 ACKNOWLEDGMENT
188
189							This module was written at Weborama when dealing with huge raw files, where huge
190							means "oh no, it really won't fit anymore in this compute slot!" (which are
191							limited in main-memory).
192
193							=head1 BENCHMARK
194
195							C<File::Sip> is not faster than in-memory parsers like L<File::Slurp> but
196							it has a lower memory footprint. With small files, it's not obvious (when the file
197							is small, the cost of the index is almost equal to the cost of all the
198							characters of the file).
199							But when the file gets bigger, the gain in main memory grows.
200
201							With files bigger than a few megabytes, C<File::Sip> will consume up to 20 times less
202							memory than L<File::Slurp>. This factor of 20 appears to be an asymptotic limit
203							as size of studied files grows.
204
205							If you want to estimate the memory size of a running process that uses C<File::Sip>, you
206							can then assume that the size of the index will be around 1/20th of the size of
207							the processed file.
208
209							=head1 AUTHORS
210
211							This module has been written at Weborama by Alexis Sukrieh and Bin Shu.
212
213							=head1 AUTHOR
214
215							Alexis Sukrieh
216
217							=head1 COPYRIGHT AND LICENSE
218
219							This software is copyright (c) 2014 by Weborama.
220
221							This is free software; you can redistribute it and/or modify it under
222							the same terms as the Perl 5 programming language system itself.
223
224							=cut
225
226
227							__END__