File Coverage

blib/lib/File/MergeSort.pm
Criterion Covered Total %
statement 103 106 97.1
branch 29 38 76.3
condition 16 24 66.6
subroutine 13 13 100.0
pod 3 3 100.0
total 164 184 89.1


line stmt bran cond sub pod time code
1             package File::MergeSort;
2              
3 3     3   420462 use 5.006; # 5.6.0
  3         14  
  3         169  
4 3     3   20 use strict;
  3         6  
  3         112  
5 3     3   18 use warnings;
  3         18  
  3         91  
6              
7 3     3   17 use Carp;
  3         4  
  3         242  
8 3     3   18 use IO::File;
  3         12  
  3         3332  
9              
10             our $VERSION = '1.23';
11              
12             my $have_io_zlib;
13              
14             BEGIN {
15 3     3   7 eval { require IO::Zlib; };
  3         1012  
16 3 50       113172 unless ( $@ ) {
17 3         20 require IO::Zlib;
18 3         5309 $have_io_zlib++;
19             }
20             }
21              
22             ### PRIVATE METHODS
23              
24             sub _open_file {
25 15     15   25 my $self = shift;
26 15   33     39 my $file = shift || croak 'No filename specified';
27              
28 15         20 my $fh;
29              
30 15 100       69 if ( $file =~ /[.](z|gz)$/ ) { # Files matching .z or .gz
31 5 50       12 if ( $have_io_zlib ) {
32 5 50       33 $fh = IO::Zlib->new( $file, 'rb' ) or croak "Failed to open file $file: $!";
33             } else {
34 0         0 croak 'IO::Zlib not available, cannot handle compressed files. Stopping';
35             }
36             } else {
37 10 100       77 $fh = IO::File->new( $file, '<' ) or croak "Failed to open file $file: $!";
38             }
39              
40 14         16546 return $fh;
41             }
42              
43             # Yes, I'm really closing filehandles, just trying to be consistent
44             # with the _open_file counterpart.
45             sub _close_file {
46 14     14   23 my $self = shift;
47 14         15 my $fh = shift;
48              
49 14 50       113 $fh->close() or croak "Problems closing filehandle: $!";
50              
51 14         1001 return 1;
52             }
53              
54             sub _get_line {
55 82     82   142 my $self = shift;
56 82   33     168 my $fh = shift || croak 'No filehandle supplied';
57              
58 82         79 my $line;
59              
60 82 100       171 if ( $self->{'skip_empty'} ) {
61 19   100     19 do {
62 23         198 $line = <$fh>;
63             } until ( ! defined $line || $line !~ /^$/ );
64             } else {
65 63         318 $line = <$fh>;
66             }
67 82         4680 return $line;
68             }
69              
70             # Given a line of input and a code reference that extracts a value
71             # from the line, return an index value that can be used to compare the
72             # lines.
73             sub _get_index {
74 68     68   81 my $self = shift;
75 68   33     142 my $line = shift || croak 'No line supplied';
76              
77 68         98 my $code_ref = $self->{'index'};
78 68         148 my $index = $code_ref->( $line );
79              
80 68 50       587 if ( $index ) {
81 68         214 return $index;
82             } else {
83 0         0 croak 'Unable to return an index. Stopping';
84             }
85             }
86              
87             ### PUBLIC METHODS
88              
89             sub new {
90 12     12 1 5458 my $class = shift;
91 12         22 my $files_ref = shift; # ref to array of files.
92 12         133 my $index_ref = shift; # ref to sub that will extract index value from line
93 12         19 my $opts_ref = shift; # ref to hash of options, optional.
94              
95 12 100 100     55 unless ( ref $files_ref eq 'ARRAY' && @{ $files_ref } ) {
  9         52  
96 4         515 croak 'Array reference of input files required';
97             }
98              
99 8 100       30 unless ( ref $index_ref eq 'CODE' ) {
100 1         176 croak 'Code reference required for merge key extraction';
101             }
102              
103 7 50 66     53 if ( $opts_ref && ref $opts_ref ne 'HASH' ) {
104 0         0 croak 'Options should be supplied as a hash reference';
105             }
106              
107 7 100       51 my $self = { index => $index_ref,
108             stack => [],
109             skip_empty => $opts_ref->{'skip_empty_lines'} ? 1 : 0,
110             };
111              
112 7         21 bless $self, $class;
113              
114 7         13 my @files;
115 7         10 my $i = 0;
116 7         13 foreach my $file ( @{ $files_ref } ) {
  7         21  
117 15         47 my $fh = $self->_open_file( $file );
118 14         43 my $l = $self->_get_line( $fh );
119 14         37 my $idx = $self->_get_index( $l );
120              
121 14         71 my $f = { 'fh' => $fh,
122             'line' => $l,
123             'index' => $idx,
124             'pref' => $i++, # preference: take the records from the files in the order specified.
125             };
126              
127 14         42 push @files, $f;
128             }
129              
130             # Now that the first records are complete for each file, sort them
131             # by merge key then file order. Create a sorted array of hashrefs
132             # based on the index values of each file.
133 6 50       41 $self->{'stack'} = [ sort { $a->{'index'} cmp $b->{'index'}
  9         60  
134             || $a->{'pref'} <=> $b->{'pref'}
135             } @files ];
136              
137 6         27 return $self;
138             }
139              
140             sub next_line {
141 74     74 1 277 my $self = shift;
142              
143 74         92 my $pick = shift @{ $self->{'stack'} };
  74         295  
144 74 100       293 return unless $pick;
145              
146 68         99 my $line = $pick->{'line'};
147              
148             # Re-populate invalidated data in the shifted structure, before
149             # reinserting into stack.
150 68         307 my $nextline = $self->_get_line( $pick->{'fh'} );
151              
152 68 100       130 if ( $nextline ) {
153 54         83 $pick->{'line'} = $nextline;
154 54         108 $pick->{'index'} = $self->_get_index( $nextline );
155             } else {
156             # File exhausted, close and return last line, no need to
157             # proceed onto juggling stack.
158 14         39 $self->_close_file( $pick->{'fh'} );
159 14         85 return $line;
160             }
161              
162             # Re-organise the 'stack' structure to insert the newly fetched
163             # data into the correct position for the next call. Since it
164             # begins as a sorted array, and we only need to insert one element
165             # in the appropriate position in the array, we can abandon the
166             # loop as soon as we hit the right spot.
167             # There may be room for optimisation here. Algorithms and/or tips
168             # welcome.
169              
170             # Scan the array for the point where:
171              
172             # * The index of the element to insert is the less than than the
173             # element in the array
174              
175             # ...or...
176              
177             # * The index of the element to insert is the same as that in the
178             # array, but the preference of the element to insert is lower -
179             # this is so that data is consistently fed in from the source
180             # files in the order specified in the constuctor.
181              
182             # Previous behaviour can be had with last if $_->{'index'} ge $pick->{'index'};
183 54         66 my $i = 0;
184              
185 54         211 foreach ( @{ $self->{'stack'} } ) {
  54         264  
186 56 100 100     327 if ( $_->{'index'} gt $pick->{'index'}
      66        
187             || ( $_->{'index'} eq $pick->{'index'} && $pick->{'pref'} <= $_->{'pref'} )
188             ) {
189 27         36 last;
190             }
191 29         223 $i++;
192             }
193              
194             # And stuff the fresh data in the appropriate place.
195 54         65 splice @{ $self->{'stack'} }, $i, 0, $pick;
  54         119  
196              
197 54         171 return $line;
198             }
199              
200             # Dump the contents of the file to either STDOUT (default).
201             sub dump {
202 4     4 1 13526 my $self = shift;
203 4         11 my $file = shift; # optional
204              
205 4         7 my $lines = 0;
206              
207 4 100       17 if ( $file ) {
208 2 50       267 open my $fh, '>', $file or croak "Unable to create output file $file: $!";
209              
210 2         10 while ( my $line = $self->next_line() ) {
211 18         56 print $fh $line;
212 18         48 $lines++;
213             }
214              
215 2 50       224 close $fh or croak "Problems closing output file $file: $!";
216             } else {
217 2         12 while ( my $line = $self->next_line() ) {
218 23         5198 print $line;
219 23         86 $lines++;
220             }
221             }
222              
223 4         14 return $lines;
224             }
225              
226             1;
227              
228             __END__