line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
# Copyright 2003, James Mastros, |
3
|
|
|
|
|
|
|
# Released under the same terms as perl itself. |
4
|
|
|
|
|
|
|
# See bottom for full copyright info. |
5
|
|
|
|
|
|
|
package Sort::Merge; |
6
|
|
|
|
|
|
|
|
7
|
3
|
|
|
3
|
|
118953
|
use warnings; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
137
|
|
8
|
3
|
|
|
3
|
|
16
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
98
|
|
9
|
3
|
|
|
3
|
|
14443
|
use Data::Dumper; |
|
3
|
|
|
|
|
48580
|
|
|
3
|
|
|
|
|
1517
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = 0.01; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# "Big" data structure is sources. |
14
|
|
|
|
|
|
|
# This is an array(ref) of arrayrefs. |
15
|
|
|
|
|
|
|
# Each inner array has three elements. |
16
|
|
|
|
|
|
|
# The first is a coderef, which should return the next two elements when called. |
17
|
|
|
|
|
|
|
# The second is the sort key for the next data point. |
18
|
|
|
|
|
|
|
# The third is the data point itself. |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Given a sources array, makes sure that each has it's next datapoint loaded, |
21
|
|
|
|
|
|
|
# and returns the new sources list. |
22
|
|
|
|
|
|
|
sub prime { |
23
|
51
|
|
|
51
|
0
|
50
|
my @sources=@{shift @_}; |
|
51
|
|
|
|
|
86
|
|
24
|
51
|
|
|
|
|
52
|
my $indexes=shift; |
25
|
51
|
|
100
|
|
|
106
|
$indexes ||= [0..$#sources]; |
26
|
51
|
|
|
|
|
62
|
my @indexes=@$indexes; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# print "In prime: ", Dumper(\@sources); |
29
|
51
|
|
|
|
|
71
|
foreach my $source (@sources[@$indexes]) { |
30
|
55
|
50
|
|
|
|
166
|
if (not defined $source->[1]) { |
31
|
|
|
|
|
|
|
# print "Priming.\n"; |
32
|
55
|
|
|
|
|
106
|
my ($key, $data) = $source->[0]->(); |
33
|
55
|
100
|
|
|
|
261
|
if (!defined $key) { |
34
|
|
|
|
|
|
|
# print "Source dead.\n"; |
35
|
|
|
|
|
|
|
# We can't just delete it here, so mark it undef, and filter them out later. |
36
|
8
|
|
|
|
|
12
|
$source=undef; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
# print "Key: $key, Data: $data\n"; |
39
|
55
|
|
|
|
|
62
|
$source->[1] = $key; |
40
|
55
|
|
|
|
|
170
|
$source->[2] = $data; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# print "Before purge: ", Dumper(\@sources); |
45
|
51
|
|
|
|
|
68
|
@sources = grep {defined $_->[0]} @sources; |
|
131
|
|
|
|
|
226
|
|
46
|
|
|
|
|
|
|
# print "After purge: ", Dumper(\@sources); |
47
|
|
|
|
|
|
|
|
48
|
51
|
|
|
|
|
118
|
return @sources; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Given a sources arrayref, and a coderef to call with each data point, sort. |
52
|
|
|
|
|
|
|
sub sort_inner { |
53
|
5
|
|
|
5
|
0
|
8
|
my $sources=shift; |
54
|
5
|
|
|
|
|
10
|
my $output=shift; |
55
|
|
|
|
|
|
|
|
56
|
5
|
|
|
|
|
11
|
my @sources=@$sources; |
57
|
5
|
|
|
|
|
7
|
my $lastsource; |
58
|
5
|
|
|
|
|
21
|
while (@sources) { |
59
|
51
|
|
|
|
|
88
|
@sources=prime(\@sources, $lastsource); |
60
|
51
|
100
|
|
|
|
106
|
last if !@sources; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# print Dumper \@sources; |
63
|
|
|
|
|
|
|
|
64
|
47
|
|
|
|
|
52
|
my($firstsource,$earliesttime)=(0,~0); |
65
|
|
|
|
|
|
|
|
66
|
47
|
|
|
|
|
62
|
foreach (0..$#sources) { |
67
|
123
|
100
|
|
|
|
218
|
if ($earliesttime > $sources[$_][1]) { |
68
|
69
|
|
|
|
|
60
|
$earliesttime=$sources[$_][1]; |
69
|
69
|
|
|
|
|
77
|
$firstsource=$_; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
} |
72
|
47
|
|
|
|
|
94
|
$output->($sources[$firstsource]); |
73
|
47
|
|
|
|
|
148
|
$sources[$firstsource][1] = undef; |
74
|
47
|
|
|
|
|
52
|
$sources[$firstsource][2] = undef; |
75
|
47
|
|
|
|
|
115
|
$lastsource=[$firstsource]; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Given an arrayref of input coderefs, and a single output coderef, sort. |
80
|
|
|
|
|
|
|
sub sort_coderefs { |
81
|
5
|
|
|
5
|
1
|
3028
|
my $inputs=shift; |
82
|
5
|
|
|
|
|
10
|
my $output=shift; |
83
|
5
|
|
|
|
|
7
|
my @sources; |
84
|
|
|
|
|
|
|
|
85
|
5
|
|
|
|
|
15
|
@sources = map {[$_]} @$inputs; |
|
8
|
|
|
|
|
26
|
|
86
|
5
|
|
|
|
|
20
|
return sort_inner(\@sources, $output); |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head1 NAME |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Sort::Merge - general merge sort |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head1 SYNOPSIS |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
use Sort::Merge; |
96
|
|
|
|
|
|
|
use IO::File; |
97
|
|
|
|
|
|
|
my @output; |
98
|
|
|
|
|
|
|
Sort::Merge::sort_coderefs([sub{our $i=0; return ($i++ x 2)}, |
99
|
|
|
|
|
|
|
sub{our $j=0; return ($j++*5 x 2)}], |
100
|
|
|
|
|
|
|
sub{print shift->[1}); |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head1 DESCRIPTION |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Merge sorting is a technique for merging together multiple input |
105
|
|
|
|
|
|
|
streams, each of which is sorted, into one larger output stream, still |
106
|
|
|
|
|
|
|
sorted. This module implements that algorithm. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Note the large caveat above: the inputs must already be sorted. If |
109
|
|
|
|
|
|
|
the inputs aren't sorted, you'll need to sort them yourself first, in |
110
|
|
|
|
|
|
|
which case a merge sort may not be the right algorithm for the job. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Sort::Merge was designed to give you, the programmer, control over |
113
|
|
|
|
|
|
|
where your data comes from, and where it goes -- it's simply supposed |
114
|
|
|
|
|
|
|
to sort it, and not fetch it from files, parse out sort keys, etc, |
115
|
|
|
|
|
|
|
etc. There's at least one merge-sorting module already on CPAN, |
116
|
|
|
|
|
|
|
File::MergeSort, but I wanted one that I could use to sort together |
117
|
|
|
|
|
|
|
multiple files that were of different formats, and not line-oriented |
118
|
|
|
|
|
|
|
formats either. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
The module I got out doesn't even require that the inputs are files at |
121
|
|
|
|
|
|
|
all, or that the output gets printed. It also streams the output, so |
122
|
|
|
|
|
|
|
you needn't wait until the sort is complete before printing the first |
123
|
|
|
|
|
|
|
piece of output. (The fact that this is possible at all is one of |
124
|
|
|
|
|
|
|
more useful properties of merge sorts vs. other sorts.) |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
Most (OK, at the moment all) of the available interfaces require you to |
127
|
|
|
|
|
|
|
write your input and output handlers in terms of coderefs. If this is |
128
|
|
|
|
|
|
|
too much work for you, I encourage you to not use this module, or, |
129
|
|
|
|
|
|
|
alternatively, to propose another interface to me. I'm actively |
130
|
|
|
|
|
|
|
looking for more and better interfaces. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 Sources |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
There are two types of coderefs Sort::Merge makes you write, sources |
135
|
|
|
|
|
|
|
and the output. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Sources are sources of input data. They are called with no |
138
|
|
|
|
|
|
|
parameters. They are expected to return a two-element list if they |
139
|
|
|
|
|
|
|
have more data. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The first element should be a sort key, which will be compared |
142
|
|
|
|
|
|
|
lexographicly (using cmp). If you wish to think in terms of numerical |
143
|
|
|
|
|
|
|
sort keys, simply run it through chr, pack, or sprintf to get a |
144
|
|
|
|
|
|
|
representation that will sort properly. (Passing it to chr requires |
145
|
|
|
|
|
|
|
that the number is a nonnegative integer, and may warn for some |
146
|
|
|
|
|
|
|
values, but is quite fast.) |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
The second element may be any arbitrary scalar. This is your |
149
|
|
|
|
|
|
|
datapoint. It is passed to the output subroutine without being |
150
|
|
|
|
|
|
|
examined. It can be a line of text, an arrayref or hashref, or even |
151
|
|
|
|
|
|
|
an object, or undef for all Sort::Merge cares. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
If the source has run out of data, it should return nothing (an empty |
154
|
|
|
|
|
|
|
list, not C). If it wishes to signal an error and abort |
155
|
|
|
|
|
|
|
processing, it can simply C; the error will be propagated. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 Output |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Because I'm lazy, the output callback gets passed the source |
160
|
|
|
|
|
|
|
structure. That is, it gets an arrayref of three values. The first |
161
|
|
|
|
|
|
|
is the coderef, the second is the key, the third is the data. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
To put it another way, it gets an arrayref of the coderef, followed by |
164
|
|
|
|
|
|
|
what the coderef has returned. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 C |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
C takes an array-ref of source coderefs, and a single output coderef. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
The output coderef is called for each element, in sorted order, and |
171
|
|
|
|
|
|
|
the function returns empty-list. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=head1 SEE ALSO |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
L, the source code. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 COPYRIGHT AND DISCLAIMERS |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Copyright (c) 2003 James M. Mastros. All rights reserved. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it |
182
|
|
|
|
|
|
|
under the same terms as Perl itself. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but |
185
|
|
|
|
|
|
|
without any warranty; without even the implied warranty of |
186
|
|
|
|
|
|
|
merchantability or fitness for a particular purpose. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=head1 AUTHOR |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
James Mastros Ejames@mastros.bizE, L |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=cut |