line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Data::Prepare; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
92671
|
use strict; |
|
1
|
|
|
|
|
11
|
|
|
1
|
|
|
|
|
26
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
5
|
1
|
|
|
1
|
|
4
|
use Exporter 'import'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2029
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.005'; |
8
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
9
|
|
|
|
|
|
|
cols_non_empty |
10
|
|
|
|
|
|
|
non_unique_cols |
11
|
|
|
|
|
|
|
key_to_index |
12
|
|
|
|
|
|
|
make_pk_map |
13
|
|
|
|
|
|
|
pk_col_counts |
14
|
|
|
|
|
|
|
pk_match |
15
|
|
|
|
|
|
|
chop_lines |
16
|
|
|
|
|
|
|
chop_cols |
17
|
|
|
|
|
|
|
header_merge |
18
|
|
|
|
|
|
|
pk_insert |
19
|
|
|
|
|
|
|
); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub chop_lines { |
22
|
4
|
|
|
4
|
1
|
21312
|
my ($choplines, $data) = @_; |
23
|
4
|
|
|
|
|
38
|
splice @$data, $_, 1 for @$choplines; |
24
|
|
|
|
|
|
|
} |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub chop_cols { |
27
|
1
|
|
|
1
|
1
|
3
|
my ($chopcols, $data) = @_; |
28
|
1
|
|
|
|
|
6
|
for my $c (sort {$b <=> $a} @$chopcols) { |
|
28
|
|
|
|
|
30
|
|
29
|
15
|
|
|
|
|
729
|
splice @$_, $c, 1 for @$data; |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my %where2offset = (up => -1, self => 0, down => 1); |
34
|
|
|
|
|
|
|
sub header_merge { |
35
|
3
|
|
|
3
|
1
|
8
|
my ($merge_spec, $data) = @_; |
36
|
3
|
|
|
|
|
9
|
for my $spec (@$merge_spec) { |
37
|
10
|
|
|
|
|
30
|
my ($do, $l, $matchfrom, $matchto) = @$spec{qw(do line matchfrom matchto)}; |
38
|
10
|
|
|
|
|
39
|
my ($from_row, $to_row) = map $data->[$l + $where2offset{$spec->{$_}}], qw(from to); |
39
|
10
|
|
100
|
|
|
35
|
my ($kept, $fromspec, $justone, $which_index) = ('', ($spec->{fromspec} || '')); |
40
|
10
|
100
|
100
|
|
|
46
|
if (($spec->{tospec} || '') =~ /^index:(\d+)/) { |
41
|
4
|
|
|
|
|
6
|
$justone = 1; |
42
|
4
|
|
|
|
|
10
|
$which_index = $1; |
43
|
|
|
|
|
|
|
} |
44
|
10
|
|
|
|
|
21
|
for my $i (0..$#$to_row) { |
45
|
76
|
|
100
|
|
|
132
|
$kept = $from_row->[$i] || $kept; |
46
|
76
|
100
|
100
|
|
|
304
|
next if defined $matchto and $to_row->[$i] !~ /$matchto/; |
47
|
50
|
100
|
100
|
|
|
115
|
next if defined $matchfrom and $from_row->[$i] !~ /$matchfrom/; |
48
|
48
|
100
|
|
|
|
104
|
my $basic_from = |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
49
|
|
|
|
|
|
|
$fromspec eq 'lastnonblank' ? $kept : |
50
|
|
|
|
|
|
|
$fromspec eq 'left' ? $from_row->[$i - 1] : |
51
|
|
|
|
|
|
|
$fromspec =~ /^literal:(.*)/ ? $1 : |
52
|
|
|
|
|
|
|
$from_row->[$justone ? $which_index : $i]; |
53
|
48
|
100
|
|
|
|
68
|
my $basic_to = $to_row->[$justone ? $which_index : $i]; |
54
|
48
|
0
|
|
|
|
94
|
my $what = |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
55
|
|
|
|
|
|
|
$do->[0] eq 'overwrite' ? $basic_from : |
56
|
|
|
|
|
|
|
$do->[0] eq 'prepend' ? $basic_from . $do->[1] . $basic_to : |
57
|
|
|
|
|
|
|
$do->[0] eq 'append' ? $basic_to . $do->[1] . $basic_from : |
58
|
|
|
|
|
|
|
die "Unknown action '$do->[0]'"; |
59
|
48
|
100
|
|
|
|
57
|
if ($justone) { |
60
|
4
|
|
|
|
|
7
|
$to_row->[$which_index] = $what; |
61
|
4
|
|
|
|
|
11
|
last; |
62
|
|
|
|
|
|
|
} else { |
63
|
44
|
|
|
|
|
62
|
$to_row->[$i] = $what; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub pk_insert { |
70
|
1
|
|
|
1
|
1
|
5
|
my ($spec, $data, $pk_map, $stopwords) = @_; |
71
|
1
|
|
|
|
|
4
|
my ($ch, $lc, $pkc, $fb) = (@$spec{qw(column_heading local_column pk_column use_fallback)}); |
72
|
1
|
|
|
|
|
4
|
my $key_index = key_to_index($data->[0])->{$lc}; |
73
|
1
|
50
|
|
|
|
6
|
die "undef index for key '$lc'" if !defined $key_index; |
74
|
1
|
|
|
|
|
3
|
unshift @{ $data->[0] }, $ch; |
|
1
|
|
|
|
|
3
|
|
75
|
1
|
|
|
|
|
2
|
my $exact_map = $pk_map->{$pkc}; |
76
|
1
|
|
|
|
|
9
|
for my $row (@$data[ 1..$#$data ]) { |
77
|
196
|
|
|
|
|
287
|
my $key_val = $row->[ $key_index ]; |
78
|
196
|
|
|
|
|
274
|
my $pkv = $exact_map->{ $key_val }; |
79
|
196
|
100
|
66
|
|
|
412
|
unshift(@$row, $pkv), next if defined $pkv or !$fb; |
80
|
20
|
|
|
|
|
40
|
($pkv) = pk_match($key_val, $pk_map, $stopwords); |
81
|
20
|
|
|
|
|
49
|
unshift(@$row, $pkv); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub cols_non_empty { |
86
|
1
|
|
|
1
|
1
|
13768
|
my ($data) = @_; |
87
|
1
|
|
|
|
|
3
|
my @col_non_empty; |
88
|
1
|
|
|
|
|
2
|
for my $line (@$data) { |
89
|
199
|
|
100
|
|
|
3612
|
$col_non_empty[$_] ||= 0 for 0..$#$line; |
90
|
199
|
|
|
|
|
1388
|
$col_non_empty[$_]++ for grep length $line->[$_], 0..$#$line; |
91
|
|
|
|
|
|
|
} |
92
|
1
|
|
|
|
|
8
|
@col_non_empty; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub non_unique_cols { |
96
|
1
|
|
|
1
|
1
|
3
|
my ($data) = @_; |
97
|
1
|
|
|
|
|
4
|
my ($line, %col2count) = $data->[0]; |
98
|
1
|
|
|
|
|
5
|
$col2count{$_}++ for @$line; |
99
|
1
|
|
|
|
|
7
|
delete @col2count{ grep $col2count{$_} == 1, keys %col2count }; |
100
|
1
|
|
|
|
|
3
|
\%col2count; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub key_to_index { |
104
|
4
|
|
|
4
|
1
|
10
|
my ($row) = @_; |
105
|
4
|
|
|
|
|
111
|
+{ map +($row->[$_] => $_), 0..$#$row }; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
sub make_pk_map { |
109
|
1
|
|
|
1
|
1
|
12971
|
my ($data, $pk_colkey, $other_colkeys) = @_; |
110
|
1
|
|
|
|
|
4
|
my $k2i = key_to_index($data->[0]); |
111
|
1
|
|
|
|
|
6
|
my @invalid = grep !defined $k2i->{$_}, $pk_colkey, @$other_colkeys; |
112
|
1
|
50
|
|
|
|
3
|
die "Invalid keys (@invalid)" if @invalid; |
113
|
1
|
|
|
|
|
2
|
my $pk_colnum = $k2i->{$pk_colkey}; |
114
|
1
|
|
|
|
|
1
|
my %altcol2value2pk; |
115
|
1
|
|
|
|
|
3
|
for my $i (1..$#$data) { |
116
|
250
|
|
|
|
|
255
|
my $row = $data->[$i]; |
117
|
250
|
100
|
|
|
|
334
|
next if !length(my $pk_val = $row->[$pk_colnum]); |
118
|
249
|
|
|
|
|
273
|
for my $alt_k ($pk_colkey, @$other_colkeys) { |
119
|
1494
|
100
|
|
|
|
2232
|
next if !length(my $alt_v = $row->[$k2i->{$alt_k}]); |
120
|
1387
|
|
|
|
|
2177
|
$altcol2value2pk{$alt_k}{$alt_v} = $pk_val; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
} |
123
|
1
|
|
|
|
|
10
|
\%altcol2value2pk; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub pk_col_counts { |
127
|
1
|
|
|
1
|
1
|
3867
|
my ($data, $pk_map) = @_; |
128
|
1
|
|
|
|
|
3
|
my $k2i = key_to_index($data->[0]); |
129
|
1
|
|
|
|
|
3
|
my (%col2code2exact, @no_exact_match); |
130
|
1
|
|
|
|
|
3
|
for my $i (1..$#$data) { |
131
|
196
|
|
|
|
|
261
|
my ($row, $exact_match) = $data->[$i]; |
132
|
196
|
|
|
|
|
440
|
for my $possible_col (keys %$k2i) { |
133
|
3724
|
|
|
|
|
4431
|
my $val = $row->[ $k2i->{$possible_col} ]; |
134
|
3724
|
|
|
|
|
8381
|
my @match_codes_yes = grep exists $pk_map->{$_}{$val}, keys %$pk_map; |
135
|
3724
|
|
|
|
|
4939
|
$col2code2exact{$possible_col}{$_}++ for @match_codes_yes; |
136
|
3724
|
|
100
|
|
|
6259
|
$exact_match ||= @match_codes_yes; |
137
|
|
|
|
|
|
|
} |
138
|
196
|
100
|
|
|
|
369
|
push @no_exact_match, $row if !$exact_match; |
139
|
|
|
|
|
|
|
} |
140
|
1
|
|
|
|
|
11
|
(\%col2code2exact, \@no_exact_match); |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub _match_register { |
144
|
1440
|
|
|
1440
|
|
2217
|
my ($matches, $code, $this_map, $pk_val2count, $pk_col2pk_value2count, $pk_val2from) = @_; |
145
|
|
|
|
|
|
|
$pk_val2count->{$_}++, |
146
|
|
|
|
|
|
|
$pk_col2pk_value2count->{$code}{$_}++ |
147
|
1440
|
|
|
|
|
3495
|
for map $this_map->{$_}, @$matches; |
148
|
1440
|
|
|
|
|
2530
|
for (@$matches) { |
149
|
|
|
|
|
|
|
# track longest matched-value per PK, to tie-break on shortest one |
150
|
858
|
|
|
|
|
1014
|
my $this_pk_val = $this_map->{$_}; |
151
|
|
|
|
|
|
|
$pk_val2from->{$this_pk_val} = $_ if |
152
|
858
|
100
|
100
|
|
|
2841
|
length($pk_val2from->{$this_pk_val}||'') < length; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub pk_match { |
157
|
216
|
|
|
216
|
1
|
421
|
my ($value, $pk_map, $stopwords) = @_; |
158
|
216
|
|
|
|
|
587
|
my %stopword; @stopword{@$stopwords} = (); |
|
216
|
|
|
|
|
476
|
|
159
|
216
|
|
|
|
|
696
|
my @val_words = grep length, split /[^A-Za-z]/, $value; |
160
|
216
|
50
|
|
|
|
853
|
my $val_pat = join '.*', map +(/[A-Z]{2,}/ ? split //, $_ : $_), @val_words; |
161
|
216
|
|
|
|
|
780
|
@val_words = grep !exists $stopword{$_}, map lc, grep length > 2, @val_words; |
162
|
216
|
|
|
|
|
310
|
my (%pk_col2pk_value2count, %pk_val2count, %pk_val2from); |
163
|
216
|
|
|
|
|
470
|
for my $code (keys %$pk_map) { |
164
|
1296
|
|
|
|
|
1635
|
my $this_map = $pk_map->{$code}; |
165
|
1
|
|
|
1
|
|
633
|
my @matches = grep /$val_pat/i, keys %$this_map; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
12
|
|
|
1296
|
|
|
|
|
73023
|
|
166
|
1296
|
|
|
|
|
30636
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from); |
167
|
|
|
|
|
|
|
} |
168
|
216
|
100
|
|
|
|
959
|
if ((my @abbrev_parts = grep length, split /\s*[\(,]\s*/, $value) > 1) { |
169
|
12
|
|
|
|
|
145
|
s/(.*?)[^A-Za-z]+(.*?)/$1.*$2/g for @abbrev_parts; |
170
|
12
|
|
|
|
|
47
|
my $suff_pref_pat = join '.*', reverse @abbrev_parts; |
171
|
12
|
|
|
|
|
27
|
for my $code (keys %$pk_map) { |
172
|
72
|
|
|
|
|
114
|
my $this_map = $pk_map->{$code}; |
173
|
72
|
|
|
|
|
4578
|
my @matches = grep /$suff_pref_pat/i, keys %$this_map; |
174
|
72
|
|
|
|
|
632
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count); |
175
|
72
|
|
|
|
|
7455
|
@matches = grep /^$suff_pref_pat/i, keys %$this_map; |
176
|
72
|
|
|
|
|
596
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count); |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
216
|
50
|
|
|
|
421
|
if (!keys %pk_col2pk_value2count) { |
180
|
0
|
|
|
|
|
0
|
for my $code (keys %$pk_map) { |
181
|
0
|
|
|
|
|
0
|
my $this_map = $pk_map->{$code}; |
182
|
0
|
|
|
|
|
0
|
for my $word (@val_words) { |
183
|
0
|
|
|
|
|
0
|
my @matches = grep /\b\Q$word\E\b/i, keys %$this_map; |
184
|
0
|
|
|
|
|
0
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from); |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
my ($best) = sort { |
189
|
216
|
|
|
|
|
465
|
$pk_val2count{$b} <=> $pk_val2count{$a} |
190
|
|
|
|
|
|
|
|| |
191
|
30
|
50
|
66
|
|
|
112
|
length($pk_val2from{$a}) <=> length($pk_val2from{$b}) |
192
|
|
|
|
|
|
|
|| |
193
|
|
|
|
|
|
|
$a cmp $b |
194
|
|
|
|
|
|
|
} keys %pk_val2count; |
195
|
216
|
|
66
|
|
|
363
|
my @pk_cols_unique_best = sort grep keys %{ $pk_col2pk_value2count{$_} } == 1 && $pk_col2pk_value2count{$_}{$best}, keys %pk_col2pk_value2count; |
196
|
216
|
|
|
|
|
1474
|
($best, \@pk_cols_unique_best); |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
1; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=encoding utf8 |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 NAME |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Data::Prepare - prepare CSV (etc) data for automatic processing |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head1 SYNOPSIS |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
use Text::CSV qw(csv); |
210
|
|
|
|
|
|
|
use Data::Prepare qw( |
211
|
|
|
|
|
|
|
cols_non_empty non_unique_cols |
212
|
|
|
|
|
|
|
chop_lines chop_cols header_merge |
213
|
|
|
|
|
|
|
); |
214
|
|
|
|
|
|
|
my $data = csv(in => 'unclean.csv', encoding => "UTF-8"); |
215
|
|
|
|
|
|
|
chop_cols([0, 2], $data); |
216
|
|
|
|
|
|
|
header_merge($spec, $data); |
217
|
|
|
|
|
|
|
chop_lines(\@lines, $data); # mutates the data |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# or: |
220
|
|
|
|
|
|
|
my @non_empty_counts = cols_non_empty($data); |
221
|
|
|
|
|
|
|
print Dumper(non_unique_cols($data)); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head1 DESCRIPTION |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
A module with utility functions for turning spreadsheets published for |
226
|
|
|
|
|
|
|
human consumption into ones suitable for automatic processing. Intended |
227
|
|
|
|
|
|
|
to be used by the supplied L script. See that script's |
228
|
|
|
|
|
|
|
documentation for a suggested workflow. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
All the functions are exportable, none are exported by default. |
231
|
|
|
|
|
|
|
All the C<$data> inputs are an array-ref-of-array-refs. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=head1 FUNCTIONS |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head2 chop_cols |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
chop_cols([0, 2], $data); |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Uses C to delete each zero-based column index. The example above |
240
|
|
|
|
|
|
|
deletes the first and third columns. |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=head2 chop_lines |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
chop_lines([ 0, (-1) x $n ], $data); |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Uses C to delete each zero-based line index, in the order |
247
|
|
|
|
|
|
|
given. The example above deletes the first, and last C<$n>, lines. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 header_merge |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
header_merge([ |
252
|
|
|
|
|
|
|
{ line => 1, from => 'up', fromspec => 'lastnonblank', to => 'self', matchto => 'HH', do => [ 'overwrite' ] }, |
253
|
|
|
|
|
|
|
{ line => 1, from => 'self', matchfrom => '.', to => 'down', do => [ 'prepend', ' ' ] }, |
254
|
|
|
|
|
|
|
{ line => 2, from => 'self', fromspec => 'left', to => 'self', matchto => 'Year', do => [ 'prepend', '/' ] }, |
255
|
|
|
|
|
|
|
{ line => 2, from => 'self', fromspec => 'literal:Country', to => 'self', tospec => 'index:0', do => [ 'overwrite' ] }, |
256
|
|
|
|
|
|
|
], $data); |
257
|
|
|
|
|
|
|
# Turns: |
258
|
|
|
|
|
|
|
# [ |
259
|
|
|
|
|
|
|
# [ '', 'Proportion of households with', '', '', '' ], |
260
|
|
|
|
|
|
|
# [ '', '(HH1)', 'Year', '(HH2)', 'Year' ], |
261
|
|
|
|
|
|
|
# [ '', 'Radio', 'of data', 'TV', 'of data' ], |
262
|
|
|
|
|
|
|
# ] |
263
|
|
|
|
|
|
|
# into (after a further chop_lines to remove the first two): |
264
|
|
|
|
|
|
|
# [ |
265
|
|
|
|
|
|
|
# [ |
266
|
|
|
|
|
|
|
# 'Country', |
267
|
|
|
|
|
|
|
# 'Proportion of households with Radio', 'Proportion of households with Radio/Year of data', |
268
|
|
|
|
|
|
|
# 'Proportion of households with TV', 'Proportion of households with TV/Year of data' |
269
|
|
|
|
|
|
|
# ] |
270
|
|
|
|
|
|
|
# ] |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
Applies the given transformations to the given data, so you can make the |
273
|
|
|
|
|
|
|
given data have the first row be your desired headers for the columns. |
274
|
|
|
|
|
|
|
As shown in the above example, this does not delete lines so further |
275
|
|
|
|
|
|
|
operations may be needed. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Broadly, each hash-ref specifies one operation, which acts on a single |
278
|
|
|
|
|
|
|
(specified) line-number. It scans along that line from left to right, |
279
|
|
|
|
|
|
|
unless C matches C in which case only one operation |
280
|
|
|
|
|
|
|
is done. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
The above merge operations in YAML format: |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
spec: |
285
|
|
|
|
|
|
|
- do: |
286
|
|
|
|
|
|
|
- overwrite |
287
|
|
|
|
|
|
|
from: up |
288
|
|
|
|
|
|
|
fromspec: lastnonblank |
289
|
|
|
|
|
|
|
line: 2 |
290
|
|
|
|
|
|
|
matchto: HH |
291
|
|
|
|
|
|
|
to: self |
292
|
|
|
|
|
|
|
- do: |
293
|
|
|
|
|
|
|
- prepend |
294
|
|
|
|
|
|
|
- ' ' |
295
|
|
|
|
|
|
|
from: self |
296
|
|
|
|
|
|
|
line: 2 |
297
|
|
|
|
|
|
|
matchfrom: . |
298
|
|
|
|
|
|
|
to: down |
299
|
|
|
|
|
|
|
- do: |
300
|
|
|
|
|
|
|
- prepend |
301
|
|
|
|
|
|
|
- / |
302
|
|
|
|
|
|
|
from: self |
303
|
|
|
|
|
|
|
fromspec: left |
304
|
|
|
|
|
|
|
line: 3 |
305
|
|
|
|
|
|
|
matchto: Year |
306
|
|
|
|
|
|
|
to: self |
307
|
|
|
|
|
|
|
- do: |
308
|
|
|
|
|
|
|
- overwrite |
309
|
|
|
|
|
|
|
from: self |
310
|
|
|
|
|
|
|
fromspec: literal:Country |
311
|
|
|
|
|
|
|
line: 3 |
312
|
|
|
|
|
|
|
to: self |
313
|
|
|
|
|
|
|
tospec: index:0 |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
This turns the first three lines of data excerpted from the supplied example |
316
|
|
|
|
|
|
|
data (shown in CSV with spaces inserted for alignment reasons only): |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
,Proportion of households with, , , |
319
|
|
|
|
|
|
|
,(HH1) ,Year ,(HH2),Year |
320
|
|
|
|
|
|
|
,Radio ,of data,TV ,of data |
321
|
|
|
|
|
|
|
Belize,58.7 ,2019 ,78.7 ,2019 |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
into the following. Note that the first two lines will still be present |
324
|
|
|
|
|
|
|
(not shown), possibly modified, so you will need your chop_lines to |
325
|
|
|
|
|
|
|
remove them. The columns of the third line are shown, one per line, |
326
|
|
|
|
|
|
|
for readability: |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Country, |
329
|
|
|
|
|
|
|
Proportion of households with Radio, |
330
|
|
|
|
|
|
|
Proportion of households with Radio/Year of data, |
331
|
|
|
|
|
|
|
Proportion of households with TV, |
332
|
|
|
|
|
|
|
Proportion of households with TV/Year of data |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
This achieves a single row of column-headings, with each column-heading |
335
|
|
|
|
|
|
|
being unique, and sufficiently meaningful. |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head2 pk_insert |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
pk_insert({ |
340
|
|
|
|
|
|
|
column_heading => 'ISO3CODE', |
341
|
|
|
|
|
|
|
local_column => 'Country', |
342
|
|
|
|
|
|
|
pk_column => 'official_name_en', |
343
|
|
|
|
|
|
|
}, $data, $pk_map, $stopwords); |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
In YAML format, this is the same configuration: |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
pk_insert: |
348
|
|
|
|
|
|
|
- files: |
349
|
|
|
|
|
|
|
- examples/CoreHouseholdIndicators.csv |
350
|
|
|
|
|
|
|
spec: |
351
|
|
|
|
|
|
|
column_heading: ISO3CODE |
352
|
|
|
|
|
|
|
local_column: Country |
353
|
|
|
|
|
|
|
pk_column: official_name_en |
354
|
|
|
|
|
|
|
use_fallback: true |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
And the C<$pk_map> made with L, inserts the |
357
|
|
|
|
|
|
|
C in front of the current zero-th column, mapping the |
358
|
|
|
|
|
|
|
value of the C column as looked up from the specified column |
359
|
|
|
|
|
|
|
of the C file, and if C is true, also tries |
360
|
|
|
|
|
|
|
L if no exact match is found. In that case, C |
361
|
|
|
|
|
|
|
must be specified in the configuration |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=head2 cols_non_empty |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
my @col_non_empty = cols_non_empty($data); |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
In the given data, iterates through all rows and returns a list of |
368
|
|
|
|
|
|
|
quantities of non-blank entries in each column. This can be useful to spot |
369
|
|
|
|
|
|
|
columns with only a couple of entries, which are more usefully chopped. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head2 non_unique_cols |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
my $col2count = non_unique_cols($data); |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Takes the first row of the given data, and returns a hash-ref mapping |
376
|
|
|
|
|
|
|
any non-unique column-names to the number of times they appear. |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=head2 key_to_index |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Given an array-ref (probably the first row of a CSV file, i.e. column |
381
|
|
|
|
|
|
|
headings), returns a hash-ref mapping the cell values to their zero-based |
382
|
|
|
|
|
|
|
index. |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 make_pk_map |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
my $altcol2value2pk = make_pk_map($data, $pk_colkey, \@other_colkeys); |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Given C<$data>, the heading of the primary-key column, and an array-ref |
389
|
|
|
|
|
|
|
of headings of alternative key columns, returns a hash-ref mapping each |
390
|
|
|
|
|
|
|
of those alternative key columns (plus the C<$pk_colkey>) to a map from |
391
|
|
|
|
|
|
|
that column's value to the relevant row's primary-key value. |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
This is most conveniently represented in YAML format: |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
pk_spec: |
396
|
|
|
|
|
|
|
file: examples/country-codes.csv |
397
|
|
|
|
|
|
|
primary_key: ISO3166-1-Alpha-3 |
398
|
|
|
|
|
|
|
alt_keys: |
399
|
|
|
|
|
|
|
- ISO3166-1-Alpha-2 |
400
|
|
|
|
|
|
|
- UNTERM English Short |
401
|
|
|
|
|
|
|
- UNTERM English Formal |
402
|
|
|
|
|
|
|
- official_name_en |
403
|
|
|
|
|
|
|
- CLDR display name |
404
|
|
|
|
|
|
|
stopwords: |
405
|
|
|
|
|
|
|
- islands |
406
|
|
|
|
|
|
|
- china |
407
|
|
|
|
|
|
|
- northern |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=head2 pk_col_counts |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
my ($colname2potential_key2count, $no_exact_match) = pk_col_counts($data, $pk_map); |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Given C<$data> and a primary-key (etc) map created by the above, returns |
414
|
|
|
|
|
|
|
a tuple of a hash-ref mapping each column that gave any matches to a |
415
|
|
|
|
|
|
|
further hash-ref mapping each of the potential key columns given above |
416
|
|
|
|
|
|
|
to how many matches it gave, and an array-ref of rows that had no exact |
417
|
|
|
|
|
|
|
matches. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=head2 pk_match |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
my ($best, $pk_cols_unique_best) = pk_match($value, $pk_map, $stopwords); |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Given a value, C<$pk_map>, and an array-ref of case-insensitive stopwords, |
424
|
|
|
|
|
|
|
returns its best match for the right primary-key value, and an array-ref |
425
|
|
|
|
|
|
|
of which primary-key columns in the C<$pk_map> matched the given value |
426
|
|
|
|
|
|
|
exactly once. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
The latter is useful for analysis purposes to select which primary-key |
429
|
|
|
|
|
|
|
column to use for this data-set. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
The algorithm used for this best-match: |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=over |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=item * |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
Splits the value into words (or where a word is two or more capital |
438
|
|
|
|
|
|
|
letters, letters). The search allows any, or no, text, to occur between |
439
|
|
|
|
|
|
|
these entities. Each configured primary-key column's keys are searched |
440
|
|
|
|
|
|
|
for matches. |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item * |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
If there is a separating C<,> or C<(> (as commonly used for |
445
|
|
|
|
|
|
|
abbreviations), splits the value into chunks, reverses them, and then |
446
|
|
|
|
|
|
|
reassembles the chunks as above for a similar search. |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item * |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Only if there were no matches from the previous steps, splits the value |
451
|
|
|
|
|
|
|
into words. Words that are shorter than three characters, or that occur in |
452
|
|
|
|
|
|
|
the stopword list, are omitted. Then each word is searched for as above. |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=item * |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
"Votes" on which primary-key value got the most matches. Tie-breaks on |
457
|
|
|
|
|
|
|
which primary-key value matched on the shortest key in the relevant |
458
|
|
|
|
|
|
|
C<$pk_map> column, and then on the lexically lowest-valued primary-key |
459
|
|
|
|
|
|
|
value, to ensure stable return values. |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=back |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=head1 SEE ALSO |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
L |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Copyright (C) Ed J |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
472
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=cut |