| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Data::Prepare; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
112632
|
use strict; |
|
|
1
|
|
|
|
|
12
|
|
|
|
1
|
|
|
|
|
30
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
27
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use Exporter 'import'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
2045
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.004'; |
|
8
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
|
9
|
|
|
|
|
|
|
cols_non_empty |
|
10
|
|
|
|
|
|
|
non_unique_cols |
|
11
|
|
|
|
|
|
|
key_to_index |
|
12
|
|
|
|
|
|
|
make_pk_map |
|
13
|
|
|
|
|
|
|
pk_col_counts |
|
14
|
|
|
|
|
|
|
pk_match |
|
15
|
|
|
|
|
|
|
chop_lines |
|
16
|
|
|
|
|
|
|
chop_cols |
|
17
|
|
|
|
|
|
|
header_merge |
|
18
|
|
|
|
|
|
|
pk_insert |
|
19
|
|
|
|
|
|
|
); |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub chop_lines { |
|
22
|
4
|
|
|
4
|
1
|
26473
|
my ($choplines, $data) = @_; |
|
23
|
4
|
|
|
|
|
48
|
splice @$data, $_, 1 for @$choplines; |
|
24
|
|
|
|
|
|
|
} |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub chop_cols { |
|
27
|
1
|
|
|
1
|
1
|
4
|
my ($chopcols, $data) = @_; |
|
28
|
1
|
|
|
|
|
8
|
for my $c (sort {$b <=> $a} @$chopcols) { |
|
|
28
|
|
|
|
|
37
|
|
|
29
|
15
|
|
|
|
|
922
|
splice @$_, $c, 1 for @$data; |
|
30
|
|
|
|
|
|
|
} |
|
31
|
|
|
|
|
|
|
} |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my %where2offset = (up => -1, self => 0, down => 1); |
|
34
|
|
|
|
|
|
|
sub header_merge { |
|
35
|
3
|
|
|
3
|
1
|
12
|
my ($merge_spec, $data) = @_; |
|
36
|
3
|
|
|
|
|
11
|
for my $spec (@$merge_spec) { |
|
37
|
10
|
|
|
|
|
37
|
my ($do, $l, $matchfrom, $matchto) = @$spec{qw(do line matchfrom matchto)}; |
|
38
|
10
|
|
|
|
|
52
|
my ($from_row, $to_row) = map $data->[$l + $where2offset{$spec->{$_}}], qw(from to); |
|
39
|
10
|
|
100
|
|
|
42
|
my ($kept, $fromspec, $justone, $which_index) = ('', ($spec->{fromspec} || '')); |
|
40
|
10
|
100
|
100
|
|
|
58
|
if (($spec->{tospec} || '') =~ /^index:(\d+)/) { |
|
41
|
4
|
|
|
|
|
8
|
$justone = 1; |
|
42
|
4
|
|
|
|
|
9
|
$which_index = $1; |
|
43
|
|
|
|
|
|
|
} |
|
44
|
10
|
|
|
|
|
29
|
for my $i (0..$#$to_row) { |
|
45
|
76
|
|
100
|
|
|
223
|
$kept = $from_row->[$i] || $kept; |
|
46
|
76
|
100
|
100
|
|
|
326
|
next if defined $matchto and $to_row->[$i] !~ /$matchto/; |
|
47
|
50
|
100
|
100
|
|
|
156
|
next if defined $matchfrom and $from_row->[$i] !~ /$matchfrom/; |
|
48
|
48
|
100
|
|
|
|
133
|
my $basic_from = |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
$fromspec eq 'lastnonblank' ? $kept : |
|
50
|
|
|
|
|
|
|
$fromspec eq 'left' ? $from_row->[$i - 1] : |
|
51
|
|
|
|
|
|
|
$fromspec =~ /^literal:(.*)/ ? $1 : |
|
52
|
|
|
|
|
|
|
$from_row->[$justone ? $which_index : $i]; |
|
53
|
48
|
100
|
|
|
|
84
|
my $basic_to = $to_row->[$justone ? $which_index : $i]; |
|
54
|
48
|
0
|
|
|
|
124
|
my $what = |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
$do->[0] eq 'overwrite' ? $basic_from : |
|
56
|
|
|
|
|
|
|
$do->[0] eq 'prepend' ? $basic_from . $do->[1] . $basic_to : |
|
57
|
|
|
|
|
|
|
$do->[0] eq 'append' ? $basic_to . $do->[1] . $basic_from : |
|
58
|
|
|
|
|
|
|
die "Unknown action '$do->[0]'"; |
|
59
|
48
|
100
|
|
|
|
78
|
if ($justone) { |
|
60
|
4
|
|
|
|
|
7
|
$to_row->[$which_index] = $what; |
|
61
|
4
|
|
|
|
|
16
|
last; |
|
62
|
|
|
|
|
|
|
} else { |
|
63
|
44
|
|
|
|
|
79
|
$to_row->[$i] = $what; |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub pk_insert { |
|
70
|
1
|
|
|
1
|
1
|
6
|
my ($spec, $data, $pk_map, $stopwords) = @_; |
|
71
|
1
|
|
|
|
|
5
|
my ($ch, $lc, $pkc, $fb) = (@$spec{qw(column_heading local_column pk_column use_fallback)}); |
|
72
|
1
|
|
|
|
|
7
|
my $key_index = key_to_index($data->[0])->{$lc}; |
|
73
|
1
|
50
|
|
|
|
8
|
die "undef index for key '$lc'" if !defined $key_index; |
|
74
|
1
|
|
|
|
|
3
|
unshift @{ $data->[0] }, $ch; |
|
|
1
|
|
|
|
|
4
|
|
|
75
|
1
|
|
|
|
|
4
|
my $exact_map = $pk_map->{$pkc}; |
|
76
|
1
|
|
|
|
|
12
|
for my $row (@$data[ 1..$#$data ]) { |
|
77
|
196
|
|
|
|
|
367
|
my $key_val = $row->[ $key_index ]; |
|
78
|
196
|
|
|
|
|
337
|
my $pkv = $exact_map->{ $key_val }; |
|
79
|
196
|
100
|
66
|
|
|
473
|
unshift(@$row, $pkv), next if defined $pkv or !$fb; |
|
80
|
20
|
|
|
|
|
47
|
($pkv) = pk_match($key_val, $pk_map, $stopwords); |
|
81
|
20
|
|
|
|
|
64
|
unshift(@$row, $pkv); |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub cols_non_empty { |
|
86
|
1
|
|
|
1
|
1
|
17777
|
my ($data) = @_; |
|
87
|
1
|
|
|
|
|
3
|
my @col_non_empty; |
|
88
|
1
|
|
|
|
|
3
|
for my $line (@$data) { |
|
89
|
199
|
|
100
|
|
|
4538
|
$col_non_empty[$_] ||= 0 for 0..$#$line; |
|
90
|
199
|
|
|
|
|
1566
|
$col_non_empty[$_]++ for grep length $line->[$_], 0..$#$line; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
1
|
|
|
|
|
11
|
@col_non_empty; |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub non_unique_cols { |
|
96
|
1
|
|
|
1
|
1
|
3
|
my ($data) = @_; |
|
97
|
1
|
|
|
|
|
3
|
my ($line, %col2count) = $data->[0]; |
|
98
|
1
|
|
|
|
|
6
|
$col2count{$_}++ for @$line; |
|
99
|
1
|
|
|
|
|
7
|
delete @col2count{ grep $col2count{$_} == 1, keys %col2count }; |
|
100
|
1
|
|
|
|
|
4
|
\%col2count; |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub key_to_index { |
|
104
|
4
|
|
|
4
|
1
|
13
|
my ($row) = @_; |
|
105
|
4
|
|
|
|
|
143
|
+{ map +($row->[$_] => $_), 0..$#$row }; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
sub make_pk_map { |
|
109
|
1
|
|
|
1
|
1
|
16685
|
my ($data, $pk_colkey, $other_colkeys) = @_; |
|
110
|
1
|
|
|
|
|
6
|
my $k2i = key_to_index($data->[0]); |
|
111
|
1
|
|
|
|
|
8
|
my @invalid = grep !defined $k2i->{$_}, $pk_colkey, @$other_colkeys; |
|
112
|
1
|
50
|
|
|
|
5
|
die "Invalid keys (@invalid)" if @invalid; |
|
113
|
1
|
|
|
|
|
2
|
my $pk_colnum = $k2i->{$pk_colkey}; |
|
114
|
1
|
|
|
|
|
2
|
my %altcol2value2pk; |
|
115
|
1
|
|
|
|
|
5
|
for my $i (1..$#$data) { |
|
116
|
250
|
|
|
|
|
324
|
my $row = $data->[$i]; |
|
117
|
250
|
100
|
|
|
|
434
|
next if !length(my $pk_val = $row->[$pk_colnum]); |
|
118
|
249
|
|
|
|
|
349
|
for my $alt_k ($pk_colkey, @$other_colkeys) { |
|
119
|
1494
|
100
|
|
|
|
2867
|
next if !length(my $alt_v = $row->[$k2i->{$alt_k}]); |
|
120
|
1387
|
|
|
|
|
2872
|
$altcol2value2pk{$alt_k}{$alt_v} = $pk_val; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
} |
|
123
|
1
|
|
|
|
|
12
|
\%altcol2value2pk; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub pk_col_counts { |
|
127
|
1
|
|
|
1
|
1
|
5388
|
my ($data, $pk_map) = @_; |
|
128
|
1
|
|
|
|
|
4
|
my $k2i = key_to_index($data->[0]); |
|
129
|
1
|
|
|
|
|
3
|
my (%col2code2exact, @no_exact_match); |
|
130
|
1
|
|
|
|
|
5
|
for my $i (1..$#$data) { |
|
131
|
196
|
|
|
|
|
322
|
my ($row, $exact_match) = $data->[$i]; |
|
132
|
196
|
|
|
|
|
527
|
for my $possible_col (keys %$k2i) { |
|
133
|
3724
|
|
|
|
|
5457
|
my $val = $row->[ $k2i->{$possible_col} ]; |
|
134
|
3724
|
|
|
|
|
10590
|
my @match_codes_yes = grep exists $pk_map->{$_}{$val}, keys %$pk_map; |
|
135
|
3724
|
|
|
|
|
6157
|
$col2code2exact{$possible_col}{$_}++ for @match_codes_yes; |
|
136
|
3724
|
|
100
|
|
|
7267
|
$exact_match ||= @match_codes_yes; |
|
137
|
|
|
|
|
|
|
} |
|
138
|
196
|
100
|
|
|
|
498
|
push @no_exact_match, $row if !$exact_match; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
1
|
|
|
|
|
15
|
(\%col2code2exact, \@no_exact_match); |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub _match_register { |
|
144
|
1440
|
|
|
1440
|
|
2687
|
my ($matches, $code, $this_map, $pk_val2count, $pk_col2pk_value2count, $pk_val2from) = @_; |
|
145
|
|
|
|
|
|
|
$pk_val2count->{$_}++, |
|
146
|
|
|
|
|
|
|
$pk_col2pk_value2count->{$code}{$_}++ |
|
147
|
1440
|
|
|
|
|
4795
|
for map $this_map->{$_}, @$matches; |
|
148
|
1440
|
|
|
|
|
3058
|
for (@$matches) { |
|
149
|
|
|
|
|
|
|
# track longest matched-value per PK, to tie-break on shortest one |
|
150
|
858
|
|
|
|
|
1275
|
my $this_pk_val = $this_map->{$_}; |
|
151
|
|
|
|
|
|
|
$pk_val2from->{$this_pk_val} = $_ if |
|
152
|
858
|
100
|
100
|
|
|
3654
|
length($pk_val2from->{$this_pk_val}||'') < length; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub pk_match { |
|
157
|
216
|
|
|
216
|
1
|
841
|
my ($value, $pk_map, $stopwords) = @_; |
|
158
|
216
|
|
|
|
|
336
|
my %stopword; @stopword{@$stopwords} = (); |
|
|
216
|
|
|
|
|
602
|
|
|
159
|
216
|
|
|
|
|
871
|
my @val_words = grep length, split /[^A-Za-z]/, $value; |
|
160
|
216
|
50
|
|
|
|
1201
|
my $val_pat = join '.*', map +(/[A-Z]{2,}/ ? split //, $_ : $_), @val_words; |
|
161
|
216
|
|
|
|
|
964
|
@val_words = grep !exists $stopword{$_}, map lc, grep length > 2, @val_words; |
|
162
|
216
|
|
|
|
|
419
|
my (%pk_col2pk_value2count, %pk_val2count, %pk_val2from); |
|
163
|
216
|
|
|
|
|
554
|
for my $code (keys %$pk_map) { |
|
164
|
1296
|
|
|
|
|
2147
|
my $this_map = $pk_map->{$code}; |
|
165
|
1
|
|
|
1
|
|
817
|
my @matches = grep /$val_pat/i, keys %$this_map; |
|
|
1
|
|
|
|
|
32
|
|
|
|
1
|
|
|
|
|
16
|
|
|
|
1296
|
|
|
|
|
93713
|
|
|
166
|
1296
|
|
|
|
|
39154
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from); |
|
167
|
|
|
|
|
|
|
} |
|
168
|
216
|
100
|
|
|
|
1247
|
if ((my @abbrev_parts = grep length, split /\s*[\(,]\s*/, $value) > 1) { |
|
169
|
12
|
|
|
|
|
176
|
s/(.*?)[^A-Za-z]+(.*?)/$1.*$2/g for @abbrev_parts; |
|
170
|
12
|
|
|
|
|
39
|
my $suff_pref_pat = join '.*', reverse @abbrev_parts; |
|
171
|
12
|
|
|
|
|
38
|
for my $code (keys %$pk_map) { |
|
172
|
72
|
|
|
|
|
125
|
my $this_map = $pk_map->{$code}; |
|
173
|
72
|
|
|
|
|
5873
|
my @matches = grep /$suff_pref_pat/i, keys %$this_map; |
|
174
|
72
|
|
|
|
|
787
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count); |
|
175
|
72
|
|
|
|
|
9507
|
@matches = grep /^$suff_pref_pat/i, keys %$this_map; |
|
176
|
72
|
|
|
|
|
736
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count); |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
} |
|
179
|
216
|
50
|
|
|
|
608
|
if (!keys %pk_col2pk_value2count) { |
|
180
|
0
|
|
|
|
|
0
|
for my $code (keys %$pk_map) { |
|
181
|
0
|
|
|
|
|
0
|
my $this_map = $pk_map->{$code}; |
|
182
|
0
|
|
|
|
|
0
|
for my $word (@val_words) { |
|
183
|
0
|
|
|
|
|
0
|
my @matches = grep /\b\Q$word\E\b/i, keys %$this_map; |
|
184
|
0
|
|
|
|
|
0
|
_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from); |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
my ($best) = sort { |
|
189
|
216
|
|
|
|
|
588
|
$pk_val2count{$b} <=> $pk_val2count{$a} |
|
190
|
|
|
|
|
|
|
|| |
|
191
|
29
|
50
|
66
|
|
|
137
|
length($pk_val2from{$a}) <=> length($pk_val2from{$b}) |
|
192
|
|
|
|
|
|
|
|| |
|
193
|
|
|
|
|
|
|
$a cmp $b |
|
194
|
|
|
|
|
|
|
} keys %pk_val2count; |
|
195
|
216
|
|
66
|
|
|
505
|
my @pk_cols_unique_best = sort grep keys %{ $pk_col2pk_value2count{$_} } == 1 && $pk_col2pk_value2count{$_}{$best}, keys %pk_col2pk_value2count; |
|
196
|
216
|
|
|
|
|
1890
|
($best, \@pk_cols_unique_best); |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
1; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=encoding utf8 |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 NAME |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Data::Prepare - prepare CSV (etc) data for automatic processing |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
use Text::CSV qw(csv); |
|
210
|
|
|
|
|
|
|
use Data::Prepare qw( |
|
211
|
|
|
|
|
|
|
cols_non_empty non_unique_cols |
|
212
|
|
|
|
|
|
|
chop_lines chop_cols header_merge |
|
213
|
|
|
|
|
|
|
); |
|
214
|
|
|
|
|
|
|
my $data = csv(in => 'unclean.csv', encoding => "UTF-8"); |
|
215
|
|
|
|
|
|
|
chop_cols([0, 2], $data); |
|
216
|
|
|
|
|
|
|
header_merge($spec, $data); |
|
217
|
|
|
|
|
|
|
chop_lines(\@lines, $data); # mutates the data |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# or: |
|
220
|
|
|
|
|
|
|
my @non_empty_counts = cols_non_empty($data); |
|
221
|
|
|
|
|
|
|
print Dumper(non_unique_cols($data)); |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
A module with utility functions for turning spreadsheets published for |
|
226
|
|
|
|
|
|
|
human consumption into ones suitable for automatic processing. Intended |
|
227
|
|
|
|
|
|
|
to be used by the supplied L<data-prepare> script. See that script's |
|
228
|
|
|
|
|
|
|
documentation for a suggested workflow. |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
All the functions are exportable, none are exported by default. |
|
231
|
|
|
|
|
|
|
All the C<$data> inputs are an array-ref-of-array-refs. |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head2 chop_cols |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
chop_cols([0, 2], $data); |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Uses C<splice> to delete each zero-based column index. The example above |
|
240
|
|
|
|
|
|
|
deletes the first and third columns. |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=head2 chop_lines |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
chop_lines([ 0, (-1) x $n ], $data); |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Uses C<splice> to delete each zero-based line index, in the order |
|
247
|
|
|
|
|
|
|
given. The example above deletes the first, and last C<$n>, lines. |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 header_merge |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
header_merge([ |
|
252
|
|
|
|
|
|
|
{ line => 1, from => 'up', fromspec => 'lastnonblank', to => 'self', matchto => 'HH', do => [ 'overwrite' ] }, |
|
253
|
|
|
|
|
|
|
{ line => 1, from => 'self', matchfrom => '.', to => 'down', do => [ 'prepend', ' ' ] }, |
|
254
|
|
|
|
|
|
|
{ line => 2, from => 'self', fromspec => 'left', to => 'self', matchto => 'Year', do => [ 'prepend', '/' ] }, |
|
255
|
|
|
|
|
|
|
{ line => 2, from => 'self', fromspec => 'literal:Country', to => 'self', tospec => 'index:0', do => [ 'overwrite' ] }, |
|
256
|
|
|
|
|
|
|
], $data); |
|
257
|
|
|
|
|
|
|
# Turns: |
|
258
|
|
|
|
|
|
|
# [ |
|
259
|
|
|
|
|
|
|
# [ '', 'Proportion of households with', '', '', '' ], |
|
260
|
|
|
|
|
|
|
# [ '', '(HH1)', 'Year', '(HH2)', 'Year' ], |
|
261
|
|
|
|
|
|
|
# [ '', 'Radio', 'of data', 'TV', 'of data' ], |
|
262
|
|
|
|
|
|
|
# ] |
|
263
|
|
|
|
|
|
|
# into (after a further chop_lines to remove the first two): |
|
264
|
|
|
|
|
|
|
# [ |
|
265
|
|
|
|
|
|
|
# [ |
|
266
|
|
|
|
|
|
|
# 'Country', |
|
267
|
|
|
|
|
|
|
# 'Proportion of households with Radio', 'Proportion of households with Radio/Year of data', |
|
268
|
|
|
|
|
|
|
# 'Proportion of households with TV', 'Proportion of households with TV/Year of data' |
|
269
|
|
|
|
|
|
|
# ] |
|
270
|
|
|
|
|
|
|
# ] |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
Applies the given transformations to the given data, so you can make the |
|
273
|
|
|
|
|
|
|
given data have the first row be your desired headers for the columns. |
|
274
|
|
|
|
|
|
|
As shown in the above example, this does not delete lines so further |
|
275
|
|
|
|
|
|
|
operations may be needed. |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Broadly, each hash-ref specifies one operation, which acts on a single |
|
278
|
|
|
|
|
|
|
(specified) line-number. It scans along that line from left to right, |
|
279
|
|
|
|
|
|
|
unless C<tospec> matches C<index:\d+> in which case only one operation |
|
280
|
|
|
|
|
|
|
is done. |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
The above merge operations in YAML format: |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
spec: |
|
285
|
|
|
|
|
|
|
- do: |
|
286
|
|
|
|
|
|
|
- overwrite |
|
287
|
|
|
|
|
|
|
from: up |
|
288
|
|
|
|
|
|
|
fromspec: lastnonblank |
|
289
|
|
|
|
|
|
|
line: 2 |
|
290
|
|
|
|
|
|
|
matchto: HH |
|
291
|
|
|
|
|
|
|
to: self |
|
292
|
|
|
|
|
|
|
- do: |
|
293
|
|
|
|
|
|
|
- prepend |
|
294
|
|
|
|
|
|
|
- ' ' |
|
295
|
|
|
|
|
|
|
from: self |
|
296
|
|
|
|
|
|
|
line: 2 |
|
297
|
|
|
|
|
|
|
matchfrom: . |
|
298
|
|
|
|
|
|
|
to: down |
|
299
|
|
|
|
|
|
|
- do: |
|
300
|
|
|
|
|
|
|
- prepend |
|
301
|
|
|
|
|
|
|
- / |
|
302
|
|
|
|
|
|
|
from: self |
|
303
|
|
|
|
|
|
|
fromspec: left |
|
304
|
|
|
|
|
|
|
line: 3 |
|
305
|
|
|
|
|
|
|
matchto: Year |
|
306
|
|
|
|
|
|
|
to: self |
|
307
|
|
|
|
|
|
|
- do: |
|
308
|
|
|
|
|
|
|
- overwrite |
|
309
|
|
|
|
|
|
|
from: self |
|
310
|
|
|
|
|
|
|
fromspec: literal:Country |
|
311
|
|
|
|
|
|
|
line: 3 |
|
312
|
|
|
|
|
|
|
to: self |
|
313
|
|
|
|
|
|
|
tospec: index:0 |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
This turns the first three lines of data excerpted from the supplied example |
|
316
|
|
|
|
|
|
|
data (shown in CSV with spaces inserted for alignment reasons only): |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
,Proportion of households with, , , |
|
319
|
|
|
|
|
|
|
,(HH1) ,Year ,(HH2),Year |
|
320
|
|
|
|
|
|
|
,Radio ,of data,TV ,of data |
|
321
|
|
|
|
|
|
|
Belize,58.7 ,2019 ,78.7 ,2019 |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
into the following. Note that the first two lines will still be present |
|
324
|
|
|
|
|
|
|
(not shown), possibly modified, so you will need your chop_lines to |
|
325
|
|
|
|
|
|
|
remove them. The columns of the third line are shown, one per line, |
|
326
|
|
|
|
|
|
|
for readability: |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Country, |
|
329
|
|
|
|
|
|
|
Proportion of households with Radio, |
|
330
|
|
|
|
|
|
|
Proportion of households with Radio/Year of data, |
|
331
|
|
|
|
|
|
|
Proportion of households with TV, |
|
332
|
|
|
|
|
|
|
Proportion of households with TV/Year of data |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
This achieves a single row of column-headings, with each column-heading |
|
335
|
|
|
|
|
|
|
being unique, and sufficiently meaningful. |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head2 pk_insert |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
pk_insert({ |
|
340
|
|
|
|
|
|
|
column_heading => 'ISO3CODE', |
|
341
|
|
|
|
|
|
|
local_column => 'Country', |
|
342
|
|
|
|
|
|
|
pk_column => 'official_name_en', |
|
343
|
|
|
|
|
|
|
}, $data, $pk_map, $stopwords); |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
In YAML format, this is the same configuration: |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
pk_insert: |
|
348
|
|
|
|
|
|
|
- files: |
|
349
|
|
|
|
|
|
|
- examples/CoreHouseholdIndicators.csv |
|
350
|
|
|
|
|
|
|
spec: |
|
351
|
|
|
|
|
|
|
column_heading: ISO3CODE |
|
352
|
|
|
|
|
|
|
local_column: Country |
|
353
|
|
|
|
|
|
|
pk_column: official_name_en |
|
354
|
|
|
|
|
|
|
use_fallback: true |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
And the C<$pk_map> made with L</make_pk_map>, inserts the |
|
357
|
|
|
|
|
|
|
C<column_heading> in front of the current zero-th column, mapping the |
|
358
|
|
|
|
|
|
|
value of the C<local_column> column as looked up from the specified column |
|
359
|
|
|
|
|
|
|
of the C<pk_spec> file, and if C<use_fallback> is true, also tries |
|
360
|
|
|
|
|
|
|
L</pk_match> if no exact match is found. In that case, C<stopwords> |
|
361
|
|
|
|
|
|
|
must be specified in the configuration. |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=head2 cols_non_empty |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
my @col_non_empty = cols_non_empty($data); |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
In the given data, iterates through all rows and returns a list of |
|
368
|
|
|
|
|
|
|
quantities of non-blank entries in each column. This can be useful to spot |
|
369
|
|
|
|
|
|
|
columns with only a couple of entries, which are more usefully chopped. |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head2 non_unique_cols |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
my $col2count = non_unique_cols($data); |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Takes the first row of the given data, and returns a hash-ref mapping |
|
376
|
|
|
|
|
|
|
any non-unique column-names to the number of times they appear. |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=head2 key_to_index |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Given an array-ref (probably the first row of a CSV file, i.e. column |
|
381
|
|
|
|
|
|
|
headings), returns a hash-ref mapping the cell values to their zero-based |
|
382
|
|
|
|
|
|
|
index. |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 make_pk_map |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
my $altcol2value2pk = make_pk_map($data, $pk_colkey, \@other_colkeys); |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Given C<$data>, the heading of the primary-key column, and an array-ref |
|
389
|
|
|
|
|
|
|
of headings of alternative key columns, returns a hash-ref mapping each |
|
390
|
|
|
|
|
|
|
of those alternative key columns (plus the C<$pk_colkey>) to a map from |
|
391
|
|
|
|
|
|
|
that column's value to the relevant row's primary-key value. |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
This is most conveniently represented in YAML format: |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
pk_spec: |
|
396
|
|
|
|
|
|
|
file: examples/country-codes.csv |
|
397
|
|
|
|
|
|
|
primary_key: ISO3166-1-Alpha-3 |
|
398
|
|
|
|
|
|
|
alt_keys: |
|
399
|
|
|
|
|
|
|
- ISO3166-1-Alpha-2 |
|
400
|
|
|
|
|
|
|
- UNTERM English Short |
|
401
|
|
|
|
|
|
|
- UNTERM English Formal |
|
402
|
|
|
|
|
|
|
- official_name_en |
|
403
|
|
|
|
|
|
|
- CLDR display name |
|
404
|
|
|
|
|
|
|
stopwords: |
|
405
|
|
|
|
|
|
|
- islands |
|
406
|
|
|
|
|
|
|
- china |
|
407
|
|
|
|
|
|
|
- northern |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=head2 pk_col_counts |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
my ($colname2potential_key2count, $no_exact_match) = pk_col_counts($data, $pk_map); |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Given C<$data> and a primary-key (etc) map created by the above, returns |
|
414
|
|
|
|
|
|
|
a tuple of a hash-ref mapping each column that gave any matches to a |
|
415
|
|
|
|
|
|
|
further hash-ref mapping each of the potential key columns given above |
|
416
|
|
|
|
|
|
|
to how many matches it gave, and an array-ref of rows that had no exact |
|
417
|
|
|
|
|
|
|
matches. |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=head2 pk_match |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
my ($best, $pk_cols_unique_best) = pk_match($value, $pk_map, $stopwords); |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Given a value, C<$pk_map>, and an array-ref of case-insensitive stopwords, |
|
424
|
|
|
|
|
|
|
returns its best match for the right primary-key value, and an array-ref |
|
425
|
|
|
|
|
|
|
of which primary-key columns in the C<$pk_map> matched the given value |
|
426
|
|
|
|
|
|
|
exactly once. |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
The latter is useful for analysis purposes to select which primary-key |
|
429
|
|
|
|
|
|
|
column to use for this data-set. |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
The algorithm used for this best-match: |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=over |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=item * |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
Splits the value into words (or where a word is two or more capital |
|
438
|
|
|
|
|
|
|
letters, letters). The search allows any, or no, text, to occur between |
|
439
|
|
|
|
|
|
|
these entities. Each configured primary-key column's keys are searched |
|
440
|
|
|
|
|
|
|
for matches. |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item * |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
If there is a separating C<,> or C<(> (as commonly used for |
|
445
|
|
|
|
|
|
|
abbreviations), splits the value into chunks, reverses them, and then |
|
446
|
|
|
|
|
|
|
reassembles the chunks as above for a similar search. |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item * |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Only if there were no matches from the previous steps, splits the value |
|
451
|
|
|
|
|
|
|
into words. Words that are shorter than three characters, or that occur in |
|
452
|
|
|
|
|
|
|
the stopword list, are omitted. Then each word is searched for as above. |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=item * |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
"Votes" on which primary-key value got the most matches. Tie-breaks on |
|
457
|
|
|
|
|
|
|
which primary-key value matched on the shortest key in the relevant |
|
458
|
|
|
|
|
|
|
C<$pk_map> column, and then on the lexically lowest-valued primary-key |
|
459
|
|
|
|
|
|
|
value, to ensure stable return values. |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=back |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
L<Text::CSV> |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Copyright (C) Ed J |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
472
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=cut |