line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Data::FeatureFactory; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
132317
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
64
|
|
4
|
2
|
|
|
2
|
|
12
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
153
|
|
5
|
2
|
|
|
2
|
|
11
|
use File::Basename; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
144
|
|
6
|
2
|
|
|
2
|
|
10
|
use Scalar::Util; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
744
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.0404'; |
9
|
|
|
|
|
|
|
my $PATH = &{ sub { return dirname( (caller)[1] ) } }; |
10
|
|
|
|
|
|
|
my $OPEN_OPTIONS; |
11
|
|
|
|
|
|
|
our $CURRENT_FEATURE; |
12
|
|
|
|
|
|
|
my %KNOWN_FORMATS = map {;$_=>1} qw/binary normal numeric/; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# check if perl can open files in utf8 |
15
|
|
|
|
|
|
|
{ |
16
|
|
|
|
|
|
|
my $fh; |
17
|
|
|
|
|
|
|
undef $@; |
18
|
2
|
|
|
2
|
|
20
|
eval { open $fh, '<:encoding(utf8)', $0 }; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
50
|
|
19
|
|
|
|
|
|
|
if ($@) { |
20
|
|
|
|
|
|
|
$OPEN_OPTIONS = ''; |
21
|
|
|
|
|
|
|
warn qq{the open's :encoding directive not supported by your perl ($]). Files won't be opened in utf8 format.}; |
22
|
|
|
|
|
|
|
} |
23
|
|
|
|
|
|
|
else { $OPEN_OPTIONS = ':encoding(utf8)' } |
24
|
|
|
|
|
|
|
close $fh; |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub new : method { |
28
|
30
|
|
|
30
|
0
|
39586
|
my ($class, $args) = @_; |
29
|
30
|
50
|
|
|
|
95
|
$class = ref $class if ref $class; |
30
|
30
|
50
|
|
|
|
77
|
croak "Too many parameters to $class->new" if @_ > 2; |
31
|
30
|
|
|
|
|
62
|
my $self = bless +{}, $class; |
32
|
|
|
|
|
|
|
|
33
|
30
|
100
|
|
|
|
80
|
if (defined $args) { |
34
|
3
|
50
|
|
|
|
30
|
croak "The parameter to ${class}->new must be a hashref with options or nothing" if ref $args ne 'HASH'; |
35
|
3
|
|
|
|
|
8
|
my %accepted_option = map {;$_=>1} qw(N/A); |
|
3
|
|
|
|
|
16
|
|
36
|
3
|
|
|
|
|
19
|
while (my ($k, $v) = each %$args) { |
37
|
3
|
50
|
|
|
|
14
|
if (not exists $accepted_option{$k}) { |
38
|
0
|
|
|
|
|
0
|
croak "Unexpected option '$k' passed to ${class}->new" |
39
|
|
|
|
|
|
|
} |
40
|
3
|
50
|
|
|
|
11
|
if ($k eq 'N/A') { |
41
|
3
|
|
|
|
|
34
|
$self->{'N/A'} = "$v"; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
2
|
|
|
2
|
|
16
|
no strict 'refs'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
1250
|
|
47
|
30
|
50
|
|
|
|
35
|
if (not defined @{$class."::features"}) { |
|
30
|
|
|
|
|
164
|
|
48
|
0
|
|
|
|
|
0
|
croak "\@${class}::features not defined"; |
49
|
|
|
|
|
|
|
} |
50
|
30
|
|
|
|
|
33
|
our @features; |
51
|
30
|
|
|
|
|
36
|
*features = \@{$class."::features"}; |
|
30
|
|
|
|
|
88
|
|
52
|
2
|
|
|
2
|
|
12
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
790
|
|
53
|
30
|
50
|
|
|
|
75
|
if (not @features) { |
54
|
0
|
|
|
|
|
0
|
warn "$class has empty set of features. Not much fun"; |
55
|
|
|
|
|
|
|
} |
56
|
30
|
|
|
|
|
168
|
$self->{'features'} = []; |
57
|
30
|
|
|
|
|
70
|
my %feat_named; |
58
|
30
|
|
|
|
|
63
|
$self->{'feat_named'} = \%feat_named; |
59
|
30
|
|
|
|
|
32
|
my @featkeys; |
60
|
30
|
|
|
|
|
47
|
$self->{'featkeys'} = \@featkeys; |
61
|
30
|
|
|
|
|
1116
|
$self->{'caller_path'} = dirname( (caller)[1] ); |
62
|
|
|
|
|
|
|
|
63
|
30
|
|
|
|
|
66
|
my %supported_option = ( map {;$_=>1} qw(code default format label name postproc range type values values_file) ); |
|
300
|
|
|
|
|
546
|
|
64
|
30
|
|
|
|
|
97
|
my %accepted_option = ( map {;$_=>1} qw(cat2num cat2num_dyna num2cat num2cat_dyna num_values_fh values_ordered) ); |
|
180
|
|
|
|
|
336
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# parse the @features array |
67
|
30
|
|
|
|
|
76
|
for my $original_feature (@features) { |
68
|
110
|
|
|
|
|
395
|
my $feature = { %$original_feature }; |
69
|
110
|
100
|
|
|
|
325
|
if (not exists $feature->{'name'}) { |
70
|
1
|
|
|
|
|
217
|
croak q{There was a feature without a name. Each record in the @features array must be a hashref with a 'name' field at least}; |
71
|
|
|
|
|
|
|
} |
72
|
109
|
|
|
|
|
182
|
my $name = $feature->{'name'}; |
73
|
109
|
100
|
|
|
|
223
|
if (exists $feat_named{$name}) { |
74
|
1
|
|
|
|
|
177
|
croak "Feature $name specified twice in \@${class}::features"; |
75
|
|
|
|
|
|
|
} |
76
|
108
|
|
|
|
|
121
|
push @{ $self->{'features'} }, $feature; |
|
108
|
|
|
|
|
277
|
|
77
|
108
|
|
|
|
|
285
|
$feat_named{$name} = $feature; |
78
|
108
|
|
|
|
|
156
|
push @featkeys, $name; |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Check if there aren't illegal options |
81
|
108
|
|
|
|
|
306
|
for (keys %$feature) { |
82
|
299
|
100
|
|
|
|
695
|
if (not exists $supported_option{$_}) { |
83
|
2
|
100
|
|
|
|
8
|
if (exists $accepted_option{$_}) { |
84
|
1
|
|
|
|
|
13
|
warn "Option '$_' you specified for feature '$name' is not supported. Be sure you know what you are doing" |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
else { |
87
|
1
|
|
|
|
|
169
|
croak "Unrecognized option '$_' specified for feature '$name'"; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Check if a postprocessing subroutine is declared |
93
|
|
|
|
|
|
|
# If it's a CODEref, we're OK. Else try to load it. |
94
|
107
|
100
|
100
|
|
|
398
|
if (exists $feature->{'postproc'} and ref $feature->{'postproc'} ne 'CODE') { |
95
|
4
|
|
|
|
|
5
|
my $postproc = $feature->{'postproc'}; |
96
|
2
|
|
|
2
|
|
14
|
no strict 'refs'; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
4781
|
|
97
|
4
|
|
|
|
|
7
|
my $postprocsub = \&{$postproc}; |
|
4
|
|
|
|
|
57
|
|
98
|
4
|
|
|
|
|
7
|
undef $@; |
99
|
4
|
|
|
|
|
6
|
eval { $postprocsub->() }; |
|
4
|
|
|
|
|
35
|
|
100
|
4
|
100
|
|
|
|
127
|
if ($@ =~ /Undefined subroutine/) { |
|
|
50
|
|
|
|
|
|
101
|
3
|
|
|
|
|
13
|
my ($package_name) = $postproc =~ /^( (?: \w+:: )+ )/x; |
102
|
3
|
|
|
|
|
4
|
my $ppname; |
103
|
3
|
100
|
66
|
|
|
16
|
if (defined $package_name and length $package_name > 0) { |
104
|
2
|
|
|
|
|
8
|
$package_name =~ s/::$//; |
105
|
2
|
|
|
|
|
14
|
local @INC = (@INC, $self->{'caller_path'}); |
106
|
2
|
|
|
|
|
3
|
undef $@; |
107
|
2
|
|
|
|
|
135
|
eval "require $package_name"; |
108
|
2
|
50
|
|
|
|
23660
|
if ($@) { |
109
|
0
|
|
|
|
|
0
|
warn "Failed loading module '$package_name'"; |
110
|
|
|
|
|
|
|
} |
111
|
2
|
|
|
|
|
11
|
$ppname = $postproc; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
else { |
114
|
1
|
|
|
|
|
4
|
$ppname = $class.'::'.$postproc; |
115
|
|
|
|
|
|
|
} |
116
|
3
|
|
|
|
|
7
|
$postprocsub = \&{$ppname}; |
|
3
|
|
|
|
|
8
|
|
117
|
3
|
|
|
|
|
5
|
undef $@; |
118
|
3
|
|
|
|
|
7
|
eval { $postprocsub->() }; |
|
3
|
|
|
|
|
6
|
|
119
|
3
|
50
|
|
|
|
506
|
if ($@ =~ /^Undefined subroutine/) { |
120
|
0
|
|
|
|
|
0
|
croak "Couldn't load postprocessing function '$postproc' ($@)" |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
elsif ($@ =~ /^Undefined subroutine/) { |
124
|
0
|
|
|
|
|
0
|
croak "Couldn't load postprocessing function '$postproc' ($@)" |
125
|
|
|
|
|
|
|
} |
126
|
4
|
|
|
|
|
12
|
$feature->{'postproc'} = $postprocsub; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Check if values are specified and if they are a list of values. |
130
|
107
|
100
|
|
|
|
233
|
if (exists $feature->{'values'}) { |
131
|
29
|
100
|
|
|
|
71
|
if (exists $feature->{'values_file'}) { |
132
|
1
|
|
|
|
|
236
|
croak "Values specified both explicitly and by file for '$name'" |
133
|
|
|
|
|
|
|
} |
134
|
28
|
|
|
|
|
48
|
my $values = $feature->{'values'}; |
135
|
28
|
100
|
|
|
|
105
|
if (ref $values eq 'HASH') { # OK, do nothing |
|
|
50
|
|
|
|
|
|
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
elsif (ref $values eq 'ARRAY') { # Convert the list to a hash. |
138
|
24
|
|
|
|
|
44
|
my %values = map {;$_ => 1} @$values; |
|
228
|
|
|
|
|
573
|
|
139
|
24
|
|
|
|
|
69
|
$feature->{'values_ordered'} = $values; |
140
|
24
|
|
|
|
|
54
|
$feature->{'values'} = \%values; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
else { |
143
|
0
|
|
|
|
|
0
|
my $type; |
144
|
0
|
0
|
|
|
|
0
|
if (ref $values) { |
145
|
0
|
|
|
|
|
0
|
$type = lc(ref $values).'ref'; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
else { |
148
|
0
|
|
|
|
|
0
|
$type = lc(ref \$values); |
149
|
|
|
|
|
|
|
} |
150
|
0
|
|
|
|
|
0
|
croak "The values must be specified as an arrayref or hashref, not $type" |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
106
|
100
|
|
|
|
266
|
if (exists $feature->{'values_file'}) { |
155
|
4
|
|
|
|
|
19
|
my $values_fn = $feature->{'values_file'}; |
156
|
4
|
|
|
|
|
143
|
my $opened = open my $values_fh, '<'.$OPEN_OPTIONS, $values_fn; |
157
|
4
|
50
|
|
|
|
21
|
if (not $opened) { |
158
|
4
|
50
|
|
|
|
179
|
open $values_fh, '<'.$OPEN_OPTIONS, $self->{'caller_path'}.'/'.$values_fn |
159
|
|
|
|
|
|
|
or croak "Couldn't open file '$values_fn' specifying values for $name" |
160
|
|
|
|
|
|
|
} |
161
|
4
|
|
|
|
|
249
|
my %values; |
162
|
|
|
|
|
|
|
my @values; |
163
|
4
|
|
|
|
|
154
|
while (<$values_fh>) { |
164
|
104
|
|
|
|
|
168
|
chomp; |
165
|
104
|
|
|
|
|
285
|
$values{$_} = 1; |
166
|
104
|
|
|
|
|
361
|
push @values, $_; |
167
|
|
|
|
|
|
|
} |
168
|
4
|
|
|
|
|
369
|
close $values_fh; |
169
|
4
|
|
|
|
|
17
|
$feature->{'values'} = \%values; |
170
|
4
|
|
|
|
|
33
|
$feature->{'values_ordered'} = \@values; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
106
|
100
|
|
|
|
212
|
if (exists $feature->{'range'}) { |
174
|
13
|
100
|
|
|
|
31
|
if (exists $feature->{'values'}) { |
175
|
2
|
|
|
|
|
363
|
croak "Both range and values specified for feature '$name'" |
176
|
|
|
|
|
|
|
} |
177
|
11
|
100
|
|
|
|
233
|
$feature->{'range'} =~ /^ (.+?) \s* \.{2,} \s* (.+) $/x |
178
|
|
|
|
|
|
|
or croak "Malformed range '$$feature{range}' of feature '$name'. Should be in format '0 .. 5'"; |
179
|
10
|
|
|
|
|
55
|
my $l = $1+0; |
180
|
10
|
|
|
|
|
40
|
my $r = $2+0; |
181
|
10
|
100
|
|
|
|
36
|
if (not $l < $r) { |
182
|
1
|
|
|
|
|
156
|
croak "Invalid range '$$feature{range}' specified for feature '$name'. The left boundary must be lesser than the right one" |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
9
|
100
|
|
|
|
46
|
if ($feature->{'type'} =~ /^int/i) { |
|
|
50
|
|
|
|
|
|
186
|
7
|
|
|
|
|
29
|
$feature->{'values'} = {map {;$_ => 1} $l .. $r}; |
|
37
|
|
|
|
|
90
|
|
187
|
7
|
|
|
|
|
30
|
$feature->{'values_ordered'} = [$l .. $r]; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
elsif ($feature->{'type'} =~ /^num/i) { |
190
|
2
|
|
|
|
|
5
|
$feature->{'range_l'} = $l; |
191
|
2
|
|
|
|
|
6
|
$feature->{'range_r'} = $r; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
102
|
100
|
|
|
|
217
|
if (exists $feature->{'default'}) { |
196
|
14
|
50
|
66
|
|
|
43
|
if (not exists $feature->{'values'} and not exists $feature->{'range_l'}) { |
197
|
0
|
|
|
|
|
0
|
croak "Default value '$$feature{default}' but no values specified for feature '$name'" |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
102
|
100
|
|
|
|
223
|
if (exists $feature->{'type'}) { |
202
|
41
|
|
|
|
|
90
|
my $type = lc substr $feature->{'type'}, 0, 3; |
203
|
41
|
|
|
|
|
71
|
my $type_OK = grep {$type eq $_} qw(boo int num cat); |
|
164
|
|
|
|
|
270
|
|
204
|
41
|
50
|
|
|
|
85
|
if (not $type_OK) { |
205
|
0
|
|
|
|
|
0
|
croak "The type of a feature, if given, should be 'integer', 'numeric', or 'categorial'" |
206
|
|
|
|
|
|
|
} |
207
|
41
|
|
|
|
|
78
|
$feature->{'type'} = $type; |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# check if the values comply to the type |
210
|
41
|
100
|
|
|
|
183
|
if ($type eq 'boo') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
211
|
15
|
100
|
|
|
|
34
|
if (exists $feature->{'values'}) { |
212
|
5
|
50
|
|
|
|
10
|
my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); |
|
5
|
|
|
|
|
12
|
|
|
0
|
|
|
|
|
0
|
|
213
|
5
|
100
|
|
|
|
12
|
if (@values > 2) { |
214
|
1
|
|
|
|
|
2
|
my $num_values = @values; |
215
|
1
|
|
|
|
|
154
|
croak "More than two values ($num_values) specified for feature '$name'" |
216
|
|
|
|
|
|
|
} |
217
|
4
|
|
|
|
|
5
|
my ($false, $true); |
218
|
|
|
|
|
|
|
# boolify the values |
219
|
4
|
|
|
|
|
7
|
for (@values) { |
220
|
7
|
100
|
|
|
|
11
|
if ($_) { |
221
|
4
|
100
|
|
|
|
9
|
if (defined $true) { |
222
|
1
|
|
|
|
|
257
|
croak "True value (literal: '$true', '$_') for feature '$name' specified multiple times" |
223
|
|
|
|
|
|
|
} |
224
|
3
|
|
|
|
|
3
|
$true = $_; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
else { |
227
|
3
|
100
|
|
|
|
9
|
if (defined $false) { |
228
|
1
|
|
|
|
|
162
|
croak "False value (literal: '$false', '$_') for feature '$name' specified multiple times" |
229
|
|
|
|
|
|
|
} |
230
|
2
|
|
|
|
|
2
|
$false = $_; |
231
|
|
|
|
|
|
|
} |
232
|
5
|
100
|
|
|
|
12
|
$_ = $_ ? 1 : 0; |
233
|
|
|
|
|
|
|
} |
234
|
2
|
50
|
|
|
|
6
|
if (exists $feature->{'values_ordered'}) { |
235
|
2
|
|
|
|
|
4
|
$feature->{'values_ordered'} = \@values; |
236
|
|
|
|
|
|
|
} |
237
|
2
|
|
|
|
|
3
|
$feature->{'values'} = +{ map {;$_=>1} @values }; |
|
3
|
|
|
|
|
10
|
|
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
else { |
240
|
10
|
|
|
|
|
37
|
$feature->{'values'} = {0 => 1, 1 => 1}; |
241
|
10
|
|
|
|
|
25
|
$feature->{'values_ordered'} = [0,1]; |
242
|
|
|
|
|
|
|
} |
243
|
12
|
100
|
|
|
|
41
|
if (exists $feature->{'default'}) { |
244
|
2
|
|
|
|
|
3
|
my $def = $feature->{'default'}; |
245
|
2
|
|
|
|
|
3
|
my @vals = values %{ $feature->{'values'} }; |
|
2
|
|
|
|
|
6
|
|
246
|
2
|
100
|
|
|
|
6
|
if (@vals > 1) { |
247
|
1
|
|
|
|
|
169
|
croak "Default value '$def' specified for boolean feature '$name' which has both values allowed" |
248
|
|
|
|
|
|
|
} |
249
|
1
|
50
|
25
|
|
|
10
|
unless ($def xor $vals[0]) { |
250
|
1
|
50
|
|
|
|
7
|
my $val = $def ? 'true' : 'false'; |
251
|
1
|
|
|
|
|
173
|
croak "Default and allowed value are both $val for feature '$name'"; |
252
|
|
|
|
|
|
|
} |
253
|
0
|
0
|
|
|
|
0
|
$feature->{'default'} = $def ? 1 : 0; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
elsif ($type eq 'int') { |
257
|
14
|
100
|
|
|
|
59
|
if (exists $feature->{'values'}) { |
258
|
7
|
50
|
|
|
|
23
|
my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); |
|
7
|
|
|
|
|
24
|
|
|
0
|
|
|
|
|
0
|
|
259
|
|
|
|
|
|
|
# integrify the values |
260
|
7
|
|
|
|
|
15
|
for (@values) { |
261
|
37
|
|
|
|
|
52
|
$_ = int $_; |
262
|
|
|
|
|
|
|
} |
263
|
7
|
50
|
|
|
|
20
|
if (exists $feature->{'values_ordered'}) { |
264
|
7
|
|
|
|
|
16
|
$feature->{'values_ordered'} = \@values; |
265
|
|
|
|
|
|
|
} |
266
|
7
|
|
|
|
|
15
|
$feature->{'values'} = +{ map {;$_=>1} @values }; |
|
37
|
|
|
|
|
81
|
|
267
|
|
|
|
|
|
|
} |
268
|
14
|
100
|
|
|
|
57
|
if (exists $feature->{'default'}) { |
269
|
1
|
|
|
|
|
40
|
$feature->{'default'} = int $feature->{'default'}; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
elsif ($type eq 'num') { |
273
|
|
|
|
|
|
|
# numify the features, producing warnings if used |
274
|
9
|
50
|
|
|
|
30
|
if (exists $feature->{'values'}) { |
275
|
0
|
0
|
|
|
|
0
|
my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
276
|
0
|
|
|
|
|
0
|
for (@values) { |
277
|
0
|
|
|
|
|
0
|
$_ += 0; |
278
|
|
|
|
|
|
|
} |
279
|
0
|
0
|
|
|
|
0
|
if (exists $feature->{'values_ordered'}) { |
280
|
0
|
|
|
|
|
0
|
$feature->{'values_ordered'} = \@values; |
281
|
|
|
|
|
|
|
} |
282
|
0
|
|
|
|
|
0
|
$feature->{'values'} = +{ map {;$_=>1} @values }; |
|
0
|
|
|
|
|
0
|
|
283
|
|
|
|
|
|
|
} |
284
|
9
|
100
|
|
|
|
29
|
if (exists $feature->{'default'}) { |
285
|
1
|
|
|
|
|
3
|
$feature->{'default'} += 0; |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
97
|
100
|
|
|
|
280
|
if (exists $feature->{'format'}) { |
291
|
9
|
|
|
|
|
21
|
my $format = $feature->{'format'}; |
292
|
9
|
100
|
|
|
|
51
|
if (not $format =~ /^ (?: normal | numeric | binary ) $/x) { |
293
|
1
|
|
|
|
|
245
|
croak "Invalid format '$format' specified for feature '$name'. Please specify 'normal', 'numeric' or 'binary'" |
294
|
|
|
|
|
|
|
} |
295
|
8
|
100
|
66
|
|
|
33
|
if (not exists $feature->{'values'} and $format eq 'binary') { |
296
|
1
|
|
|
|
|
228
|
croak "Feature '$name' has format: 'binary' specified but doesn't have values specified" |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# find the actual code of the feature |
301
|
95
|
|
|
|
|
100
|
my $code; |
302
|
2
|
|
|
2
|
|
14
|
no strict 'refs'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
12650
|
|
303
|
95
|
100
|
100
|
|
|
180
|
if (exists $feature->{'code'}) { |
|
49
|
100
|
|
|
|
235
|
|
304
|
46
|
|
|
|
|
66
|
$code = $feature->{'code'}; |
305
|
46
|
50
|
|
|
|
110
|
if (ref $code ne 'CODE') { |
306
|
0
|
|
|
|
|
0
|
croak "'code' was specified for feature '$name' but it's not a coderef" |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
} |
309
|
2
|
|
|
|
|
12
|
elsif (%{$class.'::features'} and exists ${$class.'::features'}{$name}) { |
310
|
1
|
|
|
|
|
2
|
$code = ${$class.'::features'}{$name}; |
|
1
|
|
|
|
|
4
|
|
311
|
1
|
50
|
|
|
|
13
|
if (ref $code ne 'CODE') { |
312
|
0
|
|
|
|
|
0
|
croak "Found $name in \%${class}::features but it's not a coderef" |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
else { |
316
|
48
|
|
|
|
|
48
|
$code = *{$class.'::'.$name}{CODE}; |
|
48
|
|
|
|
|
149
|
|
317
|
48
|
50
|
|
|
|
134
|
if (ref $code ne 'CODE') { |
318
|
0
|
|
|
|
|
0
|
croak "Couldn't find the code (function) for feature '$name'. Define it as a function '$name' in the '$class' package. Stopped" |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
} |
321
|
95
|
|
|
|
|
241
|
$feature->{'code'} = $code; |
322
|
|
|
|
|
|
|
|
323
|
95
|
100
|
|
|
|
281
|
if (exists $feature->{'label'}) { |
324
|
23
|
|
|
|
|
37
|
my $label = $feature->{'label'}; |
325
|
23
|
100
|
|
|
|
54
|
if (ref $label eq 'ARRAY') { |
|
|
50
|
|
|
|
|
|
326
|
13
|
|
|
|
|
21
|
$feature->{'label'} = {map {;uc($_) => 1} @$label}; |
|
31
|
|
|
|
|
119
|
|
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
elsif (ref $label) { |
329
|
0
|
|
|
|
|
0
|
croak "Label must be a string or an array of strings - feature '$name' has a ".ref($label).'ref' |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
else { |
332
|
10
|
|
|
|
|
102
|
$feature->{'label'} = {uc($label) => 1}; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
# print map "*$_\n", map keys(%$_), @{ $self->{'features'} }; |
337
|
15
|
|
|
|
|
104
|
return $self |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
sub expand_names : method { |
341
|
194
|
|
|
194
|
0
|
339
|
my ($self, $featnames) = @_; |
342
|
194
|
100
|
100
|
|
|
970
|
if (not ref $featnames and exists $self->{expand_names_cache}{$featnames}) { |
343
|
105
|
|
|
|
|
329
|
return $self->{expand_names_cache}{$featnames} |
344
|
|
|
|
|
|
|
} |
345
|
89
|
|
|
|
|
107
|
my $orig_featnames = $featnames; |
346
|
89
|
|
|
|
|
100
|
my @featkeys = @{ $self->{'featkeys'} }; |
|
89
|
|
|
|
|
318
|
|
347
|
89
|
|
|
|
|
125
|
my %feat_named = %{ $self->{'feat_named'} }; |
|
89
|
|
|
|
|
501
|
|
348
|
|
|
|
|
|
|
|
349
|
89
|
100
|
66
|
|
|
494
|
if ($featnames eq 'ALL') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
350
|
4
|
|
|
|
|
10
|
$featnames = \@featkeys; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
elsif (ref $featnames eq 'ARRAY') { |
353
|
|
|
|
|
|
|
# $featnames = [@$featnames]; # make a copy |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
# features given by labels |
356
|
|
|
|
|
|
|
elsif ($featnames !~ /[[:lower:]]/ and $featnames =~ /[[:upper:]]/) { |
357
|
10
|
|
|
|
|
36
|
my @all_labels = split /\s+/, $featnames; |
358
|
10
|
|
|
|
|
19
|
my @plus_labels = map {s/^\+//; $_} grep {substr($_, 0, 1) ne '-'} @all_labels; |
|
12
|
|
|
|
|
20
|
|
|
12
|
|
|
|
|
26
|
|
|
19
|
|
|
|
|
53
|
|
359
|
10
|
|
|
|
|
52
|
my @minus_labels = map {substr $_, 1} grep {substr($_, 0, 1) eq '-'} @all_labels; |
|
7
|
|
|
|
|
19
|
|
|
19
|
|
|
|
|
43
|
|
360
|
|
|
|
|
|
|
# Specifying just '-LABEL' means all but those that have LABEL |
361
|
10
|
100
|
66
|
|
|
42
|
if (@plus_labels == 0 and @minus_labels > 0) { |
362
|
3
|
|
|
|
|
7
|
@plus_labels = qw(ALL); |
363
|
|
|
|
|
|
|
} |
364
|
10
|
100
|
|
|
|
21
|
if (grep {$_ eq 'ALL'} @minus_labels) { |
|
7
|
|
|
|
|
20
|
|
365
|
1
|
|
|
|
|
205
|
croak "Label 'ALL' is special and can't be used with the minus sign, as in $featnames" |
366
|
|
|
|
|
|
|
} |
367
|
9
|
|
|
|
|
15
|
$featnames = []; |
368
|
9
|
|
|
|
|
16
|
for my $featkey (@featkeys) { |
369
|
127
|
|
|
|
|
197
|
my $feature = $feat_named{ $featkey }; |
370
|
127
|
100
|
|
|
|
134
|
my $included = grep { $_ eq 'ALL' or exists $feature->{'label'}{$_} } @plus_labels; |
|
191
|
|
|
|
|
667
|
|
371
|
127
|
|
|
|
|
140
|
my $excluded = grep { exists $feature->{'label'}{$_} } @minus_labels; |
|
79
|
|
|
|
|
148
|
|
372
|
127
|
100
|
100
|
|
|
500
|
push @$featnames, $featkey if $included and not $excluded; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
else { |
376
|
25
|
|
|
|
|
71
|
$featnames = ["$featnames"]; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
88
|
100
|
|
|
|
181
|
if (not ref $orig_featnames) { $self->{expand_names_cache}{$orig_featnames} = $featnames } |
|
38
|
|
|
|
|
97
|
|
380
|
88
|
|
|
|
|
328
|
return $featnames |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub evaluate : method { |
384
|
165
|
|
|
165
|
0
|
35952
|
my ($self, $featnames, $format, @args) = @_; |
385
|
165
|
|
|
|
|
252
|
my $class = ref $self; |
386
|
|
|
|
|
|
|
|
387
|
165
|
|
|
|
|
383
|
$featnames = $self->expand_names($featnames); |
388
|
164
|
|
|
|
|
191
|
my @feats; |
389
|
164
|
100
|
|
|
|
616
|
if (exists $self->{evaluate_featnames_cache}{"@$featnames"}) { |
390
|
112
|
|
|
|
|
119
|
@feats = @{ $self->{evaluate_featnames_cache}{"@$featnames"} }; |
|
112
|
|
|
|
|
473
|
|
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
else { |
393
|
52
|
|
|
|
|
50
|
my %feat_named = %{ $self->{feat_named} }; |
|
52
|
|
|
|
|
259
|
|
394
|
52
|
|
|
|
|
117
|
for my $featname (@$featnames) { |
395
|
145
|
50
|
|
|
|
278
|
if (not exists $feat_named{$featname}) { |
396
|
0
|
|
|
|
|
0
|
croak "Feature '$featname' you wish to evaluate was not found among known features (these are: @{$self->{featkeys}})" |
|
0
|
|
|
|
|
0
|
|
397
|
|
|
|
|
|
|
} |
398
|
145
|
|
|
|
|
277
|
push @feats, $feat_named{$featname}; |
399
|
|
|
|
|
|
|
} |
400
|
52
|
|
|
|
|
263
|
$self->{evaluate_featnames_cache}{"@$featnames"} = \@feats; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
164
|
100
|
|
|
|
415
|
if (not exists $KNOWN_FORMATS{$format}) { |
404
|
1
|
|
|
|
|
5
|
croak "Unknown format: '$format'. Please specify one of: @{[keys %KNOWN_FORMATS]}." |
|
1
|
|
|
|
|
222
|
|
405
|
|
|
|
|
|
|
} |
406
|
163
|
|
|
|
|
243
|
for my $feature (@feats) { |
407
|
676
|
|
|
|
|
1656
|
$self->_create_mapping($feature, $format); |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
161
|
50
|
|
|
|
382
|
if (@args == 0) { |
411
|
0
|
|
|
|
|
0
|
warn 'No arguments specified for the features.'; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
### Done argument checking. |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
### Traverse the features and evaluate them |
416
|
161
|
|
|
|
|
185
|
my @rv; |
417
|
161
|
|
|
|
|
214
|
for my $feature (@feats) { |
418
|
670
|
|
|
|
|
1114
|
my $name = $feature->{'name'}; |
419
|
670
|
|
|
|
|
784
|
$CURRENT_FEATURE = $name; |
420
|
670
|
|
|
|
|
1936
|
my $normrv = $feature->{'code'}(@args); |
421
|
670
|
|
|
|
|
4354
|
undef $CURRENT_FEATURE; |
422
|
670
|
100
|
|
|
|
1667
|
my $format = exists $feature->{'format'} ? $feature->{'format'} : $format; |
423
|
|
|
|
|
|
|
|
424
|
670
|
100
|
100
|
|
|
2367
|
if (not defined $normrv and exists $self->{'N/A'}) { |
425
|
145
|
|
|
|
|
209
|
my $na = $self->{'N/A'}; |
426
|
145
|
100
|
100
|
|
|
689
|
if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { |
|
|
100
|
|
|
|
|
|
427
|
41
|
|
|
|
|
93
|
push @rv, $na; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
elsif ($format eq 'binary') { |
430
|
|
|
|
|
|
|
# take one of the vectors in cat2bin |
431
|
45
|
|
|
|
|
45
|
my @dummy = @{ (values %{ $feature->{'cat2bin'} })[0] }; |
|
45
|
|
|
|
|
47
|
|
|
45
|
|
|
|
|
209
|
|
432
|
45
|
50
|
|
|
|
200
|
if (not @dummy) { |
433
|
0
|
|
|
|
|
0
|
croak "Couldn't determine the length of bit vector for feature '$name'," |
434
|
|
|
|
|
|
|
."which was about to be evaluated in binary and returned undef" |
435
|
|
|
|
|
|
|
} |
436
|
45
|
|
|
|
|
349
|
push @rv, map $na, @dummy; |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
else { |
439
|
59
|
|
|
|
|
137
|
push @rv, $na; |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
else { |
443
|
|
|
|
|
|
|
# Normally format the value. The eval babble is there to take care of unexpected values. |
444
|
525
|
|
|
|
|
703
|
undef $@; |
445
|
525
|
|
|
|
|
696
|
my @val = eval { _format_value($feature, $normrv, $format, @args) }; |
|
525
|
|
|
|
|
946
|
|
446
|
525
|
100
|
|
|
|
1188
|
if ($@) { |
447
|
5
|
100
|
66
|
|
|
36
|
if (ref $@ and $@->isa('Data::FeatureFactory::SoftError')) { |
448
|
4
|
|
|
|
|
4
|
warn ${$@}; |
|
4
|
|
|
|
|
28
|
|
449
|
|
|
|
|
|
|
return |
450
|
4
|
|
|
|
|
84
|
} |
451
|
|
|
|
|
|
|
else { |
452
|
1
|
|
|
|
|
6
|
die $@ |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
} |
455
|
520
|
|
|
|
|
1581
|
push @rv, @val; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
156
|
|
|
|
|
1935
|
return @rv[0 .. $#rv] |
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
sub _format_value { |
463
|
1086
|
|
|
1086
|
|
2326
|
my ($feature, $normrv, $format, @args) = @_; |
464
|
1086
|
|
|
|
|
1139
|
my @rv; |
465
|
1086
|
|
|
|
|
1464
|
my $name = $feature->{'name'}; |
466
|
1086
|
|
|
|
|
2208
|
local $\; local $,; |
|
1086
|
|
|
|
|
1303
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# convert to number if appropriate |
469
|
1086
|
100
|
|
|
|
2450
|
if (exists $feature->{'type'}) { |
470
|
374
|
|
|
|
|
651
|
my $type = $feature->{'type'}; |
471
|
374
|
100
|
100
|
|
|
1805
|
if ($type eq 'num' or $type eq 'int') { |
472
|
185
|
|
|
|
|
299
|
$normrv += 0; |
473
|
|
|
|
|
|
|
} |
474
|
374
|
100
|
|
|
|
1351
|
if ($type eq 'int') { |
475
|
158
|
|
|
|
|
169
|
$normrv = int $normrv; |
476
|
|
|
|
|
|
|
} |
477
|
374
|
100
|
|
|
|
884
|
if ($type eq 'boo') { |
478
|
136
|
100
|
|
|
|
368
|
$normrv = $normrv ? 1 : 0; |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# check if the value is a legal one |
483
|
1086
|
100
|
|
|
|
2303
|
if (exists $feature->{'values'}) { |
|
|
100
|
|
|
|
|
|
484
|
834
|
100
|
|
|
|
2349
|
if (exists $feature->{'values'}{$normrv}) { # alles gute |
|
|
100
|
|
|
|
|
|
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
elsif (exists $feature->{'default'}) { |
487
|
307
|
|
|
|
|
530
|
$normrv = $feature->{'default'}; |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
else { |
490
|
4
|
|
|
|
|
28
|
die Data::FeatureFactory::SoftError->new("Feature '$name' returned unexpected value '$normrv' on arguments '@args'") |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
# check the range for numeric features |
494
|
|
|
|
|
|
|
elsif (exists $feature->{'range_l'}) { |
495
|
20
|
50
|
|
|
|
93
|
if (not exists $feature->{'range_r'}) { |
496
|
0
|
|
|
|
|
0
|
die "feature '$name' has range_l but not range_r"; |
497
|
|
|
|
|
|
|
} |
498
|
20
|
50
|
|
|
|
48
|
if ($normrv < $feature->{'range_l'}) { |
499
|
0
|
0
|
|
|
|
0
|
if (exists $feature->{'default'}) { |
500
|
0
|
|
|
|
|
0
|
$normrv = $feature->{'default'}; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
else { |
503
|
0
|
|
|
|
|
0
|
die Data::FeatureFactory::SoftError->new( |
504
|
|
|
|
|
|
|
"Feature '$name' returned an unexpected value '$normrv' below the left allowed boundary '$$feature{range_l}'" |
505
|
|
|
|
|
|
|
) |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
} |
508
|
20
|
100
|
|
|
|
48
|
if ($normrv > $feature->{'range_r'}) { |
509
|
1
|
50
|
|
|
|
4
|
if (exists $feature->{'default'}) { |
510
|
1
|
|
|
|
|
2
|
$normrv = $feature->{'default'}; |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
else { |
513
|
0
|
|
|
|
|
0
|
die Data::FeatureFactory::SoftError->new( |
514
|
|
|
|
|
|
|
"Feature '$name' returned an unexpected value '$normrv' above the right allowed boundary '$$feature{range_r}'" |
515
|
|
|
|
|
|
|
) |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
} |
519
|
|
|
|
|
|
|
|
520
|
1082
|
100
|
|
|
|
2612
|
if ($format eq 'normal') { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
521
|
425
|
100
|
|
|
|
883
|
if (exists $feature->{'postproc'}) { |
522
|
101
|
|
|
|
|
306
|
$normrv = $feature->{'postproc'}->($normrv); |
523
|
|
|
|
|
|
|
} |
524
|
424
|
|
|
|
|
1105
|
@rv = ($normrv); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
elsif ($format eq 'numeric') { |
527
|
414
|
100
|
66
|
|
|
1832
|
if (exists $feature->{'type'} and $feature->{'type'} =~ /^( num | int | boo )$/x) { |
|
|
100
|
|
|
|
|
|
528
|
143
|
|
|
|
|
470
|
@rv = ($normrv); |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
elsif (exists $feature->{'cat2num'}) { |
531
|
219
|
50
|
|
|
|
518
|
if (not exists $feature->{'cat2num'}{$normrv}) { |
532
|
0
|
|
|
|
|
0
|
croak "Feature '$name' has the value '$normrv' for which there is no mapping to numbers" |
533
|
|
|
|
|
|
|
} |
534
|
219
|
|
|
|
|
568
|
@rv = ($feature->{'cat2num'}{$normrv}); |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
else { # dynamically creating the mapping |
537
|
52
|
|
|
|
|
58
|
my $n; |
538
|
52
|
50
|
|
|
|
159
|
if (exists $feature->{'cat2num_dyna'}{$normrv}) { |
539
|
52
|
|
|
|
|
362
|
$n = $feature->{'cat2num_dyna'}{$normrv}; |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
else { |
542
|
0
|
|
|
|
|
0
|
$n = ++$feature->{'num_value_max'}; |
543
|
0
|
|
|
|
|
0
|
$feature->{'cat2num_dyna'}{$normrv} = $n; |
544
|
0
|
|
|
|
|
0
|
$feature->{'num2cat_dyna'}{$n} = $normrv; |
545
|
0
|
|
|
|
|
0
|
my @toprint = ($normrv, $n); |
546
|
0
|
0
|
|
|
|
0
|
if (exists $feature->{'postproc'}) { |
547
|
0
|
|
|
|
|
0
|
my $ppd = $feature->{'postproc'}->($normrv); |
548
|
0
|
|
|
|
|
0
|
$feature->{'pp2cat_dyna'}{$ppd} = $normrv; |
549
|
0
|
|
|
|
|
0
|
push @toprint, $ppd; |
550
|
|
|
|
|
|
|
} |
551
|
0
|
0
|
|
|
|
0
|
print {$feature->{'num_values_fh'}} join("\t", @toprint)."\n" |
|
0
|
|
|
|
|
0
|
|
552
|
|
|
|
|
|
|
or croak "Couldn't print the mapping of categorial value '$normrv' to numeric value '$n' for feature '$name' to a file ($!).\n" |
553
|
|
|
|
|
|
|
. 'Please provide a list of values for the feature to avoid this' |
554
|
|
|
|
|
|
|
} |
555
|
52
|
|
|
|
|
112
|
@rv = ($n); |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
elsif ($format eq 'binary') { |
559
|
243
|
100
|
100
|
|
|
960
|
if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { |
|
|
50
|
|
|
|
|
|
560
|
39
|
|
|
|
|
101
|
@rv = ($normrv); |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
elsif (not exists $feature->{'cat2bin'}{$normrv}) { |
563
|
0
|
|
|
|
|
0
|
croak "No mapping for value '$normrv' to binary in feature '$name'" |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
else { |
566
|
204
|
|
|
|
|
195
|
@rv = @{ $feature->{'cat2bin'}{$normrv} }; |
|
204
|
|
|
|
|
1350
|
|
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
else { |
570
|
0
|
|
|
|
|
0
|
croak "Unrecognized format '$format'" |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
return @rv |
573
|
1081
|
|
|
|
|
6099
|
} |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
sub _values_of { |
576
|
244
|
|
|
244
|
|
283
|
my ($feature) = @_; |
577
|
244
|
|
|
|
|
240
|
my @values; |
578
|
244
|
100
|
|
|
|
671
|
if (exists $feature->{'values_ordered'}) { |
|
|
50
|
|
|
|
|
|
579
|
242
|
|
|
|
|
224
|
@values = @{ $feature->{'values_ordered'} }; |
|
242
|
|
|
|
|
910
|
|
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
elsif (exists $feature->{'values'}) { |
582
|
2
|
|
|
|
|
3
|
@values = keys %{ $feature->{'values'} }; |
|
2
|
|
|
|
|
13
|
|
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
else { |
585
|
0
|
|
|
|
|
0
|
croak "Attempted to gather the values of feature '$$feature{name}', which has none specified" |
586
|
|
|
|
|
|
|
} |
587
|
244
|
100
|
66
|
|
|
830
|
if (exists $feature->{'default'} and not exists $feature->{'values'}{ $feature->{'default'} }) { |
588
|
41
|
|
|
|
|
63
|
push @values, $feature->{'default'}; |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
return @values |
591
|
244
|
|
|
|
|
790
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
sub _create_mapping : method { |
594
|
1049
|
|
|
1049
|
|
1482
|
my ($class, $feature, $format) = @_; |
595
|
1049
|
50
|
|
|
|
2554
|
$class = ref $class if ref $class; |
596
|
1049
|
100
|
66
|
|
|
2976
|
if (exists $feature->{'format'} and $format ne 'postprocd') { |
597
|
211
|
|
|
|
|
332
|
$format = $feature->{'format'}; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
|
600
|
1049
|
100
|
|
|
|
3597
|
if (lc $format eq 'normal') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
elsif (lc $format eq 'numeric') { |
603
|
279
|
100
|
100
|
|
|
1254
|
return if exists $feature->{'type'} and $feature->{'type'} eq 'num'; |
604
|
262
|
100
|
100
|
|
|
1173
|
return if exists $feature->{'type'} and $feature->{'type'} eq 'int'; |
605
|
193
|
100
|
66
|
|
|
666
|
return if exists $feature->{'type'} and $feature->{'type'} eq 'boo'; |
606
|
140
|
100
|
|
|
|
492
|
return if exists $feature->{'cat2num'}; # Blindly trusting that what we have here is a sane mapping from the original values to numbers |
607
|
38
|
|
|
|
|
64
|
my $name = $feature->{'name'}; |
608
|
38
|
100
|
|
|
|
125
|
if (not exists $feature->{'values'}) { |
609
|
28
|
100
|
|
|
|
225
|
return if exists $feature->{'num_values_fh'}; |
610
|
3
|
|
|
|
|
37
|
warn "Categorial feature '$name' is about to be evaluated numerically but has no set of values specified"; |
611
|
3
|
|
|
|
|
86
|
(my $num_values_basename = $class.'__'.$name) =~ s/\W/_/g; |
612
|
3
|
|
|
|
|
8
|
$num_values_basename = '.FeatureFactory.'.$num_values_basename; |
613
|
3
|
|
|
|
|
56
|
my @filenames_to_try = ( |
614
|
|
|
|
|
|
|
$PATH.'/'.$num_values_basename, |
615
|
|
|
|
|
|
|
$ENV{'HOME'}.'/'.$num_values_basename, |
616
|
|
|
|
|
|
|
'/tmp/'.$num_values_basename, |
617
|
|
|
|
|
|
|
); |
618
|
3
|
|
|
|
|
7
|
my $num_values_fh; |
619
|
|
|
|
|
|
|
my $opened; |
620
|
3
|
|
|
|
|
7
|
my $num_value_max = 0; |
621
|
|
|
|
|
|
|
FILENAME_R: |
622
|
3
|
|
|
|
|
10
|
for my $fn (@filenames_to_try) { |
623
|
3
|
|
|
|
|
277
|
$opened = open my $fh, '+<'.$OPEN_OPTIONS, $fn; |
624
|
3
|
50
|
|
|
|
154
|
if ($opened) { |
625
|
3
|
|
|
|
|
7
|
local $_; # for some reason, this is necessary to prevent crashes (Modification of read-only value) when e.g. in for(qw(a b)){ } |
626
|
3
|
|
|
|
|
88
|
while (<$fh>) { |
627
|
22
|
|
|
|
|
64
|
chomp; |
628
|
22
|
|
|
|
|
91
|
my ($cat, $num, $ppd) = split /\t/; |
629
|
22
|
50
|
|
|
|
54
|
$num_value_max = $num if $num > $num_value_max; |
630
|
22
|
|
|
|
|
66
|
$feature->{'cat2num_dyna'}{$cat} = $num; |
631
|
22
|
|
|
|
|
51
|
$feature->{'num2cat_dyna'}{$num} = $cat; |
632
|
22
|
100
|
|
|
|
104
|
$feature->{'pp2cat_dyna' }{$ppd} = $cat if defined $ppd; |
633
|
|
|
|
|
|
|
} |
634
|
3
|
|
|
|
|
365
|
print STDERR "Saving the mapping for feature '$name' to file $fn\n"; |
635
|
3
|
|
|
|
|
16
|
$feature->{'num_values_fh'} = $fh; |
636
|
|
|
|
|
|
|
last FILENAME_R |
637
|
3
|
|
|
|
|
14
|
} |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
# If there's no file to recover from, try to start a new one |
640
|
3
|
50
|
|
|
|
11
|
if (not $opened) { FILENAME_W: for my $fn (@filenames_to_try) { |
|
0
|
|
|
|
|
0
|
|
641
|
0
|
|
|
|
|
0
|
$opened = open my $fh, '>'.$OPEN_OPTIONS, $fn; |
642
|
0
|
0
|
|
|
|
0
|
if ($opened) { |
643
|
0
|
|
|
|
|
0
|
print STDERR "Saving the mapping for feature '$name' to file $fn\n"; |
644
|
0
|
|
|
|
|
0
|
$feature->{'num_values_fh'} = $fh; |
645
|
|
|
|
|
|
|
last FILENAME_W |
646
|
0
|
|
|
|
|
0
|
} |
647
|
|
|
|
|
|
|
}} |
648
|
3
|
50
|
|
|
|
12
|
if (not $opened) { |
649
|
0
|
|
|
|
|
0
|
delete $feature->{'num_values_fh'}; |
650
|
0
|
|
|
|
|
0
|
croak "Couldn't open a file for saving the mapping the categories of feature '$name' to numbers. " |
651
|
|
|
|
|
|
|
. 'Please specify the values for the feature to avoid this' |
652
|
|
|
|
|
|
|
} |
653
|
3
|
|
|
|
|
16
|
$feature->{'num_value_max'} = $num_value_max; |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
else { # Got values specified - create a mapping |
656
|
10
|
|
|
|
|
34
|
my @values = _values_of($feature); |
657
|
10
|
|
|
|
|
29
|
my $n = 1; |
658
|
10
|
|
|
|
|
21
|
for my $value (@values) { |
659
|
111
|
|
|
|
|
234
|
$feature->{'cat2num'}{$value} = $n; |
660
|
111
|
|
|
|
|
538
|
$feature->{'num2cat'}{$n} = $value; |
661
|
|
|
|
|
|
|
} continue { |
662
|
111
|
|
|
|
|
172
|
$n++; |
663
|
|
|
|
|
|
|
} |
664
|
|
|
|
|
|
|
} |
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
elsif (lc $format eq 'binary') { |
667
|
273
|
100
|
100
|
|
|
1148
|
return if exists $feature->{'type'} and $feature->{'type'} eq 'boo'; |
668
|
231
|
100
|
|
|
|
674
|
return if exists $feature->{'cat2bin'}; |
669
|
18
|
|
|
|
|
30
|
my $name = $feature->{'name'}; |
670
|
18
|
100
|
66
|
|
|
75
|
if (not exists $feature->{'values_ordered'} and not exists $feature->{'values'}) { |
671
|
2
|
|
|
|
|
427
|
croak "Attempted to convert feature '$name' to binary without specifying its values"; |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
|
674
|
16
|
|
|
|
|
36
|
my @values = _values_of($feature); |
675
|
|
|
|
|
|
|
|
676
|
16
|
|
|
|
|
26
|
my $n = 0; |
677
|
16
|
|
|
|
|
185
|
my @zeroes = (0) x scalar(@values); |
678
|
16
|
|
|
|
|
32
|
for my $value (@values) { |
679
|
121
|
|
|
|
|
377
|
my @vector = @zeroes; |
680
|
121
|
|
|
|
|
187
|
$vector[$n] = 1; |
681
|
121
|
|
|
|
|
322
|
$feature->{'cat2bin'}{$value} = \@vector; |
682
|
121
|
|
|
|
|
1171
|
$feature->{'bin2cat'}{join(' ', @vector)} = $value; |
683
|
|
|
|
|
|
|
} continue { |
684
|
121
|
|
|
|
|
257
|
$n++; |
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
} |
687
|
|
|
|
|
|
|
elsif ($format eq 'postprocd') { |
688
|
11
|
100
|
|
|
|
47
|
return if exists $feature->{'pp2cat'}; |
689
|
2
|
|
|
|
|
5
|
my $name = $feature->{'name'}; |
690
|
2
|
50
|
|
|
|
7
|
if (not exists $feature->{'postproc'}) { |
691
|
0
|
|
|
|
|
0
|
croak "Feature '$name' doesn't have a postprocessing function specified - can't create mapping from postprocessed values. Stopped" |
692
|
|
|
|
|
|
|
} |
693
|
2
|
|
|
|
|
3
|
my $ppfun = $feature->{'postproc'}; |
694
|
2
|
|
|
|
|
5
|
my @values = _values_of($feature); |
695
|
2
|
|
|
|
|
5
|
my %pp2cat; |
696
|
2
|
|
|
|
|
13
|
for my $value (@values) { |
697
|
32
|
|
|
|
|
56
|
my $ppd = $ppfun->($value); |
698
|
32
|
|
|
|
|
140
|
$pp2cat{ $ppd } = $value; |
699
|
|
|
|
|
|
|
} |
700
|
2
|
|
|
|
|
10
|
$feature->{'pp2cat'} = \%pp2cat; |
701
|
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
|
else { |
703
|
0
|
|
|
|
|
0
|
croak "Format '$format' not recognized - please specify 'normal', 'numeric', 'binary' or 'postprocd' (should have caught this earlier)" |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
sub names : method { |
708
|
34
|
|
|
34
|
1
|
5541
|
my ($self) = @_; |
709
|
34
|
|
|
|
|
44
|
return map $_->{'name'}, @{ $self->{'features'} } |
|
34
|
|
|
|
|
346
|
|
710
|
|
|
|
|
|
|
} |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
sub _vector_length { # how many bits will the binary representation of this feature have |
713
|
268
|
|
|
268
|
|
321
|
my ($feature) = @_; |
714
|
268
|
100
|
100
|
|
|
882
|
if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { |
715
|
52
|
|
|
|
|
89
|
return 1 |
716
|
|
|
|
|
|
|
} |
717
|
216
|
|
|
|
|
334
|
return scalar _values_of($feature) |
718
|
|
|
|
|
|
|
} |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
sub _shift_value { |
721
|
40
|
|
|
40
|
|
706
|
my ($feature, $format, $values) = @_; |
722
|
40
|
50
|
|
|
|
80
|
if ($format ne 'binary') { |
723
|
0
|
|
|
|
|
0
|
return shift @$values |
724
|
|
|
|
|
|
|
} |
725
|
40
|
|
|
|
|
93
|
my $n = _vector_length($feature); |
726
|
40
|
50
|
|
|
|
75
|
if (@$values < $n) { |
727
|
0
|
|
|
|
|
0
|
croak "There's not enough fields left to shift a $format value (width $n) of feature '$$feature{name}' from a length " |
728
|
|
|
|
|
|
|
. scalar(@$values) . " list ('@$values')" |
729
|
|
|
|
|
|
|
} |
730
|
40
|
|
|
|
|
244
|
return splice @$values, 0, $n |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
sub _init_translation { |
734
|
32
|
|
|
32
|
|
62
|
my ($self, $names, $options) = @_; |
735
|
32
|
50
|
|
|
|
380
|
if (ref($names) ne 'ARRAY') { |
736
|
0
|
|
|
|
|
0
|
croak 'Names must be given by an arrayref' |
737
|
|
|
|
|
|
|
} |
738
|
32
|
50
|
|
|
|
85
|
if (ref($options) ne 'HASH') { |
739
|
0
|
|
|
|
|
0
|
croak 'Options must be given by a hashref' |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
32
|
|
|
|
|
68
|
my %accepted_options = map {;$_=>1} qw( |
|
288
|
|
|
|
|
546
|
|
743
|
|
|
|
|
|
|
names from_format to_format from_NA to_NA FS OFS header ignore |
744
|
|
|
|
|
|
|
); |
745
|
32
|
|
|
|
|
143
|
for (keys %$options) { |
746
|
118
|
50
|
|
|
|
276
|
if (not exists $accepted_options{$_}) { |
747
|
0
|
|
|
|
|
0
|
croak "Translate does not accept option '$_'. Accepted options are: ".join(' ', keys %accepted_options).'. Stopped' |
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
} |
750
|
|
|
|
|
|
|
|
751
|
32
|
|
|
|
|
75
|
my $from_format = $options->{'from_format'}; |
752
|
32
|
|
|
|
|
44
|
my $to_format = $options->{'to_format'}; |
753
|
32
|
|
|
|
|
54
|
for ($from_format, $to_format) { |
754
|
64
|
50
|
|
|
|
314
|
if (! m/^(?: normal | numeric | binary )$/x) { |
755
|
0
|
|
|
|
|
0
|
croak '{to,from}_format must be one of "normal", "numeric" or "binary"' |
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
} |
758
|
|
|
|
|
|
|
|
759
|
32
|
100
|
100
|
|
|
459
|
if (exists $options->{'from_NA'} and exists $options->{'to_NA'}) { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
760
|
|
|
|
|
|
|
} |
761
|
|
|
|
|
|
|
elsif (exists $options->{'from_NA'} and exists $self->{'N/A'}) { |
762
|
3
|
|
|
|
|
10
|
$options->{'to_NA'} = $self->{'N/A'}; |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
elsif (exists $options->{'to_NA'} and exists $self->{'N/A'}) { |
765
|
3
|
|
|
|
|
8
|
$options->{'from_NA'} = $self->{'N/A'}; |
766
|
|
|
|
|
|
|
} |
767
|
|
|
|
|
|
|
elsif (exists $options->{'to_NA'}) { |
768
|
0
|
|
|
|
|
0
|
$options->{'from_NA'} = undef; |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
elsif (exists $options->{'from_NA'}) { |
771
|
0
|
|
|
|
|
0
|
croak 'from_NA specified but neither to_NA nor global N/A value specified' |
772
|
|
|
|
|
|
|
} |
773
|
|
|
|
|
|
|
elsif (exists $self->{'N/A'}) { |
774
|
15
|
|
|
|
|
52
|
$options->{'from_NA'} = $options->{'to_NA'} = $self->{'N/A'}; |
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
|
777
|
32
|
50
|
66
|
|
|
105
|
if (exists $options->{'header'} and not $options->{'header'}) { |
778
|
0
|
|
|
|
|
0
|
delete $options->{'header'}; |
779
|
|
|
|
|
|
|
} |
780
|
|
|
|
|
|
|
|
781
|
32
|
100
|
|
|
|
76
|
if (exists $options->{'ignore'}) { |
782
|
4
|
|
|
|
|
52
|
my $ignore = $options->{'ignore'}; |
783
|
4
|
|
|
|
|
9
|
$options->{'ignore'} = []; |
784
|
|
|
|
|
|
|
|
785
|
4
|
100
|
|
|
|
12
|
if (not ref $ignore) { |
786
|
2
|
|
|
|
|
4
|
$ignore = [$ignore]; |
787
|
|
|
|
|
|
|
} |
788
|
|
|
|
|
|
|
|
789
|
4
|
50
|
|
|
|
9
|
if (ref($ignore) eq 'ARRAY') { |
790
|
4
|
|
|
|
|
18
|
my $has_non_nums = grep !Scalar::Util::looks_like_number($_), @$ignore; |
791
|
4
|
50
|
|
|
|
10
|
if ($has_non_nums) { |
792
|
0
|
|
|
|
|
0
|
warn 'Some of the specifications of columns to ignore are non-numeric' |
793
|
|
|
|
|
|
|
} |
794
|
4
|
|
|
|
|
10
|
for my $idx (@$ignore) { |
795
|
9
|
50
|
|
|
|
17
|
if ($idx < 0) { |
796
|
0
|
|
|
|
|
0
|
croak "Negative column indices aren't currently supported. Trailing columns are ignored always. Stopped" |
797
|
|
|
|
|
|
|
} |
798
|
9
|
|
|
|
|
19
|
$options->{'ignore'}[ $idx ] = 1; |
799
|
|
|
|
|
|
|
} |
800
|
|
|
|
|
|
|
} |
801
|
|
|
|
|
|
|
else { |
802
|
0
|
|
|
|
|
0
|
croak 'Option "ignore" can only be a column number or an array thereof. Stopped' |
803
|
|
|
|
|
|
|
} |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
# Remove the names of the columns to ignore if the names come from a header |
806
|
4
|
100
|
|
|
|
13
|
if (exists $options->{'header'}) { |
807
|
2
|
|
|
|
|
11
|
for my $idx (sort {$b <=> $a} @$ignore) { |
|
4
|
|
|
|
|
6
|
|
808
|
5
|
|
|
|
|
12
|
splice @$names, $idx, 1; |
809
|
|
|
|
|
|
|
} |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
|
813
|
32
|
|
|
|
|
37
|
my (@features, @widths); |
814
|
32
|
|
|
|
|
102
|
my %names = map {;$_=>1} $self->names; |
|
237
|
|
|
|
|
442
|
|
815
|
32
|
|
|
|
|
103
|
for my $name (@$names) { |
816
|
181
|
50
|
|
|
|
385
|
if (not exists $names{$name}) { |
817
|
0
|
|
|
|
|
0
|
croak "Feature '$name' not found among ".join(' ', $self->names).". Stopped" |
818
|
|
|
|
|
|
|
} |
819
|
181
|
|
|
|
|
334
|
my $feature = $self->{'feat_named'}{ $name }; |
820
|
181
|
|
|
|
|
365
|
$self->_create_mapping($feature, $from_format); |
821
|
181
|
|
|
|
|
363
|
$self->_create_mapping($feature, $to_format); |
822
|
181
|
100
|
100
|
|
|
694
|
if ($from_format eq 'normal' and exists $feature->{'postproc'}) { |
823
|
15
|
100
|
66
|
|
|
84
|
if (exists $feature->{'values'}) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
824
|
11
|
|
|
|
|
26
|
$self->_create_mapping($feature, 'postprocd'); |
825
|
|
|
|
|
|
|
} |
826
|
|
|
|
|
|
|
elsif (exists $feature->{'format'} or $to_format eq 'normal') { |
827
|
|
|
|
|
|
|
# translating normal -> normal -- kein problem |
828
|
|
|
|
|
|
|
} |
829
|
|
|
|
|
|
|
elsif (join(' ', sort $from_format, $to_format) eq 'normal numeric') { |
830
|
|
|
|
|
|
|
# translating with dynamic mapping |
831
|
|
|
|
|
|
|
} |
832
|
|
|
|
|
|
|
else { |
833
|
0
|
|
|
|
|
0
|
croak "Feature '$name' is postprocessed and about to be translated from normal but has no values specified. Stopped" |
834
|
|
|
|
|
|
|
} |
835
|
|
|
|
|
|
|
} |
836
|
181
|
|
|
|
|
229
|
push @features, $feature; |
837
|
181
|
|
|
|
|
220
|
my $bin = 0; |
838
|
181
|
100
|
100
|
|
|
823
|
if (exists $feature->{'format'} and $feature->{'format'} eq 'binary') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
839
|
24
|
|
|
|
|
30
|
$bin = 1; |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
elsif (exists $feature->{'format'}) {} |
842
|
|
|
|
|
|
|
elsif ($from_format eq 'binary') { |
843
|
36
|
|
|
|
|
49
|
$bin = 1; |
844
|
|
|
|
|
|
|
} |
845
|
181
|
100
|
|
|
|
359
|
my $width = $bin ? _vector_length($feature) : 1; |
846
|
181
|
|
|
|
|
413
|
push @widths, $width; |
847
|
|
|
|
|
|
|
} |
848
|
32
|
|
|
|
|
810
|
return map [$names->[$_], $features[$_], $widths[$_]], 0 .. $#features |
849
|
|
|
|
|
|
|
} |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
my %x2cat = ( |
852
|
|
|
|
|
|
|
binary => 'bin2cat', |
853
|
|
|
|
|
|
|
numeric => 'num2cat', |
854
|
|
|
|
|
|
|
postprocd => 'pp2cat', |
855
|
|
|
|
|
|
|
); |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
sub _translate_row : method { |
858
|
203
|
|
|
203
|
|
322
|
my ($self, $descrs, $values, $options) = @_; |
859
|
203
|
50
|
|
|
|
459
|
if (ref($values) ne 'ARRAY') { |
860
|
0
|
|
|
|
|
0
|
croak 'Values must be given by an arrayref' |
861
|
|
|
|
|
|
|
} |
862
|
203
|
50
|
|
|
|
385
|
if (@$values < @$descrs) { |
863
|
0
|
|
|
|
|
0
|
croak "There's not enough values in the \@values array (".scalar(@$values).") to match the number of features (".scalar(@$descrs).")"; |
864
|
|
|
|
|
|
|
} |
865
|
203
|
|
|
|
|
504
|
my ($from_format, $to_format, $from_NA, $to_NA, $ignore) = @$options{qw( |
866
|
|
|
|
|
|
|
from_format to_format from_NA to_NA ignore)}; |
867
|
|
|
|
|
|
|
|
868
|
203
|
|
|
|
|
231
|
my $coln = 0; |
869
|
203
|
|
|
|
|
193
|
my @rv; |
870
|
|
|
|
|
|
|
FEATNAME: |
871
|
203
|
|
|
|
|
274
|
for my $descr (@$descrs) { |
872
|
1297
|
|
|
|
|
2075
|
my ($name, $feature, $width) = @$descr; |
873
|
1297
|
100
|
|
|
|
2326
|
if (defined $ignore) { |
874
|
160
|
|
|
|
|
312
|
while (exists $ignore->[ $coln++ ]) { |
875
|
80
|
|
|
|
|
189
|
push @rv, shift @$values; |
876
|
|
|
|
|
|
|
} |
877
|
|
|
|
|
|
|
} |
878
|
1297
|
100
|
|
|
|
3112
|
my $from_format = exists $feature->{'format'} ? $feature->{'format'} : $from_format; |
879
|
1297
|
100
|
|
|
|
2511
|
my $to_format = exists $feature->{'format'} ? $feature->{'format'} : $to_format; |
880
|
1297
|
|
|
|
|
3458
|
my @value = splice @$values, 0, $width; |
881
|
1297
|
50
|
|
|
|
2857
|
if (@value == 0) { |
882
|
0
|
|
|
|
|
0
|
croak "Zero-width value obtained for feature '$name'" |
883
|
|
|
|
|
|
|
} |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
# Check if the value is N/A |
886
|
1297
|
|
|
|
|
1735
|
my $is_NA = 0; |
887
|
1297
|
100
|
|
|
|
2356
|
if (@value == 1) { |
888
|
950
|
|
|
|
|
1394
|
my $value = $value[0]; |
889
|
950
|
100
|
100
|
|
|
6477
|
if (defined $from_NA and $value eq $from_NA) { |
|
|
50
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
890
|
235
|
|
|
|
|
445
|
$is_NA = 1; |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
elsif (defined $to_NA and not defined $value and not defined $from_NA) { |
893
|
0
|
|
|
|
|
0
|
$is_NA = 1; |
894
|
|
|
|
|
|
|
} |
895
|
|
|
|
|
|
|
} |
896
|
|
|
|
|
|
|
else { |
897
|
347
|
50
|
66
|
|
|
1104
|
if (defined $to_NA and not grep {defined $_} @value and not defined $from_NA) { |
|
4750
|
100
|
33
|
|
|
10671
|
|
|
4750
|
|
100
|
|
|
14748
|
|
898
|
0
|
|
|
|
|
0
|
$is_NA = 1; |
899
|
|
|
|
|
|
|
} |
900
|
|
|
|
|
|
|
elsif (defined $from_NA and not grep {$_ ne $from_NA} @value) { |
901
|
133
|
|
|
|
|
168
|
$is_NA = 1; |
902
|
|
|
|
|
|
|
} |
903
|
|
|
|
|
|
|
} |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
# Append the N/A if appropriate |
906
|
1297
|
100
|
|
|
|
5501
|
if ($is_NA) { |
907
|
368
|
100
|
|
|
|
902
|
my $n = $to_format eq 'binary' ? _vector_length($feature) : 1; |
908
|
368
|
|
|
|
|
821
|
push @rv, ( ($to_NA) x $n ); |
909
|
|
|
|
|
|
|
next FEATNAME |
910
|
368
|
|
|
|
|
1096
|
} |
911
|
|
|
|
|
|
|
|
912
|
929
|
100
|
|
|
|
1605
|
if ($from_format eq $to_format) { |
913
|
368
|
|
|
|
|
780
|
push @rv, @value; |
914
|
|
|
|
|
|
|
next FEATNAME |
915
|
368
|
|
|
|
|
1155
|
} |
916
|
|
|
|
|
|
|
else { |
917
|
561
|
|
|
|
|
660
|
my $catval; |
918
|
561
|
|
|
|
|
1895
|
my $from_format = $from_format; |
919
|
561
|
100
|
100
|
|
|
1820
|
if ($from_format eq 'normal' and exists $feature->{'postproc'}) { |
920
|
102
|
|
|
|
|
122
|
$from_format = 'postprocd'; |
921
|
|
|
|
|
|
|
} |
922
|
561
|
100
|
100
|
|
|
3579
|
if ($from_format eq 'normal') { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
923
|
171
|
|
|
|
|
485
|
($catval) = @value; |
924
|
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
|
elsif ($from_format eq 'numeric' and exists $feature->{'type'} and $feature->{'type'} =~ /^(int|num|boo)$/) { |
926
|
49
|
|
|
|
|
75
|
($catval) = @value; |
927
|
|
|
|
|
|
|
} |
928
|
|
|
|
|
|
|
elsif ($from_format eq 'binary' and exists $feature->{'type'} and $feature->{'type'} eq 'boo') { |
929
|
27
|
|
|
|
|
40
|
($catval) = @value; |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
else { |
932
|
314
|
|
|
|
|
562
|
my $transfer = $x2cat{ $from_format }; |
933
|
314
|
50
|
|
|
|
581
|
if (not defined $transfer) { |
934
|
0
|
|
|
|
|
0
|
croak "Internal error: Unexpected value for \$from_format: '$from_format'" |
935
|
|
|
|
|
|
|
} |
936
|
314
|
100
|
|
|
|
612
|
if (not exists $feature->{ $transfer }) { |
937
|
40
|
50
|
|
|
|
194
|
if (exists $feature->{ $transfer.'_dyna' }) { |
938
|
40
|
|
|
|
|
142
|
$transfer = $transfer.'_dyna'; |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
else { |
941
|
0
|
|
|
|
|
0
|
croak "Cannot find mapping '$transfer' for feature '$name'" |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
} |
944
|
314
|
|
|
|
|
1122
|
my $valval = join(' ', @value); |
945
|
314
|
50
|
|
|
|
801
|
if (not exists $feature->{ $transfer }{ $valval }) { |
946
|
0
|
|
|
|
|
0
|
my $hint = ''; |
947
|
0
|
0
|
|
|
|
0
|
if ($valval eq $feature->{'name'}) { |
948
|
0
|
|
|
|
|
0
|
$hint = ". Maybe you forgot there was a header in your file? Stopped" |
949
|
|
|
|
|
|
|
} |
950
|
0
|
|
|
|
|
0
|
croak "Unexpected value '$valval' of feature '$name' for transfer '$transfer'$hint" |
951
|
|
|
|
|
|
|
} |
952
|
314
|
|
|
|
|
709
|
$catval = $feature->{ $transfer }{ $valval }; |
953
|
|
|
|
|
|
|
} |
954
|
|
|
|
|
|
|
|
955
|
561
|
|
|
|
|
1031
|
my @formatted = _format_value($feature, $catval, $to_format, 'NO_ARGS:TRANSLATING_ONLY'); |
956
|
561
|
|
|
|
|
2631
|
push @rv, @formatted; |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
} |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# Append the trailing columns |
961
|
203
|
|
|
|
|
352
|
push @rv, @$values; |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
return @rv |
964
|
203
|
|
|
|
|
2688
|
} |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
sub translate_row : method { |
967
|
13
|
|
|
13
|
0
|
10819
|
my ($self, $names, $values, $options) = @_; |
968
|
13
|
|
|
|
|
33
|
$names = $self->expand_names($names); |
969
|
13
|
|
|
|
|
39
|
my @descrs = $self->_init_translation($names, $options); |
970
|
13
|
|
|
|
|
49
|
$self->_translate_row(\@descrs, $values, $options); |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
sub translate : method { |
974
|
19
|
|
|
19
|
0
|
31686
|
my ($self, $source, $sink, $options) = @_; |
975
|
19
|
|
|
|
|
52
|
local $\; local $,; |
|
19
|
|
|
|
|
40
|
|
976
|
19
|
50
|
|
|
|
90
|
if (not defined Scalar::Util::openhandle($source)) { |
977
|
0
|
|
|
|
|
0
|
croak 'Source must be given by an open filehandle' |
978
|
|
|
|
|
|
|
} |
979
|
19
|
50
|
|
|
|
60
|
if (not defined Scalar::Util::openhandle($sink)) { |
980
|
0
|
|
|
|
|
0
|
croak 'Destination must be given by an open filehandle' |
981
|
|
|
|
|
|
|
} |
982
|
19
|
50
|
|
|
|
66
|
if (ref($options) ne 'HASH') { |
983
|
0
|
|
|
|
|
0
|
croak 'Options must be given by a hashref' |
984
|
|
|
|
|
|
|
} |
985
|
19
|
|
|
|
|
124
|
my $ifs = $options->{'FS'}; |
986
|
19
|
100
|
|
|
|
69
|
my $ofs = exists $options->{'OFS'} ? $options->{'OFS'} : $ifs; |
987
|
19
|
|
|
|
|
27
|
my @names; |
988
|
|
|
|
|
|
|
my @orig_header_fields; |
989
|
19
|
100
|
33
|
|
|
87
|
if (exists $options->{'names'}) { |
|
|
50
|
|
|
|
|
|
990
|
16
|
|
|
|
|
20
|
@names = @{ $self->expand_names($options->{'names'}) }; |
|
16
|
|
|
|
|
52
|
|
991
|
|
|
|
|
|
|
} |
992
|
|
|
|
|
|
|
elsif (exists $options->{'header'} and $options->{'header'}) { |
993
|
3
|
|
|
|
|
40
|
my $row = <$source>; |
994
|
3
|
|
|
|
|
8
|
chomp $row; |
995
|
3
|
|
|
|
|
74
|
@names = split /(?:\Q$ifs\E)+/, $row; |
996
|
3
|
|
|
|
|
13
|
@orig_header_fields = @names; |
997
|
|
|
|
|
|
|
} |
998
|
|
|
|
|
|
|
else { |
999
|
0
|
|
|
|
|
0
|
croak 'No feature names specified for translate' |
1000
|
|
|
|
|
|
|
} |
1001
|
|
|
|
|
|
|
|
1002
|
19
|
|
|
|
|
74
|
my @descrs = $self->_init_translation(\@names, $options); |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
# Translate the header, if there's one. |
1005
|
19
|
100
|
|
|
|
68
|
if (@orig_header_fields) { |
1006
|
3
|
|
|
|
|
8
|
my $globbin = $options->{'to_format'} eq 'binary'; |
1007
|
3
|
|
|
|
|
11
|
my $last = pop @orig_header_fields; |
1008
|
3
|
|
|
|
|
8
|
for my $field (@orig_header_fields) { |
1009
|
19
|
|
|
|
|
25
|
my $nsep; |
1010
|
19
|
100
|
|
|
|
60
|
if (not exists $self->{'feat_named'}{ $field }) { |
1011
|
4
|
|
|
|
|
6
|
$nsep = 1; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
else { |
1014
|
15
|
|
|
|
|
35
|
my $feature = $self->{'feat_named'}{ $field }; |
1015
|
15
|
|
100
|
|
|
109
|
my $bin = (exists $feature->{'format'} and $feature->{'format'} eq 'binary' or $globbin); |
1016
|
15
|
100
|
|
|
|
46
|
$nsep = $bin ? _vector_length($feature) : 1; |
1017
|
|
|
|
|
|
|
} |
1018
|
19
|
|
|
|
|
24
|
print {$sink} $field, $ofs x $nsep; |
|
19
|
|
|
|
|
92
|
|
1019
|
|
|
|
|
|
|
} |
1020
|
3
|
|
|
|
|
8
|
print {$sink} $last, "\n"; |
|
3
|
|
|
|
|
6
|
|
1021
|
|
|
|
|
|
|
} |
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
ROW: |
1024
|
19
|
|
|
|
|
181
|
while (defined (my $row = <$source>)) { |
1025
|
190
|
|
|
|
|
244
|
chomp $row; |
1026
|
190
|
|
|
|
|
2131
|
my @values = split /$ifs/, $row; |
1027
|
190
|
|
|
|
|
489
|
undef $@; |
1028
|
190
|
|
|
|
|
252
|
my @translated = eval { $self->_translate_row(\@descrs, \@values, $options) }; |
|
190
|
|
|
|
|
1204
|
|
1029
|
190
|
50
|
|
|
|
780
|
warn("$@ (line $.)"), next ROW if $@; |
1030
|
190
|
|
|
|
|
199
|
print {$sink} join($ofs, @translated), "\n"; |
|
190
|
|
|
|
|
3737
|
|
1031
|
|
|
|
|
|
|
} |
1032
|
|
|
|
|
|
|
} |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
sub add_label { |
1035
|
0
|
|
|
0
|
0
|
0
|
my ($feature, @labels) = @_; |
1036
|
0
|
|
|
|
|
0
|
@labels = map uc($_), @labels; |
1037
|
0
|
0
|
|
|
|
0
|
if (exists $feature->{'label'}) { |
1038
|
0
|
0
|
|
|
|
0
|
if (ref($feature->{'label'}) eq 'ARRAY') { |
1039
|
0
|
|
|
|
|
0
|
push @{ $feature->{'label'} }, @labels; |
|
0
|
|
|
|
|
0
|
|
1040
|
|
|
|
|
|
|
} |
1041
|
|
|
|
|
|
|
else { |
1042
|
0
|
|
|
|
|
0
|
$feature->{'label'} = [$feature->{'label'}, @labels]; |
1043
|
|
|
|
|
|
|
} |
1044
|
|
|
|
|
|
|
} |
1045
|
|
|
|
|
|
|
else { |
1046
|
0
|
|
|
|
|
0
|
$feature->{'label'} = [@labels]; |
1047
|
|
|
|
|
|
|
} |
1048
|
|
|
|
|
|
|
} |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
{ |
1051
|
|
|
|
|
|
|
package Data::FeatureFactory::SoftError; |
1052
|
|
|
|
|
|
|
sub new { |
1053
|
4
|
|
|
4
|
|
7
|
my ($class, $message) = @_; |
1054
|
4
|
50
|
|
|
|
9
|
$message = "SoftError occurred" if not defined $message; |
1055
|
4
|
|
|
|
|
28
|
return bless \$message, $class |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
} |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
1 |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
__END__ |