line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Algorithm::LibLinear::FeatureScaling; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
679
|
use 5.014; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
47
|
|
4
|
1
|
|
|
1
|
|
6
|
use Algorithm::LibLinear::Types; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
5
|
1
|
|
|
1
|
|
5
|
use Carp qw//; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
6
|
1
|
|
|
1
|
|
8
|
use List::MoreUtils qw/minmax none/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
68
|
|
7
|
1
|
|
|
1
|
|
6
|
use List::Util qw/max/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
104
|
|
8
|
1
|
|
|
1
|
|
5
|
use Smart::Args; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1125
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Constructor.
#
# Named parameters:
#   data_set       - optional Algorithm::LibLinear::DataSet; used to compute
#                    the per-feature ranges when min_max_values is not given.
#   min_max_values - optional array reference of [min, max] pairs.
#   lower_bound    - lower limit of the scaled range (defaults to 0).
#   upper_bound    - upper limit of the scaled range (defaults to 1.0).
#
# Croaks when neither data_set nor min_max_values is supplied.
sub new {
    args
        my $class => 'ClassName',
        my $data_set => +{
            isa => 'Algorithm::LibLinear::DataSet',
            optional => 1,
        },
        my $lower_bound => +{ isa => 'Num', default => 0, },
        my $min_max_values => +{
            isa => 'ArrayRef[ArrayRef[Num]]',
            optional => 1,
        },
        my $upper_bound => +{ isa => 'Num', default => 1.0, };

    Carp::croak('Neither "data_set" nor "min_max_values" is specified.')
        if !$data_set && !$min_max_values;

    my %attributes = (
        lower_bound => $lower_bound,
        upper_bound => $upper_bound,
    );
    my $self = bless \%attributes, $class;

    # Explicit ranges take precedence; otherwise derive them from the data set.
    $self->{min_max_values} =
        defined $min_max_values
        ? $min_max_values
        : $self->compute_min_max_values(data_set => $data_set);

    return $self;
}
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Class method. Builds an instance from a scaling parameter dump in the
# format written by as_string (a subset of svm-scale's range file format).
#
# Named parameters (at least one is required):
#   fh       - an already opened file handle to read from.
#   filename - path of a file to read.
#   string   - the dump itself, held in memory.
# fh is preferred when given, then filename, then string.
#
# Croaks when no source is given, when the source cannot be opened, when the
# header announces y-scaling, or when the content is malformed.
sub load {
    args
        my $class => 'ClassName',
        my $fh => +{ isa => 'FileHandle', optional => 1, },
        my $filename => +{ isa => 'Str', optional => 1, },
        my $string => +{ isa => 'Str', optional => 1, };

    if (none { defined } ($filename, $fh, $string)) {
        Carp::croak('No source specified.');
    }
    my $source = $fh;
    $source //= do {
        # Opening a reference to a scalar reads from the in-memory string.
        open $fh, '<', +($filename // \$string) or Carp::croak($!);
        $fh;
    };

    # An empty source yields undef here; treat it as an empty (invalid) header.
    chomp(my $header = <$source> // '');
    Carp::croak('At present, y-scaling is not supported.') if $header eq 'y';
    Carp::croak('Invalid format.') if $header ne 'x';

    # split ' ' ignores leading whitespace and the trailing line terminator.
    my ($lower_bound, $upper_bound) = split ' ', <$source> // '';
    Carp::croak('Invalid format.') unless defined $upper_bound;

    my @min_max_values;
    while (defined(my $line = <$source>)) {
        my ($index, $min, $max) = split ' ', $line;
        next unless defined $index;    # Blank line.
        Carp::croak('Invalid format.')
            unless $index =~ /\A[1-9][0-9]*\z/ and defined $max;
        # Honour the feature index written in the file: range files may skip
        # indices (svm-scale omits features whose min equals max), so the
        # line position alone does not identify the feature.
        $min_max_values[$index - 1] = [ $min, $max ];
    }
    # Skipped features get a degenerate range, which scale_feature ignores.
    $_ //= [ 0, 0 ] for @min_max_values;

    return $class->new(
        lower_bound => $lower_bound,
        min_max_values => \@min_max_values,
        upper_bound => $upper_bound,
    );
}
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Serializes the scaling parameters as text: an "x" header line, a line with
# the lower and upper bounds, then one "index min max" line per feature with
# indices counted from 1.
sub as_string {
    args
        my $self;

    my @lines = (
        sprintf("x\n%.16g %.16g\n", $self->lower_bound, $self->upper_bound),
    );
    my $ranges = $self->min_max_values;
    for my $position (0 .. $#$ranges) {
        push @lines,
            sprintf("\%d %.16g %.16g\n", $position + 1, @{ $ranges->[$position] });
    }
    return join '', @lines;
}
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Computes the [min, max] pair of every feature index from 1 up to the
# largest index that appears in the data set. A feature missing from a
# vector counts as 0. Returns an array reference of the pairs.
sub compute_min_max_values {
    args
        my $self,
        my $data_set => 'Algorithm::LibLinear::DataSet';

    my @vectors = map { $_->{feature} } @{ $data_set->as_arrayref };
    my $highest_index = max map { keys %$_ } @vectors;

    my @ranges = map {
        my $index = $_;
        my ($lowest, $highest) = minmax map { $_->{$index} // 0 } @vectors;
        [ $lowest, $highest ];
    } 1 .. $highest_index;

    return \@ranges;
}
102
|
|
|
|
|
|
|
|
103
|
19
|
|
|
19
|
1
|
97
|
# Accessor: the lower limit of the scaled range.
sub lower_bound {
    my ($self) = @_;
    return $self->{lower_bound};
}
104
|
|
|
|
|
|
|
|
105
|
8
|
|
|
8
|
1
|
16
|
# Accessor: the array reference of per-feature [min, max] pairs.
sub min_max_values {
    my ($self) = @_;
    return $self->{min_max_values};
}
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# Writes the result of as_string to a file.
#
# Named parameters (at least one is required):
#   fh       - an already opened, writable file handle. It is left open.
#   filename - path of the file to write; used only when fh is not given.
#
# Croaks when neither parameter is given, or when opening, writing or
# closing fails. Returns 1 on success.
sub save {
    args
        my $self,
        my $fh => +{ isa => 'FileHandle', optional => 1, },
        my $filename => +{ isa => 'Str', optional => 1, };

    unless ($filename or $fh) {
        Carp::croak('Neither "filename" nor "fh" is given.');
    }

    my $opened_here = !$fh;
    if ($opened_here) {
        open $fh, '>', $filename or Carp::croak($!);
    }

    print {$fh} $self->as_string or Carp::croak($!);

    # Buffered write errors only surface at close, so close (and check) the
    # handle we opened ourselves. A caller-supplied handle stays open.
    if ($opened_here) {
        close $fh or Carp::croak($!);
    }
    return 1;
}
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# Dispatches to the scale_<target_type> method of this object, passing it
# $target, and returns whatever that method returns.
# Croaks when no such method exists.
sub scale {
    args_pos
        my $self,
        my $target_type => 'Str',
        my $target;

    my $scaler = $self->can("scale_$target_type")
        or Carp::croak("Cannot scale such type of target: $target_type.");
    return $self->$scaler($target);
}
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Scales every labeled datum of the given data set and returns a new
# Algorithm::LibLinear::DataSet built from the scaled data.
sub scale_data_set {
    args_pos
        my $self,
        my $data_set => 'Algorithm::LibLinear::DataSet';

    my @scaled;
    for my $labeled_data (@{ $data_set->as_arrayref }) {
        push @scaled, $self->scale_labeled_data($labeled_data);
    }
    return Algorithm::LibLinear::DataSet->new(data_set => \@scaled);
}
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Scales one feature hash (feature index => value) and returns a new hash
# reference. Only indices covered by min_max_values are considered.
#   - A feature missing from the input counts as 0.
#   - Features whose recorded min equals max are left out of the result.
#   - Values equal to the recorded min/max map exactly to the lower/upper
#     bound; other values are mapped linearly.
#   - Scaled values equal to 0 are left out, keeping the result sparse.
sub scale_feature {
    args_pos
        my $self,
        my $feature => 'Algorithm::LibLinear::Feature';

    my $lower = $self->lower_bound;
    my $upper = $self->upper_bound;
    my $ranges = $self->min_max_values;

    my %scaled_feature;
    for my $position (0 .. $#$ranges) {
        my ($min, $max) = @{ $ranges->[$position] // [0, 0] };
        next if $min == $max;

        # Feature indices are 1-based; array positions are 0-based.
        my $index = $position + 1;
        my $value = $feature->{$index} // 0;

        my $scaled;
        if ($value == $min) {
            $scaled = $lower;
        }
        elsif ($value == $max) {
            $scaled = $upper;
        }
        else {
            my $ratio = ($value - $min) / ($max - $min);
            $scaled = $lower + ($upper - $lower) * $ratio;
        }

        $scaled_feature{$index} = $scaled unless $scaled == 0;
    }
    return \%scaled_feature;
}
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Returns a new labeled datum whose feature is the scaled version of the
# given one; the label is carried over unchanged.
sub scale_labeled_data {
    args_pos
        my $self,
        my $labeled_data => 'Algorithm::LibLinear::LabeledData';

    my $scaled_feature = $self->scale_feature($labeled_data->{feature});
    my %scaled = (
        feature => $scaled_feature,
        label => $labeled_data->{label},
    );
    return \%scaled;
}
179
|
|
|
|
|
|
|
|
180
|
19
|
|
|
19
|
1
|
73
|
# Accessor: the upper limit of the scaled range.
sub upper_bound {
    my ($self) = @_;
    return $self->{upper_bound};
}
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
1; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
__DATA__ |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 NAME |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Algorithm::LibLinear::FeatureScaling |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head1 SYNOPSIS |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
use Algorithm::LibLinear::DataSet; |
193
|
|
|
|
|
|
|
use Algorithm::LibLinear::FeatureScaling; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
my $scale = Algorithm::LibLinear::FeatureScaling->new( |
196
|
|
|
|
|
|
|
data_set => Algorithm::LibLinear::DataSet->new(...), |
197
|
|
|
|
|
|
|
lower_bound => -10, |
198
|
|
|
|
|
|
|
upper_bound => 10, |
199
|
|
|
|
|
|
|
); |
200
|
|
|
|
|
|
|
my $scale = Algorithm::LibLinear::FeatureScaling->load( |
201
|
|
|
|
|
|
|
filename => '/path/to/file', |
202
|
|
|
|
|
|
|
); |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
my $scaled_feature = $scale->scale(feature => +{ 1 => 30, 2 => - 25, ... }); |
205
|
|
|
|
|
|
|
my $scaled_labeled_data = $scale->scale( |
206
|
|
|
|
|
|
|
labeled_data => +{ feature => +{ 1 => 30, ... }, label => 1 }, |
207
|
|
|
|
|
|
|
); |
208
|
|
|
|
|
|
|
my $scaled_data_set = $scale->scale( |
209
|
|
|
|
|
|
|
data_set => Algorithm::LibLinear::DataSet->new(...), |
210
|
|
|
|
|
|
|
); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
say $scale->as_string; |
213
|
|
|
|
|
|
|
$scale->save(filename => '/path/to/another/file'); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head1 DESCRIPTION |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Support vector classification is essentially a calculation of the inner product of a feature vector and the normal vector of the separating hyperplane. If some elements of the feature vectors have a greater dynamic range than others, they can have a stronger influence on the final result. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
For example, consider a normal vector C<{ 1 1 1 }> and the feature vectors to be classified C<{ -2 10 5 }>, C<{ 5 -50 0 }> and C<{ 10 100 10 }>. The inner products of the normal vector with these feature vectors are 13, -45 and 120 respectively. Obviously the 2nd elements of the feature vectors have a wider dynamic range than the 1st and 3rd ones, and they dominate the result. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
To avoid such a problem, it is very important to scale the elements of the vectors so that they all have the same dynamic range. This module provides such vector scaling functionality. If you are familiar with the LIBSVM distribution, you can think of this as a library version of the C<svm-scale> command, written in Perl. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=head1 METHODS |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head2 new(data_set => $data_set | min_max_values => \@min_max_values [, lower_bound => 0.0] [, upper_bound => 1.0]) |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Constructor. You can set the named parameters below. At least one of C<data_set> and C<min_max_values> is required. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=over 4 |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=item data_set |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
An instance of L<Algorithm::LibLinear::DataSet>. This is used to compute dynamic ranges of each vector element. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item min_max_values |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
Pre-calculated dynamic ranges of each vector element. Its structure is like: |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
my @min_max_values = ( |
240
|
|
|
|
|
|
|
[ -10, 10 ], # Dynamic range of 1st elements of vectors. |
241
|
|
|
|
|
|
|
[ 0, 1 ], # 2nd |
242
|
|
|
|
|
|
|
[ -1, 1 ], # 3rd |
243
|
|
|
|
|
|
|
... |
244
|
|
|
|
|
|
|
); |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item lower_bound |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=item upper_bound |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
The lower/upper limits of dynamic range for each element. Default values are 0.0 and 1.0 respectively. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=back |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 load(filename => $path | fh => \*FH | string => $content) |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Class method. Creates new instance from dumped scaling parameter file. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
Please note that this method can parse only a subset of C<svm-scale>'s file format at present. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head2 as_string |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Dump the scaling parameter as C<svm-scale>'s format. |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=head2 save(filename => $path | fh => \*FH) |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
Writes result of C<as_string> out to a file. |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
=head2 scale(data_set => $data_set | feature => \%feature | labeled_data => \%labeled_data) |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
Scale the given feature, labeled data or data set. |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=head1 SEE ALSO |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
L<A Practical Guide to Support Vector Classification|http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf> - For understanding importance of scaling, see Chapter 2.2, appendix A and B. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=cut |