File Coverage

blib/lib/Algorithm/LibLinear/FeatureScaling.pm
Criterion Covered Total %
statement 66 99 66.6
branch 10 24 41.6
condition 7 23 30.4
subroutine 15 19 78.9
pod 8 12 66.6
total 106 177 59.8


line stmt bran cond sub pod time code
1             package Algorithm::LibLinear::FeatureScaling;
2              
3 1     1   679 use 5.014;
  1         4  
  1         47  
4 1     1   6 use Algorithm::LibLinear::Types;
  1         2  
  1         25  
5 1     1   5 use Carp qw//;
  1         2  
  1         21  
6 1     1   8 use List::MoreUtils qw/minmax none/;
  1         2  
  1         68  
7 1     1   6 use List::Util qw/max/;
  1         2  
  1         104  
8 1     1   5 use Smart::Args;
  1         2  
  1         1125  
9              
10             sub new {
11 2     2 1 30 args
12             my $class => 'ClassName',
13             my $data_set => +{
14             isa => 'Algorithm::LibLinear::DataSet',
15             optional => 1,
16             },
17             my $lower_bound => +{ isa => 'Num', default => 0, },
18             my $min_max_values => +{
19             isa => 'ArrayRef[ArrayRef[Num]]',
20             optional => 1,
21             },
22             my $upper_bound => +{ isa => 'Num', default => 1.0, };
23              
24 2 50 33     754 unless ($data_set or $min_max_values) {
25 0         0 Carp::croak('Neither "data_set" nor "min_max_values" is specified.');
26             }
27              
28 2         11 my $self = bless +{
29             lower_bound => $lower_bound,
30             upper_bound => $upper_bound,
31             } => $class;
32              
33 2   33     13 $self->{min_max_values} = $min_max_values
34             // $self->compute_min_max_values(data_set => $data_set);
35              
36 2         7 return $self;
37             }
38              
39             sub load {
40 0     0 1 0 args
41             my $class => 'ClassName',
42             my $fh => +{ isa => 'FileHandle', optional => 1, },
43             my $filename => +{ isa => 'Str', optional => 1, },
44             my $string => +{ isa => 'Str', optional => 1, };
45              
46 0 0   0   0 if (none { defined } ($filename, $fh, $string)) {
  0         0  
47 0         0 Carp::croak('No source specified.');
48             }
49 0         0 my $source = $fh;
50 0   0     0 $source //= do {
51 0 0 0     0 open $fh, '<', +($filename // \$string) or Carp::croak($!);
52 0         0 $fh;
53             };
54              
55 0         0 chomp(my $header = <$source>);
56 0 0       0 Carp::croak('At present, y-scaling is not supported.') if $header eq 'y';
57 0 0       0 Carp::croak('Invalid format.') if $header ne 'x';
58              
59 0         0 chomp(my $bounds = <$source>);
60 0         0 my ($lower_bound, $upper_bound) = split /\s+/, $bounds;
61              
62 0         0 my @min_max_values;
63 0         0 while (defined(my $min_max_values = <$source>)) {
64 0         0 chomp $min_max_values;
65 0         0 my (undef, $min, $max) = split /\s+/, $min_max_values;
66 0         0 push @min_max_values, [ $min, $max ];
67             }
68              
69             $class->new(
70 0         0 lower_bound => $lower_bound,
71             min_max_values => \@min_max_values,
72             upper_bound => $upper_bound,
73             );
74             }
75              
76             sub as_string {
77 0     0 1 0 args
78             my $self;
79 0         0 my $acc =
80             sprintf "x\n%.16g %.16g\n", $self->lower_bound, $self->upper_bound;
81 0         0 my $index = 0;
82 0         0 for my $min_max_value (@{ $self->min_max_values }) {
  0         0  
83 0         0 $acc .= sprintf "\%d %.16g %.16g\n", ++$index, @$min_max_value;
84             }
85 0         0 return $acc;
86             }
87              
88             sub compute_min_max_values {
89 2     2 0 6 args
90             my $self,
91             my $data_set => 'Algorithm::LibLinear::DataSet';
92              
93 2         103 my @feature_vectors = map { $_->{feature} } @{ $data_set->as_arrayref };
  7         18  
  2         8  
94 2         6 my $last_index = max map { keys %$_ } @feature_vectors;
  7         46  
95 2         5 my @min_max_values;
96 2         6 for my $i (1 .. $last_index) {
97 8   100     12 my ($min, $max) = minmax map { $_->{$i} // 0 } @feature_vectors;
  29         118  
98 8         25 push @min_max_values, [ $min, $max ];
99             }
100 2         15 return \@min_max_values;
101             }
102              
103 19     19 1 97 sub lower_bound { $_[0]->{lower_bound} }
104              
105 8     8 1 16 sub min_max_values { $_[0]->{min_max_values} }
106              
107             sub save {
108 0     0 1 0 args
109             my $self,
110             my $fh => +{ isa => 'FileHandle', optional => 1, },
111             my $filename => +{ isa => 'Str', optional => 1, };
112              
113 0 0 0     0 unless ($filename or $fh) {
114 0         0 Carp::croak('Neither "filename" nor "fh" is given.');
115             }
116 0 0 0     0 open $fh, '>', $filename or Carp::croak($!) unless $fh;
117 0         0 print $fh $self->as_string;
118             }
119              
120             sub scale {
121 3     3 1 25 args_pos
122             my $self,
123             my $target_type => 'Str',
124             my $target;
125              
126 3         241 my $method = $self->can("scale_$target_type");
127 3 50       63 unless ($method) {
128 0         0 Carp::croak("Cannot scale such type of target: $target_type.");
129             }
130 3         7 $self->$method($target);
131             }
132              
133             sub scale_data_set {
134 3     3 0 11 args_pos
135             my $self,
136             my $data_set => 'Algorithm::LibLinear::DataSet';
137              
138 8         21 my @scaled_data_set =
139 3         130 map { $self->scale_labeled_data($_) } @{ $data_set->as_arrayref };
  3         20  
140 3         16 Algorithm::LibLinear::DataSet->new(data_set => \@scaled_data_set);
141             }
142              
143             sub scale_feature {
144 8     8 0 21 args_pos
145             my $self,
146             my $feature => 'Algorithm::LibLinear::Feature';
147              
148 8         82 my ($lower_bound, $upper_bound) = ($self->lower_bound, $self->upper_bound);
149 8         19 my $min_max_values = $self->min_max_values;
150 8         13 my %scaled_feature;
151 8         20 for my $index (1 .. @$min_max_values) {
152 34   100     101 my $unscaled = $feature->{$index} // 0;
153 34   50     32 my ($min, $max) = @{ $min_max_values->[$index - 1] // [0, 0] };
  34         96  
154 34 100       69 next if $min == $max;
155 29         23 my $scaled;
156 29 100       59 if ($unscaled == $min) {
    100          
157 11         12 $scaled = $lower_bound;
158             } elsif ($unscaled == $max) {
159 9         12 $scaled = $upper_bound;
160             } else {
161 9         21 my $ratio = ($unscaled - $min) / ($max - $min);
162 9         16 $scaled = $lower_bound + ($upper_bound - $lower_bound) * $ratio;
163             }
164 29 100       95 $scaled_feature{$index} = $scaled if $scaled != 0;
165             }
166 8         61 return \%scaled_feature;
167             }
168              
169             sub scale_labeled_data {
170 8     8 0 23 args_pos
171             my $self,
172             my $labeled_data => 'Algorithm::LibLinear::LabeledData';
173              
174             +{
175 8         89 feature => $self->scale_feature($labeled_data->{feature}),
176             label => $labeled_data->{label},
177             };
178             }
179              
180 19     19 1 73 sub upper_bound { $_[0]->{upper_bound} }
181              
182             1;
183              
184             __DATA__
185              
186             =head1 NAME
187              
188             Algorithm::LibLinear::FeatureScaling
189              
190             =head1 SYNOPSIS
191              
192             use Algorithm::LibLinear::DataSet;
193             use Algorithm::LibLinear::FeatureScaling;
194            
195             my $scale = Algorithm::LibLinear::FeatureScaling->new(
196             data_set => Algorithm::LibLinear::DataSet->new(...),
197             lower_bound => -10,
198             upper_bound => 10,
199             );
200             my $scale = Algorithm::LibLinear::FeatureScaling->load(
201             filename => '/path/to/file',
202             );
203            
204             my $scaled_feature = $scale->scale(feature => +{ 1 => 30, 2 => - 25, ... });
205             my $scaled_labeled_data = $scale->scale(
206             labeled_data => +{ feature => +{ 1 => 30, ... }, label => 1 },
207             );
208             my $scaled_data_set = $scale->scale(
209             data_set => Algorithm::LibLinear::DataSet->new(...),
210             );
211            
212             say $scale->as_string;
213             $scale->save(filename => '/path/to/another/file');
214              
215             =head1 DESCRIPTION
216              
217             Support vector classification is essentially the calculation of an inner product between a feature vector and the normal vector of the separating hyperplane. If some elements of the feature vectors have a greater dynamic range than others, they can have a stronger influence on the final result.
218              
219             For example, suppose the normal vector is C<{ 1 1 1 }> and the feature vectors to be classified are C<{ -2 10 5 }>, C<{ 5 -50 0 }> and C<{ 10 100 10 }>. The inner products of the normal vector with these feature vectors are 13, -45 and 120 respectively. Clearly, the 2nd elements of the feature vectors have a wider dynamic range than the 1st and 3rd ones and dominate the result.
220              
221             To avoid this problem, it is very important to scale the elements of the vectors so that they all have the same dynamic range. This module provides such vector scaling functionality. If you are familiar with the LIBSVM distribution, you can think of this as a library version of its C<svm-scale> command, written in Perl.
222              
223             =head1 METHODS
224              
225             =head2 new(data_set => $data_set | min_max_values => \@min_max_values [, lower_bound => 0.0] [, upper_bound => 1.0])
226              
227             Constructor. It accepts the named parameters below. At least one of C<data_set> and C<min_max_values> is required; if both are given, C<min_max_values> takes precedence and C<data_set> is not used.
228              
229             =over 4
230              
231             =item data_set
232              
233             An instance of L<Algorithm::LibLinear::DataSet>. This is used to compute the dynamic range of each vector element.
234              
235             =item min_max_values
236              
237             Pre-calculated dynamic ranges of each vector element. Its structure is like:
238              
239             my @min_max_values = (
240             [ -10, 10 ], # Dynamic range of 1st elements of vectors.
241             [ 0, 1 ], # 2nd
242             [ -1, 1 ], # 3rd
243             ...
244             );
245              
246             =item lower_bound
247              
248             =item upper_bound
249              
250             The lower/upper limits of dynamic range for each element. Default values are 0.0 and 1.0 respectively.
251              
252             =back
253              
254             =head2 load(filename => $path | fh => \*FH | string => $content)
255              
256             Class method. Creates a new instance from dumped scaling parameters, given as a file name, an open file handle or the file content as a string.
257              
258             Please note that this method can parse only a subset of C<svm-scale>'s file format at present.
259              
260             =head2 as_string
261              
262             Dumps the scaling parameters in C<svm-scale>'s file format.
263              
264             =head2 save(filename => $path | fh => \*FH)
265              
266             Writes the result of C<as_string> out to a file.
267              
268             =head2 scale(data_set => $data_set | feature => \%feature | labeled_data => \%labeled_data)
269              
270             Scales the given feature, labeled data or data set and returns the scaled result.
271              
272             =head1 SEE ALSO
273              
274             L<A Practical Guide to Support Vector Classification|http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf> - For understanding importance of scaling, see Chapter 2.2, appendix A and B.
275              
276             =cut