File Coverage

blib/lib/Algorithm/LibLinear/FeatureScaling.pm

Criterion	Covered	Total	%
statement	66	99	66.6
branch	10	24	41.6
condition	7	23	30.4
subroutine	15	19	78.9
pod	8	12	66.6
total	106	177	59.8

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::LibLinear::FeatureScaling;
2
3	1			1		679	use 5.014;
	1					4
	1					47
4	1			1		6	use Algorithm::LibLinear::Types;
	1					2
	1					25
5	1			1		5	use Carp qw//;
	1					2
	1					21
6	1			1		8	use List::MoreUtils qw/minmax none/;
	1					2
	1					68
7	1			1		6	use List::Util qw/max/;
	1					2
	1					104
8	1			1		5	use Smart::Args;
	1					2
	1					1125
9
10							sub new {
11	2			2	1	30	args
12							my $class => 'ClassName',
13							my $data_set => +{
14							isa => 'Algorithm::LibLinear::DataSet',
15							optional => 1,
16							},
17							my $lower_bound => +{ isa => 'Num', default => 0, },
18							my $min_max_values => +{
19							isa => 'ArrayRef[ArrayRef[Num]]',
20							optional => 1,
21							},
22							my $upper_bound => +{ isa => 'Num', default => 1.0, };
23
24	2	50	33			754	unless ($data_set or $min_max_values) {
25	0					0	Carp::croak('Neither "data_set" nor "min_max_values" is specified.');
26							}
27
28	2					11	my $self = bless +{
29							lower_bound => $lower_bound,
30							upper_bound => $upper_bound,
31							} => $class;
32
33	2		33			13	$self->{min_max_values} = $min_max_values
34							// $self->compute_min_max_values(data_set => $data_set);
35
36	2					7	return $self;
37							}
38
39							sub load {
40	0			0	1	0	args
41							my $class => 'ClassName',
42							my $fh => +{ isa => 'FileHandle', optional => 1, },
43							my $filename => +{ isa => 'Str', optional => 1, },
44							my $string => +{ isa => 'Str', optional => 1, };
45
46	0	0		0		0	if (none { defined } ($filename, $fh, $string)) {
	0					0
47	0					0	Carp::croak('No source specified.');
48							}
49	0					0	my $source = $fh;
50	0		0			0	$source //= do {
51	0	0	0			0	open $fh, '<', +($filename // \$string) or Carp::croak($!);
52	0					0	$fh;
53							};
54
55	0					0	chomp(my $header = <$source>);
56	0	0				0	Carp::croak('At present, y-scaling is not supported.') if $header eq 'y';
57	0	0				0	Carp::croak('Invalid format.') if $header ne 'x';
58
59	0					0	chomp(my $bounds = <$source>);
60	0					0	my ($lower_bound, $upper_bound) = split /\s+/, $bounds;
61
62	0					0	my @min_max_values;
63	0					0	while (defined(my $min_max_values = <$source>)) {
64	0					0	chomp $min_max_values;
65	0					0	my (undef, $min, $max) = split /\s+/, $min_max_values;
66	0					0	push @min_max_values, [ $min, $max ];
67							}
68
69							$class->new(
70	0					0	lower_bound => $lower_bound,
71							min_max_values => \@min_max_values,
72							upper_bound => $upper_bound,
73							);
74							}
75
76							sub as_string {
77	0			0	1	0	args
78							my $self;
79	0					0	my $acc =
80							sprintf "x\n%.16g %.16g\n", $self->lower_bound, $self->upper_bound;
81	0					0	my $index = 0;
82	0					0	for my $min_max_value (@{ $self->min_max_values }) {
	0					0
83	0					0	$acc .= sprintf "\%d %.16g %.16g\n", ++$index, @$min_max_value;
84							}
85	0					0	return $acc;
86							}
87
88							sub compute_min_max_values {
89	2			2	0	6	args
90							my $self,
91							my $data_set => 'Algorithm::LibLinear::DataSet';
92
93	2					103	my @feature_vectors = map { $_->{feature} } @{ $data_set->as_arrayref };
	7					18
	2					8
94	2					6	my $last_index = max map { keys %$_ } @feature_vectors;
	7					46
95	2					5	my @min_max_values;
96	2					6	for my $i (1 .. $last_index) {
97	8		100			12	my ($min, $max) = minmax map { $_->{$i} // 0 } @feature_vectors;
	29					118
98	8					25	push @min_max_values, [ $min, $max ];
99							}
100	2					15	return \@min_max_values;
101							}
102
103	19			19	1	97	sub lower_bound { $_[0]->{lower_bound} }
104
105	8			8	1	16	sub min_max_values { $_[0]->{min_max_values} }
106
107							sub save {
108	0			0	1	0	args
109							my $self,
110							my $fh => +{ isa => 'FileHandle', optional => 1, },
111							my $filename => +{ isa => 'Str', optional => 1, };
112
113	0	0	0			0	unless ($filename or $fh) {
114	0					0	Carp::croak('Neither "filename" nor "fh" is given.');
115							}
116	0	0	0			0	open $fh, '>', $filename or Carp::croak($!) unless $fh;
117	0					0	print $fh $self->as_string;
118							}
119
120							sub scale {
121	3			3	1	25	args_pos
122							my $self,
123							my $target_type => 'Str',
124							my $target;
125
126	3					241	my $method = $self->can("scale_$target_type");
127	3	50				63	unless ($method) {
128	0					0	Carp::croak("Cannot scale such type of target: $target_type.");
129							}
130	3					7	$self->$method($target);
131							}
132
133							sub scale_data_set {
134	3			3	0	11	args_pos
135							my $self,
136							my $data_set => 'Algorithm::LibLinear::DataSet';
137
138	8					21	my @scaled_data_set =
139	3					130	map { $self->scale_labeled_data($_) } @{ $data_set->as_arrayref };
	3					20
140	3					16	Algorithm::LibLinear::DataSet->new(data_set => \@scaled_data_set);
141							}
142
143							sub scale_feature {
144	8			8	0	21	args_pos
145							my $self,
146							my $feature => 'Algorithm::LibLinear::Feature';
147
148	8					82	my ($lower_bound, $upper_bound) = ($self->lower_bound, $self->upper_bound);
149	8					19	my $min_max_values = $self->min_max_values;
150	8					13	my %scaled_feature;
151	8					20	for my $index (1 .. @$min_max_values) {
152	34		100			101	my $unscaled = $feature->{$index} // 0;
153	34		50			32	my ($min, $max) = @{ $min_max_values->[$index - 1] // [0, 0] };
	34					96
154	34	100				69	next if $min == $max;
155	29					23	my $scaled;
156	29	100				59	if ($unscaled == $min) {
		100
157	11					12	$scaled = $lower_bound;
158							} elsif ($unscaled == $max) {
159	9					12	$scaled = $upper_bound;
160							} else {
161	9					21	my $ratio = ($unscaled - $min) / ($max - $min);
162	9					16	$scaled = $lower_bound + ($upper_bound - $lower_bound) * $ratio;
163							}
164	29	100				95	$scaled_feature{$index} = $scaled if $scaled != 0;
165							}
166	8					61	return \%scaled_feature;
167							}
168
169							sub scale_labeled_data {
170	8			8	0	23	args_pos
171							my $self,
172							my $labeled_data => 'Algorithm::LibLinear::LabeledData';
173
174							+{
175	8					89	feature => $self->scale_feature($labeled_data->{feature}),
176							label => $labeled_data->{label},
177							};
178							}
179
180	19			19	1	73	sub upper_bound { $_[0]->{upper_bound} }
181
182							1;
183
184							__DATA__
185
186							=head1 NAME
187
188							Algorithm::LibLinear::FeatureScaling
189
190							=head1 SYNOPSIS
191
192							use Algorithm::LibLinear::DataSet;
193							use Algorithm::LibLinear::FeatureScaling;
194
195							my $scale = Algorithm::LibLinear::FeatureScaling->new(
196							data_set => Algorithm::LibLinear::DataSet->new(...),
197							lower_bound => -10,
198							upper_bound => 10,
199							);
200							my $scale = Algorithm::LibLinear::FeatureScaling->load(
201							filename => '/path/to/file',
202							);
203
204							my $scaled_feature = $scale->scale(feature => +{ 1 => 30, 2 => - 25, ... });
205							my $scaled_labeled_data = $scale->scale(
206							labeled_data => +{ feature => +{ 1 => 30, ... }, label => 1 },
207							);
208							my $scaled_data_set = $scale->scale(
209							data_set => Algorithm::LibLinear::DataSet->new(...),
210							);
211
212							say $scale->as_string;
213							$scale->save(filename => '/path/to/another/file');
214
215							=head1 DESCRIPTION
216
217							Support vector classification is essentially a calculation of the inner product of a feature vector and the normal vector of the separating hyperplane. If some elements of the feature vectors have a greater dynamic range than others, they can have a stronger influence on the final result.
218
219							For example, consider the normal vector C<{ 1 1 1 }> and the feature vectors to be classified C<{ -2 10 5 }>, C<{ 5 -50 0 }> and C<{ 10 100 10 }>. The inner products of the normal vector with these feature vectors are 13, -45 and 120 respectively. Clearly, the 2nd elements of the feature vectors have a wider dynamic range than the 1st and 3rd ones and dominate the result.
220
221							To avoid this problem, it is very important to scale the elements of the vectors so that they all have the same dynamic range. This module provides such vector scaling functionality. If you are familiar with the LIBSVM distribution, you can think of it as a library version of the C<svm-scale> command, written in Perl.
222
223							=head1 METHODS
224
225							=head2 new(data_set => $data_set | min_max_values => \@min_max_values [, lower_bound => 0.0] [, upper_bound => 1.0])
226
227							Constructor. It accepts the named parameters below. At least one of C<data_set> and C<min_max_values> is required.
228
229							=over 4
230
231							=item data_set
232
233							An instance of L<Algorithm::LibLinear::DataSet>. This is used to compute the dynamic range of each vector element.
234
235							=item min_max_values
236
237							Pre-calculated dynamic ranges of each vector element. Its structure is like:
238
239							my @min_max_values = (
240							[ -10, 10 ], # Dynamic range of 1st elements of vectors.
241							[ 0, 1 ], # 2nd
242							[ -1, 1 ], # 3rd
243							...
244							);
245
246							=item lower_bound
247
248							=item upper_bound
249
250							The lower/upper limits of dynamic range for each element. Default values are 0.0 and 1.0 respectively.
251
252							=back
253
254							=head2 load(filename => $path | fh => \*FH | string => $content)
255
256							Class method. Creates a new instance from a dumped scaling parameter file.
257
258							Please note that this method can parse only a subset of C<svm-scale>'s file format at present.
259
260							=head2 as_string
261
262							Dumps the scaling parameters in C<svm-scale>'s format.
263
264							=head2 save(filename => $path | fh => \*FH)
265
266							Writes the result of C<as_string> out to a file.
267
268							=head2 scale(data_set => $data_set | feature => \%feature | labeled_data => \%labeled_data)
269
270							Scale the given feature, labeled data or data set.
271
272							=head1 SEE ALSO
273
274							L<A Practical Guide to Support Vector Classification|http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf> - To understand the importance of scaling, see Chapter 2.2 and Appendices A and B.
275
276							=cut