File Coverage

blib/lib/Algorithm/FeatureSelection.pm

Criterion	Covered	Total	%
statement	113	126	89.6
branch	10	16	62.5
condition	5	6	83.3
subroutine	12	15	80.0
pod	8	9	88.8
total	148	172	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::FeatureSelection;
2	6			6		4047	use strict;
	6					9
	6					191
3	6			6		29	use warnings;
	6					10
	6					152
4	6			6		33	use List::Util qw(sum);
	6					11
	6					8586
5
6							our $VERSION = '0.02';
7
8							sub new {
9	5			5	1	65	my $class = shift;
10	5					25	my $self = bless {@_}, $class;
11	5					131	return $self;
12							}
13
14							sub pmi {
15	0			0	1	0	my $self = shift;
16	0					0	$self->pairewise_mutual_information(@_);
17							}
18
19							sub ig {
20	0			0	1	0	my $self = shift;
21	0					0	$self->information_gain(@_);
22							}
23
24							sub igr {
25	0			0	1	0	my $self = shift;
26	0					0	$self->information_gain_ratio(@_);
27							}
28
29							sub pairwise_mutual_information {
30	1			1	1	94	my $self = shift;
31	1					3	my $features = shift;
32
33							## -----------------------------------------------------------------
34							##
35							## The argument is expected as below.
36							##
37							## $features = {
38							## feature_1 => {
39							## class_a => 10,
40							## class_b => 2,
41							## },
42							## feature_2 => {
43							## class_b => 11,
44							## class_d => 32
45							## },
46							## .
47							## .
48							## .
49							## };
50							##
51							## -----------------------------------------------------------------
52							##
53							## Pairewise Mutual Information
54							##
55							## PMI(w, c) = log ( P( Xw = 1, C = c ) / P( Xw=1 )P( C=c ) )
56							##
57							## c.f. w = feature
58							## c = Class
59							##
60							## -----------------------------------------------------------------
61
62	1					2	my $feature_count;
63							my $class_count;
64	0					0	my $co_occur_count;
65	0					0	my $all_features_num;
66	1					8	while ( my ( $feature, $ref ) = each %$features ) {
67	21					79	while ( my ( $class, $count ) = each %$ref ) {
68	28					51	$feature_count->{$feature} += $count;
69	28					34	$class_count->{$class} += $count;
70	28					62	$co_occur_count->{ $class . "\t" . $feature } += $count;
71	28					235	$all_features_num += $count;
72							}
73							}
74
75	1					1	my $PMI;
76
77	1					8	for ( keys %$co_occur_count ) {
78	28					38	my $f12 = $co_occur_count->{$_};
79	28					63	my ( $class, $feature ) = split "\t", $_;
80	28					41	my $f1 = $feature_count->{$feature};
81	28					33	my $f2 = $class_count->{$class};
82
83	28					61	my $pmi_score = _log2( ( $f12 / $all_features_num )
84							/ ( ( $f1 / $all_features_num ) * ( $f2 / $all_features_num ) ) );
85
86	28					77	$PMI->{$feature}->{$class} = $pmi_score;
87							}
88
89	1					12	return $PMI;
90							}
91
92							sub information_gain {
93	2			2	1	65	my $self = shift;
94	2					5	my $features = shift;
95
96							## -----------------------------------------------------------------
97							##
98							## The argument is expected as below.
99							##
100							## $features = {
101							## feature_1 => {
102							## class_a => 10,
103							## class_b => 2,
104							## },
105							## feature_2 => {
106							## class_b => 11,
107							## class_d => 32
108							## },
109							## .
110							## .
111							## .
112							## };
113							##
114							## -----------------------------------------------------------------
115							##
116							## Information Gain
117							##
118							## IG(w) = H(C) - ( P(Xw = 1) H(C\|Xw = 1) + P(Xw = 0) H(C\|Xw = 0) )
119							##
120							## c.f. w = feature
121							## C = class
122							##
123							## -----------------------------------------------------------------
124
125	2					3	my $IG;
126
127							my $classes;
128	0					0	my $classes_sum;
129	0					0	my $all_features_num;
130	2					19	while ( my ( $feature, $ref ) = each %$features ) {
131	42					110	while ( my ( $class, $count ) = each %$ref ) {
132	56					92	$classes->{$class}->{$feature} += $count;
133	56					61	$classes_sum->{$class} += $count;
134	56					237	$all_features_num += $count;
135							}
136							}
137
138	2					4	my @array;
139	2					10	while ( my ( $class, $ref ) = each %$classes ) {
140	4					28	my $sum = sum( values %$ref );
141	4					9	my $p_class = $sum / $all_features_num;
142	4					30	push @array, $p_class;
143							}
144	2					8	my $entropy = $self->entropy( \@array );
145
146	2					20	while ( my ( $feature, $ref ) = each %$features ) {
147
148	42					98	my $sum = sum( values %$ref );
149
150							# H ( C \| Xw = 1)
151	42					48	my $on_entropy;
152							{
153	42					50	my @array;
	42					42
154	42					119	while ( my ( $class, $count ) = each %$ref ) {
155	56					74	my $p_class_feature = $count / $sum;
156	56					209	push @array, $p_class_feature;
157							}
158
159	42		100			91	$on_entropy = $self->entropy( \@array ) \|\| 0;
160							}
161
162							# H ( C \| Xw = 0)
163	42					60	my $off_entropy;
164							{
165	42					57	my @array;
	42					48
166	42					166	while ( my ( $class, $count ) = each %$ref ) {
167
168	56					108	my $p_class_feature = ( $classes_sum->{$class} - $count )
169							/ ( $all_features_num - $sum );
170	56					185	push @array, $p_class_feature;
171							}
172
173	42		100			93	$off_entropy = $self->entropy( \@array ) \|\| 0;
174							}
175
176							# Information Gain
177	42					121	my $ig
178							= $entropy
179							- ( ( $sum / $all_features_num )
180							* $on_entropy
181							+ ( ( $all_features_num - $sum ) / $all_features_num )
182							* $off_entropy );
183
184	42					209	$IG->{$feature} = $ig;
185							}
186
187	2					19	return $IG;
188							}
189
190							sub information_gain_ratio {
191	1			1	1	58	my $self = shift;
192	1					2	my $data = shift;
193
194	1					5	my $SI = $self->split_information($data);
195	1					4	my $IG = $self->information_gain($data);
196	1					3	my $IGR;
197	1					12	for ( sort { $IG->{$b} <=> $IG->{$a} } keys %$IG ) {
	72					109
198	21	50				53	if ( my $ratio = $IG->{$_} / $SI ) {
199	21	50				83	$IGR->{$_} = $ratio if $ratio > 0;
200							}
201							}
202	1					10	return $IGR;
203							}
204
205							sub entropy {
206	91			91	1	132	my $self = shift;
207	91					109	my $data = shift;
208
209	91					91	my @ratio;
210	91	100				298	if ( ref $data eq 'HASH' ) {
		50
211	2					10	@ratio = _ratio( [ values %$data ] );
212							}
213							elsif ( ref $data eq 'ARRAY' ) {
214	89		50			290	my $s = sum(@$data) \|\| 0;
215	89	100				161	if ( $s == 1 ) {
216	59					127	@ratio = @$data;
217							}
218							else {
219	30					59	@ratio = _ratio($data);
220							}
221							}
222
223	91					113	my $entropy;
224	91					123	for my $p (@ratio) {
225	135	50				287	if ( $p <= 0 ) {
226	0					0	$p = 0.000000000000000000000001;
227							}
228
229	135					229	$entropy += -$p * _log2($p);
230							}
231	91					411	return $entropy;
232
233							}
234
235							sub split_information {
236	1			1	0	2	my $self = shift;
237	1					1	my $data = shift;
238
239	1					4	my $all = int keys %$data;
240	1					2	my $s;
241	1					5	while ( my ( $w, $ref ) = each %$data ) {
242	21					130	for my $category ( keys %$ref ) {
243	28					95	$s->{$category}++;
244							}
245							}
246	1					2	my @array;
247	1					5	while ( my ( $category, $num ) = each %$s ) {
248	2					8	push @array, $num / $all;
249							}
250	1					5	my $SI = $self->entropy( \@array );
251	1					4	return $SI;
252							}
253
254							sub _ratio {
255	32			32		42	my $arrayref = shift;
256	32					34	my @ratio;
257	32					77	my $sum = sum(@$arrayref);
258	32					55	for (@$arrayref) {
259	42	50				94	next if $_ <= 0;
260	42					55	eval { push @ratio, $_ / $sum; };
	42					72
261	42	50				118	if ($@) {
262	6			6		7406	use Data::Dumper;
	6					96240
	6					1019
263	0					0	print Dumper $arrayref;
264	0					0	die($@);
265							}
266							}
267	32					102	return @ratio;
268							}
269
270							sub _log2 {
271	163			163		237	my $n = shift;
272	163					521	log($n) / log(2);
273							}
274
275							1;
276							__END__