line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::JA::Categorize::Result; |
2
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
3
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
4
|
1
|
|
|
1
|
|
4
|
use List::Util qw(sum); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
109
|
|
5
|
1
|
|
|
1
|
|
4
|
use base qw( Lingua::JA::Categorize::Base ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
512
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Returns an array reference of single-pair hashes ({ word => weight }),
# built from the object's 'word_set' hash and ordered by descending weight.
sub word_set {
    my ($self) = @_;
    my $weight_of = $self->{word_set};

    # Heaviest entries first; every element carries exactly one key/value.
    my @ranked = map { +{ $_ => $weight_of->{$_} } }
        sort { $weight_of->{$b} <=> $weight_of->{$a} } keys %$weight_of;

    return \@ranked;
}
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Returns an array reference of { label => score } hashes taken from the
# object's 'score' hash in descending score order.
#
#   $num - how many of the highest-ranked labels to examine (defaults to 3
#          when false). Labels whose score is not positive are examined but
#          left out of the result, so fewer than $num entries may come back.
#
# Returns undef when word_set() yields no entries.
sub score {
    my ( $self, $num ) = @_;
    $num = 3 unless $num;

    return undef if !$self->word_set->[0];

    my $score_of = $self->{score};
    my @ranked = sort { $score_of->{$b} <=> $score_of->{$a} } keys %$score_of;

    my @top;
    my $examined = 0;
    for my $label (@ranked) {
        push @top, { $label => $score_of->{$label} }
            if $score_of->{$label} > 0;

        # The counter tracks labels looked at, not labels kept.
        last if ++$examined == $num;
    }
    return \@top;
}
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Combines three heuristic values into one confidence figure.
#
# The result is the weighted average (weights 5, 1, 1) of:
#   * _match_word_point  - value derived from matched vs. unmatched words
#   * _entropy_point     - value derived from the entropy of the top scores
#   * a distance value   - 1 - (_distance_point(3) / _distance_point(10))
#
# When _distance_point(10) is zero (or undefined) the ratio cannot be
# formed; the distance value is then taken as 0 instead of dying with
# "Illegal division by zero".
sub confidence {
    my $self = shift;

    # Confidence from matched / unmatched words
    my $match_word_point = $self->_match_word_point;

    # Confidence from entropy
    my $entropy_point = $self->_entropy_point;

    # Distance: spread of the top 3 relative to the spread of the top 10
    my $v3             = $self->_distance_point(3);
    my $v10            = $self->_distance_point(10);
    my $distance_point = $v10 ? 1 - $v3 / $v10 : 0;

    # Linear combination (the weights are arbitrary)
    my $w1 = 5;
    my $w2 = 1;
    my $w3 = 1;
    my $confidence_point
        = (   $w1 * $match_word_point
            + $w2 * $entropy_point
            + $w3 * $distance_point ) / ( $w1 + $w2 + $w3 );

    #print "M:", $match_word_point, "\n";
    #print "E:", $entropy_point, "\n";
    #print "D:", $distance_point, "\n";

    return $confidence_point;
}
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Average distance of the top-$n categories from their common centroid.
#
#   $n - number of top-ranked labels requested from score() (default 3).
#
# For every label returned by score($n) the attribute hash is read from
# $brain->{training_data}{labels}{LABEL}{attributes}. The centroid is the
# per-attribute mean over the labels that carry that attribute, and the
# return value is the mean of _distance(centroid, attributes) over the labels.
#
# Returns 0 when score() yields no categories (the original dereferenced
# undef or divided by zero in that case). Unknown labels are treated as
# having no attributes and are no longer autovivified into the brain's data.
sub _distance_point {
    my $self  = shift;
    my $n     = shift || 3;
    my $brain = $self->context->categorizer->brain;

    my @categories = map { keys %$_ } @{ $self->score($n) || [] };
    return 0 unless @categories;

    # Extract the required data without creating missing entries in $brain.
    my $labels = ( $brain->{training_data} || {} )->{labels} || {};
    my %data;
    for my $label (@categories) {
        my $entry = $labels->{$label};
        $data{$label} = ( $entry && $entry->{attributes} ) || {};
    }

    # Compute the centroid
    my ( %counter, %sum );
    for my $attributes ( values %data ) {
        for my $attr ( keys %$attributes ) {
            $counter{$attr}++;
            $sum{$attr} += $attributes->{$attr};
        }
    }
    my %centroid = map { $_ => $sum{$_} / $counter{$_} } keys %sum;

    # Average distance from the centroid
    my @distances
        = map { scalar $self->_distance( \%centroid, $data{$_} ) } @categories;
    return sum(@distances) / scalar(@distances);
}
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# Euclidean distance between two sparse vectors.
#
#   $arg1, $arg2 - hash references mapping attribute => numeric value.
#                  An attribute missing from one side counts as 0 there.
#                  An undefined argument is treated as an empty vector.
#
# Returns the square root of the summed squared differences (0 when both
# vectors are empty). Neither input hash is copied or modified.
sub _distance {
    my ( $self, $arg1, $arg2 ) = @_;
    my $from = $arg1 || {};
    my $to   = $arg2 || {};

    # Start at 0 so that empty input does not hand undef to sqrt().
    my $sum = 0;
    my %seen;
    for my $attr ( grep { !$seen{$_}++ } keys %$from, keys %$to ) {
        my $diff = ( $from->{$attr} || 0 ) - ( $to->{$attr} || 0 );
        $sum += $diff**2;
    }
    return sqrt($sum);
}
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Share of the total word weight that belongs to matched words.
#
# Sums the 'word_set' weights of every word listed by matches() and by
# no_matches() and returns matched / (matched + unmatched).
#
# Returns 0 when the combined weight is zero (no words at all), where the
# original died with "Illegal division by zero". Words that have no entry
# in 'word_set' contribute 0 instead of raising an undef warning.
sub _match_word_point {
    my $self      = shift;
    my $weight_of = $self->{word_set} || {};

    my $match = 0;
    for my $word ( @{ $self->matches || [] } ) {
        $match += $weight_of->{$word} || 0;
    }

    my $no_match = 0;
    for my $word ( @{ $self->no_matches || [] } ) {
        $no_match += $weight_of->{$word} || 0;
    }

    my $total = $match + $no_match;
    return 0 unless $total;
    return $match / $total;
}
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Confidence value derived from the entropy of the scores in score(5).
#
# The scores are normalised to probabilities p, the Shannon entropy
# e = -sum(p * log2 p) is computed, and 2**e (the "effective number" of
# categories) is compared with the actual number of scores:
#
#     log2( (count - 2**e) * 1000 ) / log2( count * 1000 )
#
# Returns 0 when there are no scores, and also when the logarithm's
# argument is <= 1. The latter covers a single score or perfectly uniform
# scores, where the original called log(0) and died, and near-uniform
# scores, where the original returned a negative value.
sub _entropy_point {
    my $self = shift;

    my @scores = map { values %$_ } @{ $self->score(5) || [] };
    return 0 unless @scores;

    my $sum     = sum(@scores);
    my $entropy = 0;
    for my $score (@scores) {
        next unless $score > 0;
        my $p = $score / $sum;
        $entropy -= $p * $self->_log2($p);
    }

    my $we    = 2**$entropy;
    my $max   = scalar @scores;
    my $scale = 1000;

    # log2 of anything <= 1 is <= 0 (or undefined at <= 0): clamp to 0.
    my $headroom = ( $max - $we ) * $scale;
    return 0 if $headroom <= 1;

    return $self->_log2($headroom) / $self->_log2( $max * $scale );
}
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Base-2 logarithm of $n, computed as log($n) / log(2).
# Called as a method; the invocant is not used.
sub _log2 {
    my ( $self, $n ) = @_;
    return log($n) / log(2);
}
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# Read accessor: returns the value stored under the object's 'no_matches' key.
sub no_matches {
    my ($self) = @_;
    return $self->{no_matches};
}
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Read accessor: returns the value stored under the object's 'matches' key.
sub matches {
    my ($self) = @_;
    return $self->{matches};
}
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# Read accessor: returns the value stored under the object's
# 'user_extention' key (the spelling is part of the public name and key).
sub user_extention {
    my ($self) = @_;
    return $self->{user_extention};
}
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |
191
|
|
|
|
|
|
|
__END__ |