| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::JA::Categorize::Result; |
|
2
|
1
|
|
|
1
|
|
4
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
35
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
4
|
1
|
|
|
1
|
|
4
|
use List::Util qw(sum); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
109
|
|
|
5
|
1
|
|
|
1
|
|
4
|
use base qw( Lingua::JA::Categorize::Base ); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
512
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
sub word_set { |
|
8
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
9
|
0
|
|
|
|
|
|
my $word_set = $self->{word_set}; |
|
10
|
0
|
|
|
|
|
|
my @list; |
|
11
|
0
|
|
|
|
|
|
for ( sort { $word_set->{$b} <=> $word_set->{$a} } keys %$word_set ) { |
|
|
0
|
|
|
|
|
|
|
|
12
|
0
|
|
|
|
|
|
push( @list, { $_ => $word_set->{$_} } ); |
|
13
|
|
|
|
|
|
|
} |
|
14
|
0
|
|
|
|
|
|
return \@list; |
|
15
|
|
|
|
|
|
|
} |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
sub score { |
|
18
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
19
|
0
|
|
|
|
|
|
my $num = shift; |
|
20
|
0
|
|
0
|
|
|
|
$num ||= 3; |
|
21
|
0
|
0
|
|
|
|
|
unless ( $self->word_set->[0] ) { |
|
22
|
0
|
|
|
|
|
|
return undef; |
|
23
|
|
|
|
|
|
|
} |
|
24
|
0
|
|
|
|
|
|
my $score = $self->{score}; |
|
25
|
0
|
|
|
|
|
|
my @list; |
|
26
|
0
|
|
|
|
|
|
my $i = 0; |
|
27
|
0
|
|
|
|
|
|
for ( sort { $score->{$b} <=> $score->{$a} } keys %$score ) { |
|
|
0
|
|
|
|
|
|
|
|
28
|
0
|
0
|
|
|
|
|
push( @list, { $_ => $score->{$_} } ) if $score->{$_} > 0; |
|
29
|
0
|
|
|
|
|
|
$i++; |
|
30
|
0
|
0
|
|
|
|
|
last if ( $i == $num ); |
|
31
|
|
|
|
|
|
|
} |
|
32
|
0
|
|
|
|
|
|
return \@list; |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub confidence { |
|
36
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# マッチor ノーマッチによる確信度計算 |
|
39
|
0
|
|
|
|
|
|
my $match_word_point = $self->_match_word_point; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# エントロピーによる確信度計算 |
|
42
|
0
|
|
|
|
|
|
my $entropy_point = $self->_entropy_point; |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# 距離計算 |
|
45
|
0
|
|
|
|
|
|
my $v3 = $self->_distance_point(3); |
|
46
|
0
|
|
|
|
|
|
my $v10 = $self->_distance_point(10); |
|
47
|
0
|
|
|
|
|
|
my $distance_point = 1 - $v3 / $v10; |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# 線形結合(重みは適当) |
|
50
|
0
|
|
|
|
|
|
my $w1 = 5; |
|
51
|
0
|
|
|
|
|
|
my $w2 = 1; |
|
52
|
0
|
|
|
|
|
|
my $w3 = 1; |
|
53
|
0
|
|
|
|
|
|
my $confidence_point |
|
54
|
|
|
|
|
|
|
= ( $w1 * $match_word_point |
|
55
|
|
|
|
|
|
|
+ $w2 * $entropy_point |
|
56
|
|
|
|
|
|
|
+ $w3 * $distance_point ) / ( $w1 + $w2 + $w3 ); |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
#print "M:", $match_word_point, "\n"; |
|
59
|
|
|
|
|
|
|
#print "E:", $entropy_point, "\n"; |
|
60
|
|
|
|
|
|
|
#print "D:", $distance_point, "\n"; |
|
61
|
|
|
|
|
|
|
|
|
62
|
0
|
|
|
|
|
|
return $confidence_point; |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub _distance_point { |
|
67
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
68
|
0
|
|
0
|
|
|
|
my $n = shift || 3; |
|
69
|
0
|
|
|
|
|
|
my $brain = $self->context->categorizer->brain; |
|
70
|
|
|
|
|
|
|
my @categories |
|
71
|
0
|
|
|
|
|
|
= map { keys %$_; } @{ $self->score($n) }; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# 必要なデータを抽出 |
|
74
|
0
|
|
|
|
|
|
my $data; |
|
75
|
0
|
|
|
|
|
|
for (@categories) { |
|
76
|
0
|
|
|
|
|
|
$data->{$_} = $brain->{training_data}->{labels}->{$_}->{attributes}; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# 重心を測定 |
|
80
|
0
|
|
|
|
|
|
my $centroid; |
|
81
|
|
|
|
|
|
|
my %counter; |
|
82
|
0
|
|
|
|
|
|
my %sum; |
|
83
|
0
|
|
|
|
|
|
while ( my ( $label, $ref ) = each(%$data) ) { |
|
84
|
0
|
|
|
|
|
|
while ( my ( $attr, $score ) = each(%$ref) ) { |
|
85
|
0
|
|
|
|
|
|
$counter{$attr}++; |
|
86
|
0
|
|
|
|
|
|
$sum{$attr} += $score; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
} |
|
89
|
0
|
|
|
|
|
|
while ( my ( $key, $value ) = each(%sum) ) { |
|
90
|
0
|
|
|
|
|
|
$centroid->{$key} = $value / $counter{$key}; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# 重心からの平均距離を求める |
|
94
|
0
|
|
|
|
|
|
my @array; |
|
95
|
0
|
|
|
|
|
|
for (@categories) { |
|
96
|
0
|
|
|
|
|
|
my $p = $data->{$_}; |
|
97
|
0
|
|
|
|
|
|
my $distance = $self->_distance( $centroid, $p ); |
|
98
|
0
|
|
|
|
|
|
push( @array, $distance ); |
|
99
|
|
|
|
|
|
|
} |
|
100
|
0
|
|
|
|
|
|
my $avg = sum(@array) / int( keys %$data ); |
|
101
|
0
|
|
|
|
|
|
return $avg; |
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub _distance { |
|
105
|
0
|
|
|
0
|
|
|
my $slef = shift; |
|
106
|
0
|
|
|
|
|
|
my $arg1 = shift; |
|
107
|
0
|
|
|
|
|
|
my $arg2 = shift; |
|
108
|
0
|
|
|
|
|
|
my %hash1 = %$arg1; |
|
109
|
0
|
|
|
|
|
|
my %hash2 = %$arg2; |
|
110
|
0
|
|
|
|
|
|
my $sum; |
|
111
|
0
|
|
|
|
|
|
while ( my ( $attr, $score ) = each(%hash1) ) { |
|
112
|
0
|
|
|
|
|
|
my $d = $score; |
|
113
|
0
|
0
|
|
|
|
|
if ( my $score2 = delete $hash2{$attr} ) { |
|
114
|
0
|
|
|
|
|
|
$d = $score - $score2; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
else { |
|
117
|
|
|
|
|
|
|
} |
|
118
|
0
|
|
|
|
|
|
$sum += ( $d**2 ); |
|
119
|
|
|
|
|
|
|
} |
|
120
|
0
|
|
|
|
|
|
while ( my ( $attr, $score ) = each(%hash2) ) { |
|
121
|
0
|
|
|
|
|
|
$sum += ( ( 0 - $score )**2 ); |
|
122
|
|
|
|
|
|
|
} |
|
123
|
0
|
|
|
|
|
|
return sqrt($sum); |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _match_word_point { |
|
127
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
128
|
0
|
|
|
|
|
|
my $match = 0; |
|
129
|
0
|
|
|
|
|
|
my $no_match = 0; |
|
130
|
0
|
0
|
|
|
|
|
if ( $self->matches ) { |
|
131
|
0
|
|
|
|
|
|
for ( @{ $self->matches } ) { |
|
|
0
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
$match += $self->{word_set}->{$_}; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
} |
|
135
|
0
|
0
|
|
|
|
|
if ( $self->no_matches ) { |
|
136
|
0
|
|
|
|
|
|
for ( @{ $self->no_matches } ) { |
|
|
0
|
|
|
|
|
|
|
|
137
|
0
|
|
|
|
|
|
$no_match += $self->{word_set}->{$_}; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
} |
|
140
|
0
|
|
|
|
|
|
my $ratio = $match / ( $match + $no_match ); |
|
141
|
0
|
|
|
|
|
|
return $ratio; |
|
142
|
|
|
|
|
|
|
} |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub _entropy_point { |
|
145
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
146
|
|
|
|
|
|
|
my @scores |
|
147
|
0
|
|
|
|
|
|
= map { values %$_; } @{ $self->score(5) }; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
148
|
0
|
|
|
|
|
|
my $sum = sum(@scores); |
|
149
|
0
|
|
|
|
|
|
my $e = 0; |
|
150
|
0
|
|
|
|
|
|
my ( $p, $z ); |
|
151
|
0
|
|
|
|
|
|
for (@scores) { |
|
152
|
0
|
0
|
|
|
|
|
if ( $_ > 0 ) { |
|
153
|
0
|
|
|
|
|
|
$p = $_ / $sum; |
|
154
|
0
|
|
|
|
|
|
$z = -$p * $self->_log2($p); |
|
155
|
0
|
|
|
|
|
|
$e += $z; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
} |
|
158
|
0
|
|
|
|
|
|
my $we = 2**$e; |
|
159
|
0
|
|
|
|
|
|
my $max = int @scores; |
|
160
|
0
|
|
|
|
|
|
my $scale = 1000; |
|
161
|
0
|
|
|
|
|
|
my $ee = $self->_log2( ( $max - $we ) * $scale ); |
|
162
|
0
|
|
|
|
|
|
my $e_max = $self->_log2( $max * $scale ); |
|
163
|
0
|
|
|
|
|
|
return $ee / $e_max; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub _log2 { |
|
167
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
168
|
0
|
|
|
|
|
|
my $n = shift; |
|
169
|
0
|
|
|
|
|
|
log($n) / log(2); |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub no_matches { |
|
173
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
174
|
0
|
|
|
|
|
|
my $no_matches = $self->{no_matches}; |
|
175
|
0
|
|
|
|
|
|
return $no_matches; |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
sub matches { |
|
179
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
180
|
0
|
|
|
|
|
|
my $matches = $self->{matches}; |
|
181
|
0
|
|
|
|
|
|
return $matches; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub user_extention { |
|
185
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
186
|
0
|
|
|
|
|
|
my $user_extention = $self->{user_extention}; |
|
187
|
0
|
|
|
|
|
|
return $user_extention; |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |
|
191
|
|
|
|
|
|
|
__END__ |