line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::JA::Summarize; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
25567
|
use strict; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
124
|
|
4
|
3
|
|
|
3
|
|
17
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
278
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# Module version number.
our $VERSION = 0.08;

# Symbols that callers may import on request (no @EXPORT list is declared
# in the visible code, so presumably nothing is exported by default).
our @EXPORT_OK = (
    'keyword_summary',
    'file_keyword_summary',
    '%LJS_Defaults',
    '%LJS_Defaults_keywords',
);

# The ":all" tag refers to the very same array, so it covers every
# optional export listed above.
our %EXPORT_TAGS = ( all => \@EXPORT_OK );
13
|
|
|
|
|
|
|
|
14
|
3
|
|
|
3
|
|
34
|
use base qw(Exporter Class::Accessor::Fast Class::ErrorHandler); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
4311
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
use Carp; |
17
|
|
|
|
|
|
|
use Encode; |
18
|
|
|
|
|
|
|
use File::Temp qw(:POSIX); |
19
|
|
|
|
|
|
|
use Jcode; |
20
|
|
|
|
|
|
|
use Lingua::JA::Summarize::Mecab; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Build the default stop-word ("NG word") table.
#
# Returns a fresh hash reference on every call whose keys are the words
# that must never be reported as keywords; every value is 1.
sub NG () {
    my @stopwords = (
        # Punctuation that cannot be written inside qw() without warnings.
        '(', ')', '#', ',', '"', "'", '`',
        # Remaining ASCII symbols.
        qw(! $ % & * + - . / : ; < = > ? @ [ \ ] ^ _ { | } ~),
        # NOTE(review): the next three lists are multi-byte literals that
        # show up as mojibake here (presumably EUC-JP counters, numerals
        # and symbols) -- confirm against the original source encoding.
        qw(¿Í Éà ʬ »þ Æü ·î ǯ ±ß ¥É¥ë),
        qw(°ì Æó »° »Í ¸Þ Ï» ¼· Ȭ ¶å ½½ É´ Àé Ëü ²¯ Ãû),
        qw(¢¬ ¢ ¢« ¢ª ¢Í ¢Î ¡À ¡° ¡® ¡³),
        # Common English function words (duplicates are harmless).
        qw(a any the who he she i to and in you is you str this ago about and new as of for if or it have by into at on an are were was be my am your we them there their from all its),
    );

    my %ng;
    @ng{@stopwords} = (1) x @stopwords;
    return \%ng;
}
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Multiplier applied to the `default_cost` attribute when a word has no
# usable cost of its own (see _calc_weight).
sub DEFAULT_COST_FACTOR () { 2_000 }
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Built-in attribute defaults for new summarizer objects.
my %Defaults = (
    alnum_as_word     => 1,        # _prefilter: protect ASCII alphanumeric runs
    concat_nouns      => 1,        # analyze: join consecutive nouns into one word
    charset           => 'euc',    # charset of input text and of reported words
    default_cost      => 1,        # scaled by DEFAULT_COST_FACTOR when cost is false
    jaascii_as_word   => 1,        # analyze: how 'UnkType' tokens are handled
    ng                => NG(),     # stop-word table (hash reference)
    mecab             => 'mecab',  # presumably the mecab command name -- used outside this file
    mecab_charset     => 'euc',    # charset of the text handed to / read from mecab
    mecab_factory     => sub {
        Lingua::JA::Summarize::Mecab->new(@_),
    },
    omit_number       => 1,        # _ng_word: reject all-digit words
    singlechar_factor => 0.5,      # weight multiplier for one-character words
    url_as_word       => 1,        # _prefilter: protect http(s) URLs as well
);

# Environment overrides: LJS_<UPPERCASED ATTRIBUTE> replaces the built-in
# default of the same name.
our %LJS_Defaults = ();
foreach my $k (keys %Defaults) {
    # Reference-valued defaults (the `ng` hash, the `mecab_factory` code
    # ref) cannot be expressed as an environment string; accepting one
    # would make the object fail later when the value is dereferenced or
    # called, so those attributes are not overridable from %ENV.
    next if ref $Defaults{$k};
    my $n = 'LJS_' . uc($k);
    $LJS_Defaults{$k} = $ENV{$n} if defined $ENV{$n};
}

# Package-wide tables used by _encode_ascii_word / _decode_ascii_word:
# placeholder code => original ASCII word, and the reverse mapping.
our %LJS_ascii_words = ();
our %LJS_encoded_words = ();

# Generate accessors for every configurable attribute plus the two
# attributes filled in by analysis.
__PACKAGE__->mk_accessors(keys %Defaults, qw(stats wordcount));
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Constructor.
#
#   $proto  - class name or an existing object (its class is reused)
#   $fields - optional hash reference of attribute overrides
#
# Attributes are layered: built-in defaults, then environment defaults,
# then the caller's fields. `wordcount` always starts at 0, even if the
# caller supplied one.
sub new {
    my ($proto, $fields) = @_;
    my $class = ref($proto) || $proto;

    my %attr = (%Defaults, %LJS_Defaults);
    %attr = (%attr, %$fields) if $fields;
    $attr{wordcount} = 0;

    return bless \%attr, $class;
}
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Built-in defaults for the arguments of keywords().
my %Defaults_keywords = (
    maxwords  => 5,
    minwords  => 0,
    threshold => 5
);

# Environment overrides: LJS_KEYWORDS_<UPPERCASED NAME> replaces the
# built-in default of the same name when it is defined.
our %LJS_Defaults_keywords = ();
for my $name (keys %Defaults_keywords) {
    my $env_value = $ENV{ 'LJS_KEYWORDS_' . uc($name) };
    next unless defined $env_value;
    $LJS_Defaults_keywords{$name} = $env_value;
}
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Return the top-ranked words of an analyzed text as a list.
#
#   $_args - optional hash reference with `maxwords`, `minwords` and
#            `threshold`; missing entries fall back to the environment
#            defaults and then to the built-in defaults.
#
# Words are ranked by descending weight, ties broken by string order.
# Once at least `minwords` words have been collected, ranking stops at the
# first word whose weight is below `threshold`; it also stops as soon as
# exactly `maxwords` words have been collected.
sub keywords {
    my ($self, $_args) = @_;
    my %args = (
        %Defaults_keywords,
        %LJS_Defaults_keywords,
        ($_args ? %$_args : ()),
    );
    my $stats = $self->{stats};

    my @ranked =
        sort { $stats->{$b}->{weight} <=> $stats->{$a}->{weight} || $a cmp $b }
        keys(%$stats);

    my @keywords;
    for my $word (@ranked) {
        if (@keywords >= $args{minwords}
            && $stats->{$word}->{weight} < $args{threshold}) {
            last;
        }
        push @keywords, $word;
        last if @keywords == $args{maxwords};
    }

    return @keywords;
}
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Read the whole of $file and analyze its contents.
#
# Croaks with "failed to open: ..." when the file cannot be opened;
# otherwise returns whatever analyze() returns.
sub analyze_file {
    my ($self, $file) = @_;

    open(my $in, '<', $file) or croak("failed to open: $file: $!");
    my $text;
    {
        # Slurp mode is confined to this block so that it cannot affect
        # any reading done later inside analyze().
        local $/;
        $text = <$in>;
    }
    close($in);

    return $self->analyze($text);
}
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# Analyze $text and fill in $self->{stats} (word => { count, cost, weight })
# and $self->{wordcount}.
#
# The text is converted to EUC, pre-filtered, normalized, written to a
# temporary file and handed to the object produced by `mecab_factory`;
# that object's getline() output is then folded into word statistics.
#
# Croaks with "already analyzed" when called twice on the same object, and
# when the temporary file cannot be written. Any exception raised while
# reading from mecab is re-thrown after the temporary file has been
# removed. Returns 1 on success.
sub analyze {
    my ($self, $text) = @_;

    croak("already analyzed") if $self->{stats};
    $self->{stats} = {};

    # adjust text
    Jcode::convert(\$text, 'euc', $self->charset) if $self->charset ne 'euc';
    $text = $self->_prefilter($text);
    $text =~ s/\s*\n\s*/\n/sg;
    $text .= "\n";
    $text = _normalize_japanese($text);
    Jcode::convert(\$text, $self->mecab_charset, 'euc')
        if $self->mecab_charset ne 'euc';

    # write text to temporary file; a short write would otherwise make
    # mecab silently analyze truncated text, so both calls are checked
    my ($fh, $tempfile) = tmpnam();
    my $written = print {$fh} $text;
    $written = close($fh) && $written;
    unless ($written) {
        my $err = "$!";
        unlink($tempfile);
        croak("failed to write temporary file: $tempfile: $err");
    }

    # accumulator for a run of tokens that are being joined into one word
    my $longword = {
        text  => '',
        cost  => 0,
        count => 0,
    };
    my $add_longword = sub {
        # length(), not truthiness: a lone "0" is a legitimate word when
        # omit_number is switched off
        if (length $longword->{text}) {
            $self->_add_word(
                $longword->{text},
                $longword->{cost} / (log($longword->{count}) * 0.7 + 1));
        }
        $longword->{text} = '';
        $longword->{cost} = 0;
        $longword->{count} = 0;
    };

    # declared outside the eval so the mecab object lives as long as it
    # did before (until this sub returns)
    my $mecab;
    my $ok = eval {
        # open mecab
        $mecab = $self->mecab_factory->($self, $tempfile);

        # read from mecab
        while (my $line = $mecab->getline) {
            chomp $line;
            Jcode::convert(\$line, 'euc', $self->mecab_charset)
                if $self->mecab_charset ne 'euc';

            # lines without a tab are not token lines: they end the run
            if ($line !~ /\t/) {
                $add_longword->();
                next;
            }

            # four tab-separated fields; $pn is deliberately unused
            my ($word, $pn, $pw, $H) = split(/\t/, $line, 4);
            $word = $self->_postfilter($word);
            $word = $self->_normalize_word($word);
            if ($self->_ng_word($word)) {
                $add_longword->();
                next;
            }

            # NOTE(review): the patterns below are multi-byte literals
            # shown as mojibake (presumably EUC-JP part-of-speech names:
            # noun / non-independent or pronoun / suffix) -- confirm
            # against the original source encoding.
            if ($H =~ /^̾»ì/) {
                if ($H =~ /(Èó¼«Î©|Âå̾»ì)/) {
                    $add_longword->();
                    next;
                } elsif (! length($longword->{text}) && $H =~ /ÀÜÈø/) {
                    # ng
                    next;
                }
                if (! $self->concat_nouns && $H !~ /ÀÜÈø/) {
                    $add_longword->();
                }
            } elsif ($H eq 'UnkType') {
                # handle unknown (mostly English) words
                if ($self->jaascii_as_word) {
                    if ($word =~ /^\w/ && $longword->{text} =~ /\w$/) {
                        $add_longword->();
                    }
                } else {
                    $add_longword->();
                    $self->_add_word($word, $pw);
                    next;
                }
            } else {
                $add_longword->();
                next;
            }

            $longword->{text} .= $word;
            $longword->{cost} += $pw; # do not use $pn
            $longword->{count}++;
        }
        $add_longword->();
        1;
    };
    my $err = $@;

    # always remove the temporary file, even when reading failed
    unlink($tempfile);
    die $err unless $ok;

    # calculate tf-idf
    $self->_calc_weight;

    1;
}
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# Record one occurrence of $word with the given $cost.
#
# Words with a non-positive cost and stop words are ignored. Otherwise the
# total word count is incremented and the word (converted from EUC to the
# object's charset) is either counted again or entered with count 1.
# Only the cost of the first occurrence is stored.
sub _add_word {
    my ($self, $word, $cost) = @_;
    return if $cost <= 0;
    return if $self->_ng_word($word);

    $self->{wordcount}++;
    if ($self->charset ne 'euc') {
        Jcode::convert(\$word, $self->charset, 'euc');
    }

    my $stats = $self->{stats};
    if (my $entry = $stats->{$word}) {
        $entry->{count}++;
    } else {
        $stats->{$word} = { count => 1, cost => $cost };
    }
}
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# Compute the `weight` of every word in $self->{stats}.
#
#   weight = (count - 0.5) * cost / wordcount / 6
#
# A false cost is replaced by default_cost * DEFAULT_COST_FACTOR, and
# words that _is_singlechar() reports as one character long are further
# multiplied by `singlechar_factor`.
sub _calc_weight {
    my $self = shift;
    my $stats = $self->{stats};

    for my $word (keys %$stats) {
        my $entry = $stats->{$word};
        my $cost = $entry->{cost}
            || $self->default_cost * DEFAULT_COST_FACTOR();

        $entry->{weight} =
            ($entry->{count} - 0.5) * $cost / $self->{wordcount} / 6;
        next unless $self->_is_singlechar($word);
        $entry->{weight} *= $self->singlechar_factor;
    }
}
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# Normalize one EUC word: run Jcode's h2z conversion, map the characters
# of the first tr() list onto ASCII digits, letters and parentheses, and
# lower-case the result.
#
# NOTE(review): the first tr() list is a multi-byte literal shown as
# mojibake (presumably the full-width forms of 0-9, A-Z, a-z and the two
# parentheses) -- confirm against the original source encoding.
sub _normalize_word {
    my ($self, $word) = @_;
    my $jcode = Jcode->new($word, 'euc')->h2z;
    $jcode->tr('£°-£¹£Á-£Ú£á-£ú¡Ê¡Ë', '0-9A-Za-z()');
    return lc($jcode);
}
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# Decide whether $word must be excluded from the statistics.
#
# Returns 1 when the word is empty or all digits (only if `omit_number`
# is set), when it is a key of the `ng` table, or when it contains neither
# a word character nor a byte in the range 0x80-0xff. Returns undef
# otherwise.
sub _ng_word {
    my ($self, $word) = @_;

    my $rejected =
           ($self->omit_number && $word =~ /^\d*$/)
        || exists $self->{ng}->{$word}
        || $word !~ /[\w\x80-\xff]/;

    # An explicit undef (not an empty list) is what callers have always
    # received for acceptable words.
    return $rejected ? 1 : undef;
}
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# Protect ASCII words in $text before it is handed to mecab.
#
# When `alnum_as_word` is set, every run of two or more characters that
# starts and ends with [A-Za-z0-9_] (dots and apostrophes allowed inside)
# is passed through _encode_ascii_word(). When `url_as_word` is set too,
# http/https URLs are matched first and passed through as a whole.
# Returns the filtered text.
sub _prefilter {
    my ($self, $text) = @_;
    return $text unless $self->alnum_as_word;

    my $word_re = qr/[A-Za-z0-9_][A-Za-z0-9_.']*[A-Za-z0-9_]/;
    my $url_re  = qr!https?://[A-Za-z0-9.:_/?#~\$\-=&%]+!;
    my $target  = $self->url_as_word
        ? qr/($url_re|$word_re)/
        : qr/($word_re)/;

    $text =~ s/$target/_encode_ascii_word($1)/eg;
    return $text;
}
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Undo _prefilter for a single token coming back from mecab.
#
# A purely alphabetic token is looked up with _decode_ascii_word() when
# either `alnum_as_word` or `url_as_word` is set; any other token is
# returned unchanged.
sub _postfilter {
    my ($self, $word) = @_;

    return $word unless $word =~ /^[A-Za-z]+$/;
    return $word unless $self->alnum_as_word || $self->url_as_word;
    return _decode_ascii_word($word);
}
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# True when $word decodes to exactly one character in the object's
# charset. The charset name 'euc' is passed to Encode as 'euc-jp'.
sub _is_singlechar {
    my ($self, $word) = @_;

    my %encode_name_for = (euc => 'euc-jp');
    my $enc = $self->charset;
    $enc = $encode_name_for{$enc} if exists $encode_name_for{$enc};

    my $chars = decode($enc, $word);
    return 1 == length $chars;
}
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# Replace an ASCII word by an alphabetic placeholder code.
#
# Words of 1-25 letters or 1-25 digits are returned unchanged, unless they
# themselves look like a placeholder (qz + nine lowercase letters + q).
# Every other word is mapped to a random placeholder of that shape; the
# mapping is remembered in %LJS_encoded_words (word => code) and
# %LJS_ascii_words (code => word), so the same word always yields the same
# code and _decode_ascii_word() can reverse it.
sub _encode_ascii_word {
    my ($word) = @_;

    return $word
        if $word !~ /^qz[a-z]{9}q$/
        && $word =~ /^([A-Za-z]{1,25}|[0-9]{1,25})$/;
    return $LJS_encoded_words{$word} if $LJS_encoded_words{$word};

    my @letters = ('a' .. 'z');
    while (1) {
        my $p = 'qz';
        $p .= $letters[int rand 26] for 1 .. 9;
        $p .= 'q';

        # Retry when the random code is already assigned to another word;
        # reusing it would overwrite that word's reverse mapping and make
        # it decode to the wrong text.
        next if exists $LJS_ascii_words{$p};

        $LJS_encoded_words{$word} = $p;
        $LJS_ascii_words{$p} = $word;
        return $p;
    }
}
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# Map a placeholder code back to the ASCII word recorded for it in
# %LJS_ascii_words; anything without a (true) entry is returned as is.
sub _decode_ascii_word {
    my ($word) = @_;
    my $original = $LJS_ascii_words{$word};
    return $original ? $original : $word;
}
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Normalize sentence punctuation in EUC text.
#
# The text is scanned left to right for pairs of bytes in the range
# 0x80-0xff. Pairs listed in the table below are replaced (two of them by
# a period followed by a newline, one by a comma); every other pair and
# all remaining bytes are copied unchanged. Returns the new string.
#
# NOTE(review): the table keys and values are multi-byte literals shown as
# mojibake (presumably EUC-JP full-width period and comma variants) --
# confirm against the original source encoding.
sub _normalize_japanese {
    my ($in) = @_;
    $in = '' unless defined $in;

    my %replacement_for = (
        '¡£' => "¡£\n",
        '¡¥' => "¡£\n",
        '¡¤' => "¡¢",
    );

    # A single global substitution visits the same non-overlapping pairs
    # as repeatedly matching the remainder, without touching $` and $',
    # which slow down every regex in the program on perls before 5.20.
    (my $out = $in) =~ s{([\x80-\xff]{2})}
                        {exists $replacement_for{$1} ? $replacement_for{$1} : $1}ge;
    return $out;
}
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# Functional shortcut: analyze $text and return its keywords as a list.
#
# The same $args hash reference is given both to the constructor (as
# attribute overrides) and to keywords() (as ranking arguments).
sub keyword_summary {
    my ($text, $args) = @_;

    my $summarizer = Lingua::JA::Summarize->new($args);
    $summarizer->analyze($text);

    return $summarizer->keywords($args);
}
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# Functional shortcut: analyze the contents of $file and return its
# keywords as a list.
#
# The same $args hash reference is given both to the constructor (as
# attribute overrides) and to keywords() (as ranking arguments).
sub file_keyword_summary {
    my ($file, $args) = @_;

    my $summarizer = Lingua::JA::Summarize->new($args);
    $summarizer->analyze_file($file);

    return $summarizer->keywords($args);
}
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
1; |
341
|
|
|
|
|
|
|
__END__ |