line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::JA::Summarize::Extract::Plugin::Parser::Ngram; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
40
|
|
4
|
1
|
|
|
1
|
|
5
|
use base qw( Lingua::JA::Summarize::Extract::Plugin ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
496
|
|
5
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors(qw/ latin_gram kana_gram han_gram /); |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
sub parse { |
8
|
0
|
|
|
0
|
0
|
|
my ($self) = @_; |
9
|
0
|
|
0
|
|
|
|
my $latin_gram = $self->latin_gram || 2; |
10
|
0
|
|
0
|
|
|
|
my $kana_gram = $self->kana_gram || 3; |
11
|
0
|
|
0
|
|
|
|
my $han_gram = $self->han_gram || 2; |
12
|
|
|
|
|
|
|
|
13
|
0
|
|
|
|
|
|
my $term_list = {}; |
14
|
0
|
|
|
|
|
|
$self->_gram($term_list, 'Latin', $latin_gram); |
15
|
0
|
|
|
|
|
|
$self->_gram($term_list, 'Katakana', $kana_gram); |
16
|
0
|
|
|
|
|
|
$self->_gram($term_list, 'Han', $han_gram); |
17
|
|
|
|
|
|
|
|
18
|
0
|
|
|
|
|
|
$term_list; |
19
|
|
|
|
|
|
|
} |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub _gram { |
22
|
0
|
|
|
0
|
|
|
my($self, $list, $block, $gram) = @_; |
23
|
|
|
|
|
|
|
|
24
|
0
|
|
|
|
|
|
my $text = $self->text; |
25
|
0
|
|
|
|
|
|
while ($text =~ /(\p{$block}+)/g) { |
26
|
0
|
|
|
|
|
|
my $word = $1; |
27
|
0
|
|
|
|
|
|
my @part; |
28
|
0
|
|
|
|
|
|
for (my $i = 0;$i + $gram <= length $word;$i++) { |
29
|
0
|
|
|
|
|
|
push @part, substr $word, $i, $gram; |
30
|
|
|
|
|
|
|
} |
31
|
0
|
0
|
|
|
|
|
$list->{join ' ', @part}++ if @part; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
1; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
__END__ |