| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Shirasu; |
|
2
|
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
135044
|
use strict; |
|
|
5
|
|
|
|
|
15
|
|
|
|
5
|
|
|
|
|
150
|
|
|
4
|
5
|
|
|
5
|
|
32
|
use warnings; |
|
|
5
|
|
|
|
|
12
|
|
|
|
5
|
|
|
|
|
139
|
|
|
5
|
5
|
|
|
5
|
|
510
|
use utf8; |
|
|
5
|
|
|
|
|
23
|
|
|
|
5
|
|
|
|
|
35
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
144
|
use Exporter 'import'; |
|
|
5
|
|
|
|
|
18
|
|
|
|
5
|
|
|
|
|
157
|
|
|
8
|
5
|
|
|
5
|
|
7269
|
use Text::MeCab; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Carp 'croak'; |
|
10
|
|
|
|
|
|
|
use Text::Shirasu::Node; |
|
11
|
|
|
|
|
|
|
use Text::Shirasu::Tree; |
|
12
|
|
|
|
|
|
|
use Lingua::JA::NormalizeText; |
|
13
|
|
|
|
|
|
|
use Encode qw/encode_utf8 decode_utf8/; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our $VERSION = "0.0.4";

# Everything Lingua::JA::NormalizeText can export is re-exportable from this
# package, plus the two normalizers implemented in this file.
our @EXPORT_OK = (@Lingua::JA::NormalizeText::EXPORT_OK, qw/normalize_hyphen normalize_symbols/);

# Alias each Lingua::JA::NormalizeText function into this package under the
# same name. The aliases are generated from a single list of names so that an
# alias can never point at a differently named function (previously
# wave2tilde was wired to wave2long by mistake).
{
    my @normalizer_names = qw/
        nfkc nfkd nfc nfd
        decode_entities
        alnum_z2h alnum_h2z
        space_z2h space_h2z
        katakana_z2h katakana_h2z
        katakana2hiragana hiragana2katakana
        dakuon_normalize handakuon_normalize all_dakuon_normalize
        square2katakana circled2kana circled2kanji
        strip_html
        wave2tilde tilde2wave wavetilde2long wave2long tilde2long
        fullminus2long dashes2long drawing_lines2long
        unify_long_repeats unify_long_spaces unify_whitespaces
        trim ltrim rtrim
        nl2space unify_nl tab2space
        old2new_kana
        remove_controls remove_spaces remove_DFC
        old2new_kanji
        decompose_parenthesized_kanji
    /;

    no strict 'refs';    # symbolic glob assignment is the whole point here
    for my $name (@normalizer_names) {
        *{ __PACKAGE__ . "::$name" } = \&{"Lingua::JA::NormalizeText::$name"};
    }
}
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=encoding utf-8 |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 NAME |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Text::Shirasu - Text::MeCab wrapped for natural language processing |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
use utf8; |
|
72
|
|
|
|
|
|
|
use feature ':5.10'; |
|
73
|
|
|
|
|
|
|
use Text::Shirasu; |
|
74
|
|
|
|
|
|
|
my $ts = Text::Shirasu->new(cabocha => 1); # you can use Text::CaboCha |
|
75
|
|
|
|
|
|
|
my $normalize = $ts->normalize("昨日の晩御飯は「鮭のふりかけ」と「味噌汁」だけでした。"); |
|
76
|
|
|
|
|
|
|
$ts->parse($normalize); |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
for my $node (@{ $ts->nodes }) { |
|
79
|
|
|
|
|
|
|
say $node->surface; |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
say $ts->join_surface; |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
my $filter = $ts->filter(type => [qw/名詞 助動詞/], 記号 => [qw/括弧開 括弧閉/]); |
|
85
|
|
|
|
|
|
|
say $filter->join_surface; |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
for my $tree (@{ $ts->trees }) { |
|
88
|
|
|
|
|
|
|
say $tree->surface; |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Text::Shirasu is a wrapper around L<Text::MeCab>. |
|
94
|
|
|
|
|
|
|
This module makes it easy to normalize text and to filter the parsed result by part of speech. |
|
95
|
|
|
|
|
|
|
You can also use L<Text::CaboCha> by setting the cabocha option to true. |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head1 METHODS |
|
100
|
|
|
|
|
|
|
=cut |
|
101
|
|
|
|
|
|
|
=head2 new |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Text::Shirasu->new( |
|
104
|
|
|
|
|
|
|
# If you want to use cabocha |
|
105
|
|
|
|
|
|
|
cabocha => 1, |
|
106
|
|
|
|
|
|
|
# Text::MeCab arguments |
|
107
|
|
|
|
|
|
|
rcfile => $rcfile, # Also passed to Text::CaboCha under the alias mecabrc |
|
108
|
|
|
|
|
|
|
dicdir => $dicdir, # Also passed to Text::CaboCha under the alias mecab_dicdir |
|
109
|
|
|
|
|
|
|
userdic => $userdic, # Also passed to Text::CaboCha under the alias mecab_userdic |
|
110
|
|
|
|
|
|
|
lattice_level => $lattice_level, |
|
111
|
|
|
|
|
|
|
all_morphs => $all_morphs, |
|
112
|
|
|
|
|
|
|
output_format_type => $output_format_type, |
|
113
|
|
|
|
|
|
|
partial => $partial, |
|
114
|
|
|
|
|
|
|
node_format => $node_format, |
|
115
|
|
|
|
|
|
|
unk_format => $unk_format, |
|
116
|
|
|
|
|
|
|
bos_format => $bos_format, |
|
117
|
|
|
|
|
|
|
eos_format => $eos_format, |
|
118
|
|
|
|
|
|
|
input_buffer_size => $input_buffer_size, |
|
119
|
|
|
|
|
|
|
allocate_sentence => $allocate_sentence, |
|
120
|
|
|
|
|
|
|
nbest => $nbest, |
|
121
|
|
|
|
|
|
|
theta => $theta, |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Text::CaboCha arguments |
|
124
|
|
|
|
|
|
|
ne => $ne, |
|
125
|
|
|
|
|
|
|
parser_model => $parser_model_file, |
|
126
|
|
|
|
|
|
|
chunker_model => $chunker_model_file, |
|
127
|
|
|
|
|
|
|
ne_model => $ne_tagger_model_file, |
|
128
|
|
|
|
|
|
|
); |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=cut |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Constructor.
#
# Accepts either a flat key/value list or a single hash reference.
#   cabocha => true   additionally builds a Text::CaboCha instance
#                     (requires Text::CaboCha >= 0.04, croaks otherwise).
# With cabocha enabled, the options ne / parser_model / chunker_model /
# ne_model are handed to Text::CaboCha only, and rcfile / dicdir / userdic
# are handed to both Text::MeCab and Text::CaboCha (as mecabrc /
# mecab_dicdir / mecab_userdic). Every remaining option goes to
# Text::MeCab->new unchanged.
#
# Returns the blessed instance holding the mecab object, an empty node list
# and the default list of normalizers used by normalize().
sub new {
    my $class = shift;
    my %args  = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;

    my %cabocha_opts;
    my $use_cabocha = delete $args{cabocha};

    if ($use_cabocha) {
        # Load and version-check inside a scope of its own: $@ is localized
        # only while probing, so the croak below is raised after the caller's
        # $@ handling is back to normal (croaking inside a `local $@` scope
        # loses the message on perls older than 5.14).
        my $cabocha_ok = do {
            local $@;
            eval {
                require Text::CaboCha;
                Text::CaboCha->VERSION('0.04');    # dies when too old or unversioned
                1;
            };
        };
        croak("If you want to use some functions of Text::CaboCha, you need to install Text::CaboCha >= 0.04")
            unless $cabocha_ok;

        # Options understood only by Text::CaboCha: move them out of %args.
        for my $opt (qw/ne parser_model chunker_model ne_model/) {
            $cabocha_opts{$opt} = delete $args{$opt} if exists $args{$opt};
        }

        # Options shared with Text::MeCab: copy them under CaboCha's names
        # and leave the originals in %args for Text::MeCab.
        my %cabocha_name_for = (
            rcfile  => 'mecabrc',
            dicdir  => 'mecab_dicdir',
            userdic => 'mecab_userdic',
        );
        for my $opt (qw/rcfile dicdir userdic/) {
            next unless exists $args{$opt};
            $cabocha_opts{ $cabocha_name_for{$opt} } = $args{$opt};
        }
    }

    my $self = bless {
        mecab     => Text::MeCab->new(%args),
        nodes     => +[],
        normalize => +[qw/
            nfkc
            nfkd
            nfc
            nfd
            alnum_z2h
            space_z2h
            katakana_h2z
            decode_entities
            unify_nl
            unify_whitespaces
            unify_long_spaces
            trim
            old2new_kana
            old2new_kanji
            tab2space
            all_dakuon_normalize
            square2katakana
            circled2kana
            circled2kanji
            decompose_parenthesized_kanji
        /, \&normalize_hyphen, \&normalize_symbols
        ],
    } => $class;

    if ($use_cabocha) {
        $self->{trees}   = +[];
        $self->{cabocha} = Text::CaboCha->new(%cabocha_opts);
    }

    return $self;
}
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 parse |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This method wraps the parse method of Text::MeCab. |
|
200
|
|
|
|
|
|
|
The analysis result is saved in the Text::Shirasu instance as an array reference of Text::Shirasu::Node instances. |
|
201
|
|
|
|
|
|
|
If cabocha mode is enabled, calling this method also saves an array reference of Text::Shirasu::Tree instances in the Text::Shirasu instance. |
|
202
|
|
|
|
|
|
|
It returns the Text::Shirasu instance. |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
$ts->parse("このおにぎりは「母」が握ってくれたものです。"); |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# Parse a sentence with the wrapped MeCab object and store the result.
#
#   $sentence  text to analyse; croaks when undefined or empty.
#
# Every call starts from a clean slate: {nodes} is reset, and in cabocha mode
# {trees} is reset as well, so results of an earlier parse never leak into
# the current one. Nodes are collected until the node list ends or a node
# with an empty surface is reached. Returns $self.
sub parse {
    my $self     = shift;
    my $sentence = $_[0];

    # Test definedness/length rather than truth so that the sentence "0"
    # is accepted.
    croak "Sentence has not been inputted"
        unless defined $sentence && length $sentence;

    my $mt = $self->{mecab};

    # initialize
    $self->{nodes} = [];
    my $node = $mt->parse($sentence);

    # when cabocha mode
    if (exists $self->{cabocha}) {
        $self->{trees} = [];    # otherwise chunks accumulate across calls

        my $tree = $self->{cabocha}->parse_from_node($node);
        my $cid  = 0;
        for my $token (@{ $tree->tokens }) {
            # Only tokens that carry a chunk produce a tree entry.
            next unless $token->chunk;

            push @{ $self->{trees} }, bless {
                cid      => $cid++,
                link     => $token->chunk->link,
                head_pos => $token->chunk->head_pos,
                func_pos => $token->chunk->func_pos,
                score    => $token->chunk->score,
                surface  => $token->surface,
                feature  => [ split /,/, $token->feature ],
                ne       => $token->ne,
            }, 'Text::Shirasu::Tree';
        }
    }

    for (; $node; $node = $node->next) {
        # An empty surface ends the walk; a surface of "0" is an ordinary
        # token and must not be mistaken for the end.
        my $surface = $node->surface;
        last unless defined $surface && length $surface;

        push @{ $self->{nodes} }, bless {
            id      => $node->id,
            surface => $surface,
            feature => [ split /,/, $node->feature ],
            length  => $node->length,
            rlength => $node->rlength,
            rcattr  => $node->rcattr,
            lcattr  => $node->lcattr,
            stat    => $node->stat,
            isbest  => $node->isbest,
            alpha   => $node->alpha,
            beta    => $node->beta,
            prob    => $node->prob,
            wcost   => $node->wcost,
            cost    => $node->cost,
        }, 'Text::Shirasu::Node';
    }

    return $self;
}
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 normalize |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
It normalizes text using L<Lingua::JA::NormalizeText>. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
$ts->normalize("あ━ ”(*)” を〰〰 ’+1’") |
|
268
|
|
|
|
|
|
|
$ts->normalize("テキスト〰〰", qw/nfkc alnum_z2h/, \&your_create_routine) |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
It accepts a string as the first argument, and receives the Lingua::JA::NormalizeText options and subroutines after the second argument. |
|
271
|
|
|
|
|
|
|
If you do not specify any option or subroutine, the following Lingua::JA::NormalizeText options and subroutines are used by default. |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Please read the documentation of L<Lingua::JA::NormalizeText> for details on how each Lingua::JA::NormalizeText option works. |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Lingua::JA::NormalizeText options |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
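C<nfkc nfkd nfc nfd alnum_z2h space_z2h katakana_h2z decode_entities unify_nl unify_whitespaces unify_long_spaces trim old2new_kana old2new_kanji tab2space all_dakuon_normalize square2katakana circled2kana circled2kanji decompose_parenthesized_kanji> |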
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Subroutines |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
C<normalize_hyphen normalize_symbols> |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Normalize $text with Lingua::JA::NormalizeText.
#
# Any arguments after the text are passed to the normalizer's constructor;
# when none are given, the instance's default list in {normalize} is used.
# Text that is not already flagged as a character string is decoded from
# UTF-8 before normalization. Returns whatever the normalizer returns.
sub normalize {
    my ($self, $text, @routines) = @_;

    @routines = @{ $self->{normalize} } unless @routines;
    my $normalizer = Lingua::JA::NormalizeText->new(@routines);

    my $characters = utf8::is_utf8($text) ? $text : decode_utf8($text);
    return $normalizer->normalize($characters);
}
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head2 filter |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Please use after parse method execution. |
|
295
|
|
|
|
|
|
|
Filter the surface based on the features stored in the Text::Shirasu instance. |
|
296
|
|
|
|
|
|
|
Passing a part-of-speech name as a key with a list of subtypes as its value narrows the filter further to those subtypes. |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# filtering nodes only |
|
299
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞/]); |
|
300
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# filtering trees only |
|
303
|
|
|
|
|
|
|
$ts->filter(tree => 1, node => 0, type => [qw/名詞/]); |
|
304
|
|
|
|
|
|
|
$ts->filter(tree => 1, node => 0, type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# filtering nodes and trees |
|
307
|
|
|
|
|
|
|
$ts->filter(tree => 1, type => [qw/名詞/]); |
|
308
|
|
|
|
|
|
|
$ts->filter(tree => 1, type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# Keep only the parsed items whose part of speech matches.
#
# Parameters (flat list or a single hash reference):
#   type => [...]     part-of-speech names to keep (required, non-empty;
#                     a single plain string is accepted too)
#   tree => true      also filter {trees}
#   node => false     skip filtering {nodes} (nodes are filtered by default)
#   <pos> => [...]    optional subtype list applied to items whose part of
#                     speech matched <pos>
#
# Croaks with 'Query has not been inputted: "type"' when no type is given.
# Replaces {nodes} / {trees} with the filtered lists and returns $self.
sub filter {
    my $self   = shift;
    my %params = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;

    # Validate before dereferencing: a missing "type" used to die with
    # "Can't use an undefined value as an ARRAY reference" instead of the
    # intended message.
    my $type = delete $params{type};
    my @type = ref $type eq 'ARRAY' ? @{$type}
             : defined $type        ? ($type)
             :                        ();
    croak 'Query has not been inputted: "type"' unless @type;

    # create parameter as /名詞|動詞/ or /名詞/
    my $query = encode_utf8( join '|', @type );

    # Shared predicate: the first feature must match one of the types; the
    # matched type name then selects the optional subtype list for the
    # second feature.
    my $is_wanted = sub {
        my ($item) = @_;
        return unless $item->{feature}->[0] =~ /($query)/;
        return _sub_query( $item->{feature}->[1], $params{ decode_utf8($1) } );
    };

    # filtering trees
    if (delete $params{tree}) {
        $self->{trees} = [ grep { $is_wanted->($_) } @{ $self->{trees} } ];
    }

    # filtering nodes if unset "node" argument or "node => true value"
    if (!exists $params{node} || delete $params{node}) {
        $self->{nodes} = [ grep { $is_wanted->($_) } @{ $self->{nodes} } ];
    }

    return $self;
}
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=head2 join_surface |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Returns a string that combined the surfaces stored in the instance. |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$ts->join_surface |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=cut |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# Concatenate the surface of every stored node, in order, into one string.
# Croaks when the instance has no {nodes} entry at all.
sub join_surface {
    my ($self) = @_;

    exists $self->{nodes}
        or croak "Does not exist parsed nodes";

    my @surfaces = map { $_->{surface} } @{ $self->{nodes} };
    return join '', @surfaces;
}
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 nodes |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Return the array reference of the Text::Shirasu::Node instance. |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
$ts->nodes |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=cut |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# Accessor: the value stored under {nodes} (filled by parse/filter).
sub nodes {
    my ($self) = @_;
    return $self->{nodes};
}
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head2 trees |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Return the array reference of the Text::Shirasu::Tree instance. |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
$ts->trees |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# Accessor: the value stored under {trees} (only set in cabocha mode).
sub trees {
    my ($self) = @_;
    return $self->{trees};
}
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=head2 mecab |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
Return the Text::MeCab instance. |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$ts->mecab |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=cut |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# Accessor: the object stored under {mecab} by the constructor.
sub mecab {
    my ($self) = @_;
    return $self->{mecab};
}
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head2 cabocha |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
Return the Text::CaboCha instance. |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
$ts->cabocha |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=cut |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# Accessor: the object stored under {cabocha} (absent unless cabocha mode
# was requested at construction).
sub cabocha {
    my ($self) = @_;
    return $self->{cabocha};
}
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# private |
|
402
|
|
|
|
|
|
|
# Decide whether $subtype passes the optional subtype restriction.
#
#   $subtype  second feature field of a node/tree (UTF-8 bytes), may be undef
#   $query    array reference of acceptable subtypes (character strings),
#             or anything else to mean "no restriction"
#
# Returns true when there is no restriction (no array reference, or an empty
# one) or when $subtype matches any listed subtype; false otherwise.
sub _sub_query {
    my ( $subtype, $query ) = @_;

    return 1 unless ref $query eq 'ARRAY';

    # An empty list restricts nothing. It must not reach the match below:
    # an empty pattern makes Perl silently reuse the last successfully
    # matched regex instead of matching "anything".
    return 1 unless @{$query};

    # Nothing to compare against the listed subtypes.
    return 0 unless defined $subtype;

    my $judge = join '|', map { encode_utf8($_) } @{$query};

    # The (?:...) wrapper keeps the pattern non-empty even if every listed
    # subtype is the empty string.
    return $subtype =~ /(?:$judge)/ ? 1 : 0;
}
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
1; |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 SUBROUTINES |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
These subroutines perform the following substitution. |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=head2 normalize_hyphen |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
s/[˗֊‐‑‒–⁃⁻₋−]/-/g; |
|
421
|
|
|
|
|
|
|
s/[﹣－ｰ—―─━ー]/ー/g; |
|
422
|
|
|
|
|
|
|
s/[~∼∾〜〰～]//g; |
|
423
|
|
|
|
|
|
|
s/ー+/ー/g; |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head2 normalize_symbols |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
tr/｡､･｢｣/。、・「」/; |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=cut |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# Unify hyphen-like, long-vowel-like and tilde-like characters.
#
#   1. hyphen/minus variants            -> ASCII "-"
#   2. dash / box-drawing / prolonged-
#      sound variants                   -> KATAKANA-HIRAGANA PROLONGED SOUND
#                                          MARK (U+30FC)
#   3. tilde / wave-dash variants       -> removed
#   4. runs of U+30FC                   -> a single U+30FC
#
# Takes a character string and returns the normalized copy; returns undef
# for undefined input. The character classes are spelled with code points on
# purpose: several members are fullwidth/halfwidth forms that look identical
# to their ordinary counterparts and do not survive width-folding of the
# source text (the second class had degraded into an invalid range).
sub normalize_hyphen {
    my $text = shift;
    return undef unless defined $text;    # callers expect a single scalar

    # U+02D7 U+058A U+2010 U+2011 U+2012 U+2013 U+2043 U+207B U+208B U+2212
    $text =~ s/[\x{02D7}\x{058A}\x{2010}\x{2011}\x{2012}\x{2013}\x{2043}\x{207B}\x{208B}\x{2212}]/-/g;

    # U+FE63 small hyphen-minus, U+FF0D fullwidth hyphen-minus,
    # U+FF70 halfwidth prolonged sound mark, U+2014 em dash,
    # U+2015 horizontal bar, U+2500/U+2501 box drawings, U+30FC itself
    $text =~ s/[\x{FE63}\x{FF0D}\x{FF70}\x{2014}\x{2015}\x{2500}\x{2501}\x{30FC}]/\x{30FC}/g;

    # U+007E tilde, U+223C tilde operator, U+223E inverted lazy S,
    # U+301C wave dash, U+3030 wavy dash, U+FF5E fullwidth tilde
    $text =~ s/[\x{007E}\x{223C}\x{223E}\x{301C}\x{3030}\x{FF5E}]//g;

    $text =~ s/\x{30FC}+/\x{30FC}/g;

    return $text;
}
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# Convert halfwidth CJK punctuation to its fullwidth form:
#   U+FF61 -> U+3002 (ideographic full stop)
#   U+FF64 -> U+3001 (ideographic comma)
#   U+FF65 -> U+30FB (katakana middle dot)
#   U+FF62 -> U+300C (left corner bracket)
#   U+FF63 -> U+300D (right corner bracket)
#
# Takes a character string and returns the converted copy; returns undef for
# undefined input. Code points are written out because the halfwidth forms
# are easily folded into the fullwidth ones when the source text is
# re-encoded, which turns the mapping into a no-op.
sub normalize_symbols {
    my $text = shift;
    return undef unless defined $text;    # callers expect a single scalar

    $text =~ tr/\x{FF61}\x{FF64}\x{FF65}\x{FF62}\x{FF63}/\x{3002}\x{3001}\x{30FB}\x{300C}\x{300D}/;

    return $text;
}
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=head1 LICENSE |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Copyright (C) Kei Kamikawa(Code-Hex). |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
453
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=head1 AUTHOR |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Kei Kamikawa E<lt>x00.x7f@gmail.comE<gt> |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=cut |