line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Shirasu; |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
135044
|
use strict; |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
150
|
|
4
|
5
|
|
|
5
|
|
32
|
use warnings; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
139
|
|
5
|
5
|
|
|
5
|
|
510
|
use utf8; |
|
5
|
|
|
|
|
23
|
|
|
5
|
|
|
|
|
35
|
|
6
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
144
|
use Exporter 'import'; |
|
5
|
|
|
|
|
18
|
|
|
5
|
|
|
|
|
157
|
|
8
|
5
|
|
|
5
|
|
7269
|
use Text::MeCab; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Carp 'croak'; |
10
|
|
|
|
|
|
|
use Text::Shirasu::Node; |
11
|
|
|
|
|
|
|
use Text::Shirasu::Tree; |
12
|
|
|
|
|
|
|
use Lingua::JA::NormalizeText; |
13
|
|
|
|
|
|
|
use Encode qw/encode_utf8 decode_utf8/; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our $VERSION = "0.0.4";

# Exportable names: everything Lingua::JA::NormalizeText offers for export,
# plus the two normalizers defined in this package.
our @EXPORT_OK = (@Lingua::JA::NormalizeText::EXPORT_OK, qw/normalize_hyphen normalize_symbols/);

# Re-publish the Lingua::JA::NormalizeText routines under the same names in
# this package. Every alias must point at the routine of the identical name;
# wave2tilde previously pointed at wave2long by mistake.
*nfkc                 = \&Lingua::JA::NormalizeText::nfkc;
*nfkd                 = \&Lingua::JA::NormalizeText::nfkd;
*nfc                  = \&Lingua::JA::NormalizeText::nfc;
*nfd                  = \&Lingua::JA::NormalizeText::nfd;
*decode_entities      = \&Lingua::JA::NormalizeText::decode_entities;
*alnum_z2h            = \&Lingua::JA::NormalizeText::alnum_z2h;
*alnum_h2z            = \&Lingua::JA::NormalizeText::alnum_h2z;
*space_z2h            = \&Lingua::JA::NormalizeText::space_z2h;
*space_h2z            = \&Lingua::JA::NormalizeText::space_h2z;
*katakana_z2h         = \&Lingua::JA::NormalizeText::katakana_z2h;
*katakana_h2z         = \&Lingua::JA::NormalizeText::katakana_h2z;
*katakana2hiragana    = \&Lingua::JA::NormalizeText::katakana2hiragana;
*hiragana2katakana    = \&Lingua::JA::NormalizeText::hiragana2katakana;
*dakuon_normalize     = \&Lingua::JA::NormalizeText::dakuon_normalize;
*handakuon_normalize  = \&Lingua::JA::NormalizeText::handakuon_normalize;
*all_dakuon_normalize = \&Lingua::JA::NormalizeText::all_dakuon_normalize;
*square2katakana      = \&Lingua::JA::NormalizeText::square2katakana;
*circled2kana         = \&Lingua::JA::NormalizeText::circled2kana;
*circled2kanji        = \&Lingua::JA::NormalizeText::circled2kanji;
*strip_html           = \&Lingua::JA::NormalizeText::strip_html;
*wave2tilde           = \&Lingua::JA::NormalizeText::wave2tilde;
*tilde2wave           = \&Lingua::JA::NormalizeText::tilde2wave;
*wavetilde2long       = \&Lingua::JA::NormalizeText::wavetilde2long;
*wave2long            = \&Lingua::JA::NormalizeText::wave2long;
*tilde2long           = \&Lingua::JA::NormalizeText::tilde2long;
*fullminus2long       = \&Lingua::JA::NormalizeText::fullminus2long;
*dashes2long          = \&Lingua::JA::NormalizeText::dashes2long;
*drawing_lines2long   = \&Lingua::JA::NormalizeText::drawing_lines2long;
*unify_long_repeats   = \&Lingua::JA::NormalizeText::unify_long_repeats;
*unify_long_spaces    = \&Lingua::JA::NormalizeText::unify_long_spaces;
*unify_whitespaces    = \&Lingua::JA::NormalizeText::unify_whitespaces;
*trim                 = \&Lingua::JA::NormalizeText::trim;
*ltrim                = \&Lingua::JA::NormalizeText::ltrim;
*rtrim                = \&Lingua::JA::NormalizeText::rtrim;
*nl2space             = \&Lingua::JA::NormalizeText::nl2space;
*unify_nl             = \&Lingua::JA::NormalizeText::unify_nl;
*tab2space            = \&Lingua::JA::NormalizeText::tab2space;
*old2new_kana         = \&Lingua::JA::NormalizeText::old2new_kana;
*remove_controls      = \&Lingua::JA::NormalizeText::remove_controls;
*remove_spaces        = \&Lingua::JA::NormalizeText::remove_spaces;
*remove_DFC           = \&Lingua::JA::NormalizeText::remove_DFC;
*old2new_kanji        = \&Lingua::JA::NormalizeText::old2new_kanji;
*decompose_parenthesized_kanji
    = \&Lingua::JA::NormalizeText::decompose_parenthesized_kanji;
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=encoding utf-8 |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head1 NAME |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Text::Shirasu - Text::MeCab wrapped for natural language processing |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 SYNOPSIS |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
use utf8; |
72
|
|
|
|
|
|
|
use feature ':5.10'; |
73
|
|
|
|
|
|
|
use Text::Shirasu; |
74
|
|
|
|
|
|
|
my $ts = Text::Shirasu->new(cabocha => 1); # you can use Text::CaboCha |
75
|
|
|
|
|
|
|
my $normalize = $ts->normalize("昨日の晩御飯は「鮭のふりかけ」と「味噌汁」だけでした。"); |
76
|
|
|
|
|
|
|
$ts->parse($normalize); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
for my $node (@{ $ts->nodes }) { |
79
|
|
|
|
|
|
|
say $node->surface; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
say $ts->join_surface; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
my $filter = $ts->filter(type => [qw/名詞 助動詞/], 記号 => [qw/括弧開 括弧閉/]); |
85
|
|
|
|
|
|
|
say $filter->join_surface; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
for my $tree (@{ $ts->trees }) { |
88
|
|
|
|
|
|
|
say $tree->surface; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 DESCRIPTION |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Text::Shirasu is a wrapper around L<Text::MeCab>. |
94
|
|
|
|
|
|
|
This module makes it easy to normalize text and to filter by part of speech. |
95
|
|
|
|
|
|
|
L<Text::CaboCha> can also be used by setting the cabocha option to true. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head1 METHODS |
100
|
|
|
|
|
|
|
=cut |
101
|
|
|
|
|
|
|
=head2 new |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Text::Shirasu->new( |
104
|
|
|
|
|
|
|
# If you want to use cabocha |
105
|
|
|
|
|
|
|
cabocha => 1, |
106
|
|
|
|
|
|
|
# Text::MeCab arguments |
107
|
|
|
|
|
|
|
rcfile => $rcfile, # Also passed to Text::CaboCha under the alias mecabrc |
108
|
|
|
|
|
|
|
dicdir => $dicdir, # Also passed to Text::CaboCha under the alias mecab_dicdir |
109
|
|
|
|
|
|
|
userdic => $userdic, # Also passed to Text::CaboCha under the alias mecab_userdic |
110
|
|
|
|
|
|
|
lattice_level => $lattice_level, |
111
|
|
|
|
|
|
|
all_morphs => $all_morphs, |
112
|
|
|
|
|
|
|
output_format_type => $output_format_type, |
113
|
|
|
|
|
|
|
partial => $partial, |
114
|
|
|
|
|
|
|
node_format => $node_format, |
115
|
|
|
|
|
|
|
unk_format => $unk_format, |
116
|
|
|
|
|
|
|
bos_format => $bos_format, |
117
|
|
|
|
|
|
|
eos_format => $eos_format, |
118
|
|
|
|
|
|
|
input_buffer_size => $input_buffer_size, |
119
|
|
|
|
|
|
|
allocate_sentence => $allocate_sentence, |
120
|
|
|
|
|
|
|
nbest => $nbest, |
121
|
|
|
|
|
|
|
theta => $theta, |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Text::CaboCha arguments |
124
|
|
|
|
|
|
|
ne => $ne, |
125
|
|
|
|
|
|
|
parser_model => $parser_model_file, |
126
|
|
|
|
|
|
|
chunker_model => $chunker_model_file, |
127
|
|
|
|
|
|
|
ne_model => $ne_tagger_model_file, |
128
|
|
|
|
|
|
|
); |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=cut |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Constructor. Accepts either a hash reference or a flat key/value list.
#
#   cabocha => true   loads Text::CaboCha (>= 0.04 required, croaks otherwise)
#                     and additionally builds a Text::CaboCha instance.
#   ne, parser_model, chunker_model, ne_model
#                     moved to the Text::CaboCha constructor in cabocha mode.
#   rcfile, dicdir, userdic
#                     always handed to Text::MeCab; in cabocha mode they are
#                     also copied to Text::CaboCha as mecabrc, mecab_dicdir
#                     and mecab_userdic.
#   anything else     handed to Text::MeCab->new unchanged.
#
# Returns the blessed instance holding the MeCab object, an empty node list
# and the default list of normalizers used by normalize().
sub new {
    my ($class, @rest) = @_;
    my %args = ref $rest[0] eq 'HASH' ? %{ $rest[0] } : @rest;

    my %cabocha_opts;
    my $use_cabocha = delete $args{cabocha};

    if ($use_cabocha) {
        local $@;
        eval { require Text::CaboCha };
        croak("If you want to use some functions of Text::CaboCha, you need to install Text::CaboCha >= 0.04")
            if $@ || $Text::CaboCha::VERSION < "0.04";

        # Arguments that belong to Text::CaboCha only: take them away from
        # the Text::MeCab argument list.
        my @cabocha_only = grep { exists $args{$_} } qw/ne parser_model chunker_model ne_model/;
        @cabocha_opts{@cabocha_only} = delete @args{@cabocha_only};

        # Arguments shared with Text::MeCab: copy them under the name
        # Text::CaboCha uses, leaving the original in place.
        my %cabocha_name_for = (
            rcfile  => 'mecabrc',
            dicdir  => 'mecab_dicdir',
            userdic => 'mecab_userdic',
        );
        for my $shared (grep { exists $args{$_} } keys %cabocha_name_for) {
            $cabocha_opts{ $cabocha_name_for{$shared} } = $args{$shared};
        }
    }

    # Default normalization pipeline: Lingua::JA::NormalizeText option names
    # followed by the two routines defined in this package.
    my @default_normalizers = (
        qw/
            nfkc
            nfkd
            nfc
            nfd
            alnum_z2h
            space_z2h
            katakana_h2z
            decode_entities
            unify_nl
            unify_whitespaces
            unify_long_spaces
            trim
            old2new_kana
            old2new_kanji
            tab2space
            all_dakuon_normalize
            square2katakana
            circled2kana
            circled2kanji
            decompose_parenthesized_kanji
        /,
        \&normalize_hyphen,
        \&normalize_symbols,
    );

    my $self = bless {
        mecab     => Text::MeCab->new(%args),
        nodes     => [],
        normalize => \@default_normalizers,
    }, $class;

    if ($use_cabocha) {
        $self->{trees}   = [];
        $self->{cabocha} = Text::CaboCha->new(%cabocha_opts);
    }

    return $self;
}
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 parse |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This method wraps the parse method of Text::MeCab. |
200
|
|
|
|
|
|
|
The analysis result is saved as array reference of Text::Shirasu::Node instance in the Text::Shirasu instance. |
201
|
|
|
|
|
|
|
Also, if cabocha mode is enabled, this method saves an array reference of Text::Shirasu::Tree instances in the Text::Shirasu instance. |
202
|
|
|
|
|
|
|
It returns the Text::Shirasu instance. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
$ts->parse("このおにぎりは「母」が握ってくれたものです。"); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# Analyse a sentence and store the result in the instance.
#
#   $sentence - text to analyse; must be defined and non-empty, otherwise
#               the method croaks. The string "0" is accepted.
#
# The sentence is passed to the MeCab object's parse method. Each returned
# node, up to (not including) the first node with an empty surface, is stored
# in $self->{nodes} as a Text::Shirasu::Node. When a CaboCha object is
# present, every token that carries a chunk is stored in $self->{trees} as a
# Text::Shirasu::Tree. Both lists are rebuilt from scratch on every call.
#
# Returns the instance itself.
sub parse {
    my ($self, $sentence) = @_;

    # Test for presence, not truth: "0" is a legitimate sentence.
    croak "Sentence has not been inputted"
        unless defined $sentence && length $sentence;

    # initialize
    $self->{nodes} = [];
    my $node = $self->{mecab}->parse($sentence);

    # when cabocha mode
    if (exists $self->{cabocha}) {
        # Start over as well, so trees always describe the same sentence as
        # nodes and chunk ids (which restart at 0) stay unique.
        $self->{trees} = [];

        my $tree = $self->{cabocha}->parse_from_node($node);
        my $cid  = 0;
        for my $token (@{ $tree->tokens }) {
            my $chunk = $token->chunk or next;
            push @{ $self->{trees} }, bless {
                cid      => $cid++,
                link     => $chunk->link,
                head_pos => $chunk->head_pos,
                func_pos => $chunk->func_pos,
                score    => $chunk->score,
                surface  => $token->surface,
                feature  => [ split /,/, $token->feature ],
                ne       => $token->ne,
            }, 'Text::Shirasu::Tree';
        }
    }

    while ($node) {
        my $surface = $node->surface;

        # Stop at the node with no surface text. A length test is required
        # here: a plain truth test would also stop at a token whose surface
        # is "0" and silently drop the rest of the sentence.
        last unless defined $surface && length $surface;

        push @{ $self->{nodes} }, bless {
            id      => $node->id,
            surface => $surface,
            feature => [ split /,/, $node->feature ],
            length  => $node->length,
            rlength => $node->rlength,
            rcattr  => $node->rcattr,
            lcattr  => $node->lcattr,
            stat    => $node->stat,
            isbest  => $node->isbest,
            alpha   => $node->alpha,
            beta    => $node->beta,
            prob    => $node->prob,
            wcost   => $node->wcost,
            cost    => $node->cost,
        }, 'Text::Shirasu::Node';

        $node = $node->next;
    }

    return $self;
}
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 normalize |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
It will normalize text using L<Lingua::JA::NormalizeText>. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
$ts->normalize("あ━ ”(*)” を〰〰 ’+1’") |
268
|
|
|
|
|
|
|
$ts->normalize("テキスト〰〰", qw/nfkc alnum_z2h/, \&your_create_routine) |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
It accepts a string as the first argument, and receives the Lingua::JA::NormalizeText options and subroutines after the second argument. |
271
|
|
|
|
|
|
|
If you do not specify a subroutine to be used in normalization, use the following Lingua::JA::NormalizeText options and subroutines by default. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Please read the documentation of L<Lingua::JA::NormalizeText> for details on how each Lingua::JA::NormalizeText option works. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Lingua::JA::NormalizeText options |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
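C<nfkc nfkd nfc nfd alnum_z2h space_z2h katakana_h2z decode_entities unify_nl unify_whitespaces unify_long_spaces trim old2new_kana old2new_kanji tab2space all_dakuon_normalize square2katakana circled2kana circled2kanji decompose_parenthesized_kanji> |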
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Subroutines |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
C<normalize_hyphen normalize_symbols> |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Normalize a piece of text with Lingua::JA::NormalizeText.
#
#   $text     - text to normalize; if it is not already a character string
#               (UTF-8 flag off) it is decoded from UTF-8 first.
#   @routines - optional Lingua::JA::NormalizeText option names and/or code
#               references; when none are given the instance's default list
#               ($self->{normalize}) is used.
#
# Returns whatever the normalizer's normalize method returns.
sub normalize {
    my ($self, $text, @routines) = @_;

    @routines = @{ $self->{normalize} } unless @routines;
    my $normalizer = Lingua::JA::NormalizeText->new(@routines);

    $text = decode_utf8($text) unless utf8::is_utf8($text);

    return $normalizer->normalize($text);
}
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head2 filter |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Please use after parse method execution. |
295
|
|
|
|
|
|
|
Filter the surface based on the features stored in the Text::Shirasu instance. |
296
|
|
|
|
|
|
|
Passing a list of subtypes as the value, with the part-of-speech name as the key, lets you narrow the filter further. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# filtering nodes only |
299
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞/]); |
300
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# filtering trees only |
303
|
|
|
|
|
|
|
$ts->filter(tree => 1, node => 0, type => [qw/名詞/]); |
304
|
|
|
|
|
|
|
$ts->filter(tree => 1, node => 0, type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# filtering nodes and trees |
307
|
|
|
|
|
|
|
$ts->filter(tree => 1, type => [qw/名詞/]); |
308
|
|
|
|
|
|
|
$ts->filter(tree => 1, type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# Keep only the stored nodes and/or trees whose part of speech matches.
# Accepts either a hash reference or a flat key/value list:
#
#   type => [...]   required, non-empty array reference of part-of-speech
#                   names; croaks with 'Query has not been inputted: "type"'
#                   when it is missing, empty or not an array reference.
#   tree => true    also filter $self->{trees}.
#   node => false   do not filter $self->{nodes} (nodes are filtered when the
#                   key is absent or true).
#   <pos> => [...]  for a part of speech named in type, restrict the second
#                   feature field to the listed sub-types (see _sub_query).
#
# The type names are UTF-8 encoded, joined with "|" and matched as a regular
# expression against the first feature field.
# NOTE(review): the match is unanchored and the names are not quoted, so a
# name that is a substring of another part-of-speech name matches that one
# too -- confirm whether this is intended before tightening it.
#
# Returns the instance itself.
sub filter {
    my $self   = shift;
    my %params = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;

    # Validate before dereferencing; otherwise a missing "type" dies with a
    # strict-refs error and the message below is never reached.
    my $type = delete $params{type};
    croak 'Query has not been inputted: "type"'
        unless ref $type eq 'ARRAY' && @$type;

    my $filter_trees = delete $params{tree};
    my $filter_nodes = !exists $params{node} || delete $params{node};

    # create parameter as /noun|verb/ or /noun/
    my $query = encode_utf8 join '|', @$type;

    # Shared predicate for nodes and trees: the part of speech must match,
    # and so must the sub-type when one was requested for that part of speech.
    my $is_wanted = sub {
        my ($item) = @_;
        return unless $item->{feature}->[0] =~ /($query)/;
        return _sub_query( $item->{feature}->[1], $params{ decode_utf8($1) } );
    };

    if ($filter_trees) {
        $self->{trees} = [ grep { $is_wanted->($_) } @{ $self->{trees} || [] } ];
    }

    if ($filter_nodes) {
        $self->{nodes} = [ grep { $is_wanted->($_) } @{ $self->{nodes} || [] } ];
    }

    return $self;
}
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=head2 join_surface |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Returns a string that combined the surfaces stored in the instance. |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$ts->join_surface |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=cut |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# Concatenate the surface strings of all stored nodes, in order, with no
# separator. Croaks when the instance has no "nodes" entry at all.
sub join_surface {
    my ($self) = @_;

    exists $self->{nodes}
        or croak "Does not exist parsed nodes";

    my @surfaces = map { $_->{surface} } @{ $self->{nodes} };
    return join '', @surfaces;
}
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 nodes |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Return the array reference of the Text::Shirasu::Node instance. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
$ts->nodes |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=cut |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# Accessor: returns the value stored under the "nodes" key (the array
# reference of Text::Shirasu::Node objects filled in by parse).
sub nodes {
    my ($self) = @_;
    return $self->{nodes};
}
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head2 trees |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Return the array reference of the Text::Shirasu::Tree instance. |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
$ts->trees |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# Accessor: returns the value stored under the "trees" key (the array
# reference of Text::Shirasu::Tree objects filled in by parse in cabocha
# mode).
sub trees {
    my ($self) = @_;
    return $self->{trees};
}
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=head2 mecab |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
Return the Text::MeCab instance. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$ts->mecab |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=cut |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# Accessor: returns the object stored under the "mecab" key (created by new
# via Text::MeCab->new).
sub mecab {
    my ($self) = @_;
    return $self->{mecab};
}
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head2 cabocha |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
Return the Text::CaboCha instance. |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
$ts->cabocha |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=cut |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# Accessor: returns the object stored under the "cabocha" key (created by new
# via Text::CaboCha->new when cabocha mode was requested).
sub cabocha {
    my ($self) = @_;
    return $self->{cabocha};
}
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# private |
402
|
|
|
|
|
|
|
# Decide whether a sub-type passes an optional sub-type restriction.
#
#   $subtype - value to test (filter passes the second feature field).
#   $query   - array reference of acceptable sub-types; any other value
#              means "no restriction".
#
# The entries are UTF-8 encoded, joined with "|" and matched unanchored as a
# regular expression, so $subtype is expected to be a byte string --
# presumably what MeCab returned; confirm against the caller's data.
#
# Returns 1 when there is no restriction, otherwise the result of the match.
# An empty array reference matches everything.
sub _sub_query {
    my ( $subtype, $query ) = @_;

    return 1 unless ref $query eq 'ARRAY';

    my $judge = join '|', map { encode_utf8($_) } @$query;

    # Keep the pattern non-empty by wrapping it in a group: a pattern that
    # interpolates to the empty string makes Perl reuse the last successfully
    # matched regex instead, which here would be an unrelated pattern.
    return $subtype =~ /(?:$judge)/;
}
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
1; |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 SUBROUTINES |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
These subroutines perform the following substitution. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=head2 normalize_hyphen |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
s/[˗֊‐‑‒–⁃⁻₋−]/-/g; |
421
|
|
|
|
|
|
|
s/[﹣－ｰ—―─━ー]/ー/g; |
422
|
|
|
|
|
|
|
s/[~∼∾〜〰～]//g; |
423
|
|
|
|
|
|
|
s/ー+/ー/g; |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head2 normalize_symbols |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
tr/｡､･｢｣/。、・「」/; |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=cut |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# Unify hyphen-like, long-vowel-like and tilde-like characters.
#
#   1. hyphen/minus look-alikes            -> ASCII "-"
#   2. dash and prolonged-sound look-alikes -> U+30FC (KATAKANA-HIRAGANA
#      PROLONGED SOUND MARK)
#   3. tilde and wave-dash look-alikes      -> removed
#   4. runs of U+30FC                       -> a single U+30FC
#
# Takes one string and returns the converted copy; returns undef for an
# undefined argument. The input is expected to be a character string.
#
# Every character is written as a \x{...} escape. With literals, the
# fullwidth/halfwidth members are easily folded to their look-alikes, which
# turns the second class into an inverted range that does not compile.
# NOTE(review): U+FF0D, U+FF70 and U+FF5E are restored on the assumption that
# they were the width-folded members -- confirm against the upstream source.
sub normalize_hyphen {
    local $_ = shift;
    return undef unless defined $_;

    # U+02D7 modifier minus, U+058A Armenian hyphen, U+2010 hyphen,
    # U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash,
    # U+2043 hyphen bullet, U+207B superscript minus, U+208B subscript minus,
    # U+2212 minus sign
    s/[\x{02D7}\x{058A}\x{2010}\x{2011}\x{2012}\x{2013}\x{2043}\x{207B}\x{208B}\x{2212}]/-/g;

    # U+FE63 small hyphen-minus, U+FF0D fullwidth hyphen-minus,
    # U+FF70 halfwidth prolonged sound mark, U+2014 em dash,
    # U+2015 horizontal bar, U+2500/U+2501 box-drawing horizontals,
    # U+30FC prolonged sound mark
    s/[\x{FE63}\x{FF0D}\x{FF70}\x{2014}\x{2015}\x{2500}\x{2501}\x{30FC}]/\x{30FC}/g;

    # U+007E tilde, U+223C tilde operator, U+223E inverted lazy S,
    # U+301C wave dash, U+3030 wavy dash, U+FF5E fullwidth tilde
    s/[\x{007E}\x{223C}\x{223E}\x{301C}\x{3030}\x{FF5E}]//g;

    s/\x{30FC}+/\x{30FC}/g;

    $_;
}
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# Convert halfwidth Japanese punctuation to its fullwidth form:
#
#   U+FF61 -> U+3002 (ideographic full stop)
#   U+FF64 -> U+3001 (ideographic comma)
#   U+FF65 -> U+30FB (katakana middle dot)
#   U+FF62 -> U+300C (left corner bracket)
#   U+FF63 -> U+300D (right corner bracket)
#
# Takes one string and returns the converted copy; returns undef for an
# undefined argument. The input is expected to be a character string.
#
# The characters are written as \x{...} escapes: once the halfwidth forms
# are folded to fullwidth, a literal tr/// maps every character to itself.
# NOTE(review): the halfwidth source set is restored on that assumption --
# confirm against the upstream source.
sub normalize_symbols {
    local $_ = shift;
    return undef unless defined $_;

    tr/\x{FF61}\x{FF64}\x{FF65}\x{FF62}\x{FF63}/\x{3002}\x{3001}\x{30FB}\x{300C}\x{300D}/;

    $_;
}
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=head1 LICENSE |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Copyright (C) Kei Kamikawa(Code-Hex). |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
453
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=head1 AUTHOR |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Kei Kamikawa E<lt>x00.x7f@gmail.comE<gt> |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=cut |