line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Shirasu; |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
84143
|
use strict; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
107
|
|
4
|
5
|
|
|
5
|
|
15
|
use warnings; |
|
5
|
|
|
|
|
5
|
|
|
5
|
|
|
|
|
85
|
|
5
|
5
|
|
|
5
|
|
458
|
use utf8; |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
17
|
|
6
|
5
|
|
|
5
|
|
86
|
use Exporter 'import'; |
|
5
|
|
|
|
|
6
|
|
|
5
|
|
|
|
|
96
|
|
7
|
5
|
|
|
5
|
|
3436
|
use Text::MeCab; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use Carp 'croak'; |
9
|
|
|
|
|
|
|
use Text::Shirasu::Node; |
10
|
|
|
|
|
|
|
use Lingua::JA::NormalizeText; |
11
|
|
|
|
|
|
|
use Encode qw/encode_utf8 decode_utf8/; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our $VERSION = "0.0.3"; |
14
|
|
|
|
|
|
|
our @EXPORT_OK = (@Lingua::JA::NormalizeText::EXPORT_OK, qw/normalize_hyphen normalize_symbols/); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
*nfkc = \&Lingua::JA::NormalizeText::nfkc; |
17
|
|
|
|
|
|
|
*nfkd = \&Lingua::JA::NormalizeText::nfkd; |
18
|
|
|
|
|
|
|
*nfc = \&Lingua::JA::NormalizeText::nfc; |
19
|
|
|
|
|
|
|
*nfd = \&Lingua::JA::NormalizeText::nfd; |
20
|
|
|
|
|
|
|
*decode_entities = \&Lingua::JA::NormalizeText::decode_entities; |
21
|
|
|
|
|
|
|
*alnum_z2h = \&Lingua::JA::NormalizeText::alnum_z2h; |
22
|
|
|
|
|
|
|
*alnum_h2z = \&Lingua::JA::NormalizeText::alnum_h2z; |
23
|
|
|
|
|
|
|
*space_z2h = \&Lingua::JA::NormalizeText::space_z2h; |
24
|
|
|
|
|
|
|
*space_h2z = \&Lingua::JA::NormalizeText::space_h2z; |
25
|
|
|
|
|
|
|
*katakana_z2h = \&Lingua::JA::NormalizeText::katakana_z2h; |
26
|
|
|
|
|
|
|
*katakana_h2z = \&Lingua::JA::NormalizeText::katakana_h2z; |
27
|
|
|
|
|
|
|
*katakana2hiragana = \&Lingua::JA::NormalizeText::katakana2hiragana; |
28
|
|
|
|
|
|
|
*hiragana2katakana = \&Lingua::JA::NormalizeText::hiragana2katakana; |
29
|
|
|
|
|
|
|
*dakuon_normalize = \&Lingua::JA::NormalizeText::dakuon_normalize; |
30
|
|
|
|
|
|
|
*handakuon_normalize = \&Lingua::JA::NormalizeText::handakuon_normalize; |
31
|
|
|
|
|
|
|
*all_dakuon_normalize = \&Lingua::JA::NormalizeText::all_dakuon_normalize; |
32
|
|
|
|
|
|
|
*square2katakana = \&Lingua::JA::NormalizeText::square2katakana; |
33
|
|
|
|
|
|
|
*circled2kana = \&Lingua::JA::NormalizeText::circled2kana; |
34
|
|
|
|
|
|
|
*circled2kanji = \&Lingua::JA::NormalizeText::circled2kanji; |
35
|
|
|
|
|
|
|
*strip_html = \&Lingua::JA::NormalizeText::strip_html; |
36
|
|
|
|
|
|
|
*wave2tilde = \&Lingua::JA::NormalizeText::wave2long; |
37
|
|
|
|
|
|
|
*tilde2wave = \&Lingua::JA::NormalizeText::tilde2wave; |
38
|
|
|
|
|
|
|
*wavetilde2long = \&Lingua::JA::NormalizeText::wavetilde2long; |
39
|
|
|
|
|
|
|
*wave2long = \&Lingua::JA::NormalizeText::wave2long; |
40
|
|
|
|
|
|
|
*tilde2long = \&Lingua::JA::NormalizeText::tilde2long; |
41
|
|
|
|
|
|
|
*fullminus2long = \&Lingua::JA::NormalizeText::fullminus2long; |
42
|
|
|
|
|
|
|
*dashes2long = \&Lingua::JA::NormalizeText::dashes2long; |
43
|
|
|
|
|
|
|
*drawing_lines2long = \&Lingua::JA::NormalizeText::drawing_lines2long; |
44
|
|
|
|
|
|
|
*unify_long_repeats = \&Lingua::JA::NormalizeText::unify_long_repeats; |
45
|
|
|
|
|
|
|
*unify_long_spaces = \&Lingua::JA::NormalizeText::unify_long_spaces; |
46
|
|
|
|
|
|
|
*unify_whitespaces = \&Lingua::JA::NormalizeText::unify_whitespaces; |
47
|
|
|
|
|
|
|
*trim = \&Lingua::JA::NormalizeText::trim; |
48
|
|
|
|
|
|
|
*ltrim = \&Lingua::JA::NormalizeText::ltrim; |
49
|
|
|
|
|
|
|
*rtrim = \&Lingua::JA::NormalizeText::rtrim; |
50
|
|
|
|
|
|
|
*nl2space = \&Lingua::JA::NormalizeText::nl2space; |
51
|
|
|
|
|
|
|
*unify_nl = \&Lingua::JA::NormalizeText::unify_nl; |
52
|
|
|
|
|
|
|
*tab2space = \&Lingua::JA::NormalizeText::tab2space; |
53
|
|
|
|
|
|
|
*old2new_kana = \&Lingua::JA::NormalizeText::old2new_kana; |
54
|
|
|
|
|
|
|
*remove_controls = \&Lingua::JA::NormalizeText::remove_controls; |
55
|
|
|
|
|
|
|
*remove_spaces = \&Lingua::JA::NormalizeText::remove_spaces; |
56
|
|
|
|
|
|
|
*remove_DFC = \&Lingua::JA::NormalizeText::remove_DFC; |
57
|
|
|
|
|
|
|
*old2new_kanji = \&Lingua::JA::NormalizeText::old2new_kanji; |
58
|
|
|
|
|
|
|
*decompose_parenthesized_kanji |
59
|
|
|
|
|
|
|
= \&Lingua::JA::NormalizeText::decompose_parenthesized_kanji; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=encoding utf-8 |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 NAME |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Text::Shirasu - Text::MeCab wrapped for natural language processing |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 SYNOPSIS |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
use utf8; |
70
|
|
|
|
|
|
|
use feature ':5.10'; |
71
|
|
|
|
|
|
|
use Text::Shirasu; |
72
|
|
|
|
|
|
|
my $ts = Text::Shirasu->new; # this parameter same as Text::MeCab |
73
|
|
|
|
|
|
|
my $normalize = $ts->normalize("昨日の晩御飯は「鮭のふりかけ」と「味噌汁」だけでした。"); |
74
|
|
|
|
|
|
|
$ts->parse($normalize); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
for my $node (@{ $ts->nodes }) { |
77
|
|
|
|
|
|
|
say $node->surface; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
say $ts->join_surface; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
my $filter = $ts->filter(type => [qw/名詞 助動詞/], 記号 => [qw/括弧開 括弧閉/]); |
83
|
|
|
|
|
|
|
say $filter->join_surface; |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=head1 DESCRIPTION |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Text::Shirasu is wrapped L. |
88
|
|
|
|
|
|
|
This module is easy to normalize text and filter part of speech. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=cut |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub new { |
93
|
|
|
|
|
|
|
my $class = shift; |
94
|
|
|
|
|
|
|
my %args = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_; |
95
|
|
|
|
|
|
|
return bless { |
96
|
|
|
|
|
|
|
mecab => Text::MeCab->new(%args), |
97
|
|
|
|
|
|
|
nodes => +[], |
98
|
|
|
|
|
|
|
normalize => +[qw/ |
99
|
|
|
|
|
|
|
nfkc |
100
|
|
|
|
|
|
|
nfkd |
101
|
|
|
|
|
|
|
nfc |
102
|
|
|
|
|
|
|
nfd |
103
|
|
|
|
|
|
|
alnum_z2h |
104
|
|
|
|
|
|
|
space_z2h |
105
|
|
|
|
|
|
|
katakana_h2z |
106
|
|
|
|
|
|
|
decode_entities |
107
|
|
|
|
|
|
|
unify_nl |
108
|
|
|
|
|
|
|
unify_whitespaces |
109
|
|
|
|
|
|
|
unify_long_spaces |
110
|
|
|
|
|
|
|
trim |
111
|
|
|
|
|
|
|
old2new_kana |
112
|
|
|
|
|
|
|
old2new_kanji |
113
|
|
|
|
|
|
|
tab2space |
114
|
|
|
|
|
|
|
all_dakuon_normalize |
115
|
|
|
|
|
|
|
square2katakana |
116
|
|
|
|
|
|
|
circled2kana |
117
|
|
|
|
|
|
|
circled2kanji |
118
|
|
|
|
|
|
|
decompose_parenthesized_kanji |
119
|
|
|
|
|
|
|
/, \&normalize_hyphen, \&normalize_symbols |
120
|
|
|
|
|
|
|
], |
121
|
|
|
|
|
|
|
} => $class; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head1 METHODS |
125
|
|
|
|
|
|
|
=cut |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 parse |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
This method wraps the parse method of Text::MeCab. |
130
|
|
|
|
|
|
|
The analysis result is saved as Text::Shirasu::Node instance in the Text::Shirasu instance. So, It will return Text::Shirasu instance. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
$ts->parse("このおにぎりは「母」が握ってくれたものです。"); |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=cut |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub parse { |
137
|
|
|
|
|
|
|
my $self = shift; |
138
|
|
|
|
|
|
|
my $sentence = $_[0]; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
croak "Sentence has not been inputted" unless $sentence; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
my $mt = $self->{mecab}; |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# initialize |
145
|
|
|
|
|
|
|
$self->{nodes} = []; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
for (my $node = $mt->parse($sentence); $node && $node->surface; $node = $node->next) { |
148
|
|
|
|
|
|
|
push @{ $self->{nodes} }, bless { |
149
|
|
|
|
|
|
|
id => $node->id, |
150
|
|
|
|
|
|
|
surface => $node->surface, |
151
|
|
|
|
|
|
|
feature => [ split /,/, $node->feature ], |
152
|
|
|
|
|
|
|
length => $node->length, |
153
|
|
|
|
|
|
|
rlength => $node->rlength, |
154
|
|
|
|
|
|
|
rcattr => $node->rcattr, |
155
|
|
|
|
|
|
|
lcattr => $node->lcattr, |
156
|
|
|
|
|
|
|
stat => $node->stat, |
157
|
|
|
|
|
|
|
isbest => $node->isbest, |
158
|
|
|
|
|
|
|
alpha => $node->alpha, |
159
|
|
|
|
|
|
|
beta => $node->beta, |
160
|
|
|
|
|
|
|
prob => $node->prob, |
161
|
|
|
|
|
|
|
wcost => $node->wcost, |
162
|
|
|
|
|
|
|
cost => $node->cost, |
163
|
|
|
|
|
|
|
}, 'Text::Shirasu::Node'; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
return $self; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 normalize |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
It will normalize text using L. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
$ts->normalize("あ━ ”(*)” を〰〰 ’+1’") |
174
|
|
|
|
|
|
|
$ts->normalize("テキスト〰〰", qw/nfkc, alnum_z2h/, \&your_create_routine) |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
It accepts a string as the first argument, and receives the Lingua::JA::NormalizeText options and subroutines after the second argument. |
177
|
|
|
|
|
|
|
If you do not specify a subroutine to be used in normalization, use the following Lingua::JA::NormalizeText options and subroutines by default. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Please read the documentation of L for details on how each Lingua::JA::NormalizeText option works. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Lingua::JA::NormalizeText options |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
C |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Subroutines |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
C |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
sub normalize { |
192
|
|
|
|
|
|
|
my $self = shift; |
193
|
|
|
|
|
|
|
my $text = shift; |
194
|
|
|
|
|
|
|
my $normalizer = Lingua::JA::NormalizeText->new(@_ ? @_ : @{ $self->{normalize} }); |
195
|
|
|
|
|
|
|
$normalizer->normalize(utf8::is_utf8($text) ? $text : decode_utf8($text)); |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=head2 filter |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Please use after parse method execution. |
201
|
|
|
|
|
|
|
Filter the surface based on the features stored in the Text::Shirasu instance. |
202
|
|
|
|
|
|
|
Passing subtype to value with part of speech name as key allows you to more filter the string. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞/]); |
205
|
|
|
|
|
|
|
$ts->filter(type => [qw/名詞 記号/], 記号 => [qw/括弧開 括弧閉/]); |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub filter { |
210
|
|
|
|
|
|
|
my $self = shift; |
211
|
|
|
|
|
|
|
my %params = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# and search filter |
214
|
|
|
|
|
|
|
my @type = @{ delete $params{type} } |
215
|
|
|
|
|
|
|
or croak 'Query has not been inputted: "type"'; |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# create parameter as /名詞|動詞/ or /名詞/ |
218
|
|
|
|
|
|
|
my $query = encode_utf8 join '|', map { $_ } @type; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
$self->{nodes} = [ |
221
|
|
|
|
|
|
|
grep { |
222
|
|
|
|
|
|
|
$_->{feature}->[0] =~ /($query)/ |
223
|
|
|
|
|
|
|
and _sub_query( $_->{feature}->[1], $params{decode_utf8($1)} ) |
224
|
|
|
|
|
|
|
} @{ $self->{nodes} } |
225
|
|
|
|
|
|
|
]; |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
return $self; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=head2 join_surface |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
Returns a string that combined the surfaces stored in the instance. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
$ts->join_surface |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub join_surface { |
239
|
|
|
|
|
|
|
my $self = shift; |
240
|
|
|
|
|
|
|
croak "Does not exist parsed nodes" unless exists $self->{nodes}; |
241
|
|
|
|
|
|
|
return join '', map { $_->{surface} } @{ $self->{nodes} }; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=head2 nodes |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Return the array reference of the Text::Shirasu::Node instance. |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
$ts->nodes |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=cut |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub nodes { $_[0]->{nodes} } |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 mecab |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Return the Text::MeCab instance. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
$ts->mecab |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=cut |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub mecab { $_[0]->{mecab} } |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# private |
265
|
|
|
|
|
|
|
sub _sub_query { |
266
|
|
|
|
|
|
|
my ( $subtype, $query ) = @_; |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
return 1 unless ref $query eq 'ARRAY'; |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
my $judge = join '|', map { encode_utf8($_) } @$query; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
return $subtype =~ /$judge/; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
1; |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 SUBROUTINES |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
These subroutines perform the following substitution. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head2 normalize_hyphen |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
s/[˗֊‐‑‒–⁃⁻₋−]/-/g; |
284
|
|
|
|
|
|
|
s/[﹣-ー—―─━ー]/ー/g; |
285
|
|
|
|
|
|
|
s/[~∼∾〜〰~]//g; |
286
|
|
|
|
|
|
|
s/ー+/ー/g; |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=head2 normalize_symbols |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
tr/。、・「」/。、・「」/; |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=cut |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
sub normalize_hyphen { |
295
|
|
|
|
|
|
|
local $_ = shift; |
296
|
|
|
|
|
|
|
return undef unless defined $_; |
297
|
|
|
|
|
|
|
s/[˗֊‐‑‒–⁃⁻₋−]/-/g; |
298
|
|
|
|
|
|
|
s/[﹣-ー—―─━ー]/ー/g; |
299
|
|
|
|
|
|
|
s/[~∼∾〜〰~]//g; |
300
|
|
|
|
|
|
|
s/ー+/ー/g; |
301
|
|
|
|
|
|
|
$_; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
sub normalize_symbols { |
305
|
|
|
|
|
|
|
local $_ = shift; |
306
|
|
|
|
|
|
|
return undef unless defined $_; |
307
|
|
|
|
|
|
|
tr/。、・「」/。、・「」/; |
308
|
|
|
|
|
|
|
$_; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=head1 LICENSE |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
Copyright (C) Kei Kamikawa(Code-Hex). |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
316
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=head1 AUTHOR |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Kei Kamikawa Ex00.x7f@gmail.comE |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
=cut |