line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::ZH::MMSEG; |
2
|
3
|
|
|
3
|
|
121361
|
use strict; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
202
|
|
3
|
3
|
|
|
3
|
|
22
|
use warnings; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
107
|
|
4
|
3
|
|
|
3
|
|
1393
|
use utf8; |
|
3
|
|
|
|
|
18
|
|
|
3
|
|
|
|
|
23
|
|
5
|
3
|
|
|
3
|
|
9405
|
use Encode qw (is_utf8); |
|
3
|
|
|
|
|
43377
|
|
|
3
|
|
|
|
|
324
|
|
6
|
3
|
|
|
3
|
|
13646
|
use encoding 'utf8'; |
|
3
|
|
|
|
|
11596
|
|
|
3
|
|
|
|
|
22
|
|
7
|
3
|
|
|
3
|
|
1897
|
use List::Util qw(sum); |
|
3
|
|
|
|
|
43
|
|
|
3
|
|
|
|
|
378
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION=0.4005; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
require Exporter; |
12
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
13
|
|
|
|
|
|
|
our @EXPORT = qw(mmseg fmm word_freq); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=encoding utf8 |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Lingua::ZH::MMSEG Mandarin Chinese segmentation |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
#!/usr/bin/perl |
24
|
|
|
|
|
|
|
use utf8; |
25
|
|
|
|
|
|
|
use Lingua::ZH::MMSEG; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $zh_string="現代漢語的複合動詞可分三個結構語意關係來探討"; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my @phrases = mmseg($zh_string); |
30
|
|
|
|
|
|
|
# use MMSEG algorithm |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
my @phrases = fmm($zh_string); |
33
|
|
|
|
|
|
|
# use Forward Maximum Matching algorithm |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
while (<>) { |
36
|
|
|
|
|
|
|
chomp; |
37
|
|
|
|
|
|
|
push @phrases, mmseg; |
38
|
|
|
|
|
|
|
} # mmseg and fmm will parse $_ automaticly |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
print $_, word_freq($_) for @phrases; |
41
|
|
|
|
|
|
|
# you can get phrase frequency by calling word_freq |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
A problem in computational analysis of Chinese text is that there are no word |
46
|
|
|
|
|
|
|
boundaries in conventionally printed text. Since the word is such a fundamental |
47
|
|
|
|
|
|
|
linguistic unit, it is necessary to identify words in Chinese text so that |
48
|
|
|
|
|
|
|
higher-level analyses can be performed. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
Lingua::ZH::MMSEG implements L |
51
|
|
|
|
|
|
|
original developed by L. The whole module is |
52
|
|
|
|
|
|
|
rewritten in pure Perl, and the phrase library is |
53
|
|
|
|
|
|
|
L<新酷音 forked from OpenFoundry|http://www.openfoundry.org/of/projects/436>. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head1 INSTALL |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
To install this module, just type |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
cpanm Lingua::ZH::MMSEG |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
If you don't have cpanm, |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
curl -LO http://bit.ly/cpanm |
64
|
|
|
|
|
|
|
chmod +x cpanm |
65
|
|
|
|
|
|
|
sudo cp cpanm /usr/local/bin |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 FUNCTIONS |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head2 mmseg |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
@phrases = mmseg($zh_string); |
72
|
|
|
|
|
|
|
@phrases = mmseg; |
73
|
|
|
|
|
|
|
# use $_ automatically |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
C convert a mandarin Chinese string to a sequence of phrases using |
76
|
|
|
|
|
|
|
L algorithm. If there were any |
77
|
|
|
|
|
|
|
english containted in the input string, it simply parse the linked ascii code as |
78
|
|
|
|
|
|
|
one phrase. For example: |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
$_ = "這裡有中文Today is Wednesday.這邊又有中文 I go to school on Friday."; |
81
|
|
|
|
|
|
|
print "$_\n" for mmseg; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
這裡有 |
84
|
|
|
|
|
|
|
中文 |
85
|
|
|
|
|
|
|
Today is Wednesday. |
86
|
|
|
|
|
|
|
這邊 |
87
|
|
|
|
|
|
|
又有 |
88
|
|
|
|
|
|
|
中文 |
89
|
|
|
|
|
|
|
I go to school on Friday. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
The ascii characters are recognized by C[ -~]+/>. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 fmm (Forward Maximum Matching) |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
@phrases = fmm($zh_string); |
96
|
|
|
|
|
|
|
@phrases = fmm; |
97
|
|
|
|
|
|
|
# use $_ automatically |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
C uses forward maximum matching (so called longest match principle) to |
100
|
|
|
|
|
|
|
convert a mandarin Chinese string to a sequence of phrases. It uses the same |
101
|
|
|
|
|
|
|
rule of C to deal with ascii string. The advantage of C is it has |
102
|
|
|
|
|
|
|
lower complexity compare to C; the disadvantage is it cannot solve |
103
|
|
|
|
|
|
|
ambiguity when there is multiple way to seperate a string. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head2 word_freq |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
$freq = word_freq($phrase); |
108
|
|
|
|
|
|
|
$freq = word_freq; |
109
|
|
|
|
|
|
|
# use $_ automatically |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
C return the phrase frequency defined in L<新酷音|http://www.openfoundry.org/of/projects/436>. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 AUTHOR |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Felix Ren-Chyan Chern (dryman) C<< >> |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
L |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=cut |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
our %dict; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
while (){ |
126
|
|
|
|
|
|
|
chomp; |
127
|
|
|
|
|
|
|
my ($phrase,$freq) = split; |
128
|
|
|
|
|
|
|
$dict{$phrase}=$freq; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub word_freq { |
132
|
0
|
0
|
|
0
|
1
|
0
|
my $string = $_[0] ? $_[0] : $_; |
133
|
0
|
|
|
|
|
0
|
$dict{$string}; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub mmseg { |
137
|
1
|
50
|
|
1
|
1
|
12
|
my $string = $_[0] ? $_[0] : $_; |
138
|
1
|
|
|
|
|
2
|
my @phrases; |
139
|
1
|
50
|
|
|
|
8
|
die unless is_utf8($string); |
140
|
1
|
|
|
|
|
4
|
chomp ($string); |
141
|
1
|
|
|
|
|
8
|
for my $str (split (/([ -~]+)/, $string)) { |
142
|
1
|
50
|
|
|
|
8
|
if ($str =~ /^[ -~]/) { |
143
|
0
|
|
|
|
|
0
|
push @phrases, $str; |
144
|
0
|
|
|
|
|
0
|
next; |
145
|
|
|
|
|
|
|
} |
146
|
1
|
|
|
|
|
3
|
while($str){ |
147
|
7
|
|
|
|
|
20
|
my $word1 = &_mmseg($str); |
148
|
7
|
|
|
|
|
12
|
push @phrases, $word1; |
149
|
7
|
|
|
|
|
33
|
$str = substr $str, length $word1; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
} |
152
|
1
|
|
|
|
|
9
|
return @phrases; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub fmm { |
156
|
1
|
50
|
|
1
|
1
|
14
|
my $string = $_[0] ? $_[0] : $_; |
157
|
1
|
|
|
|
|
3
|
my @phrases; |
158
|
1
|
50
|
|
|
|
9
|
die unless is_utf8($string); |
159
|
1
|
|
|
|
|
3
|
chomp ($string); |
160
|
1
|
|
|
|
|
12
|
for my $str (split (/([ -~])+/, $string)) { |
161
|
1
|
50
|
|
|
|
23
|
if ($str =~ /^[ -~]/) { |
162
|
0
|
|
|
|
|
0
|
push @phrases, $str, |
163
|
|
|
|
|
|
|
next; |
164
|
|
|
|
|
|
|
} |
165
|
1
|
|
|
|
|
6
|
while($str){ |
166
|
7
|
|
|
|
|
23
|
for (reverse (1..(length $str))) { |
167
|
48
|
|
|
|
|
85
|
my $match = substr $str, 0, $_; |
168
|
48
|
100
|
66
|
|
|
243
|
if (defined $dict{$match} or $_==1){ |
169
|
7
|
|
|
|
|
12
|
push @phrases, $match; |
170
|
7
|
|
|
|
|
16
|
$str = substr $str, $_; |
171
|
7
|
|
|
|
|
23
|
last; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
} |
176
|
1
|
|
|
|
|
10
|
return @phrases; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub _mmseg { |
181
|
7
|
|
|
7
|
|
12
|
my $str = shift; |
182
|
7
|
|
|
|
|
12
|
my @chunk = &_findChunk($str); |
183
|
|
|
|
|
|
|
|
184
|
7
|
50
|
|
|
|
21
|
return $chunk[0]->{w1} if $#chunk == 0; |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# rule 1, find max length chunks |
187
|
7
|
|
|
|
|
22
|
my @mlc_tmp = sort {$b->{len} <=> $a->{len}} @chunk; |
|
38
|
|
|
|
|
64
|
|
188
|
7
|
|
|
|
|
13
|
my @max_len_chunk = grep {$_->{len} == $mlc_tmp[0]->{len}} @mlc_tmp; |
|
31
|
|
|
|
|
61
|
|
189
|
7
|
100
|
|
|
|
38
|
return $max_len_chunk[0]->{w1} if $#max_len_chunk == 0; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# rule 2, find max avg length chunks |
192
|
3
|
|
|
|
|
7
|
my @malc_tmp = sort {$b->{avglen} <=> $a->{avglen}} @max_len_chunk; |
|
4
|
|
|
|
|
13
|
|
193
|
3
|
|
|
|
|
5
|
my @max_avglen_chunk = grep {$_->{avglen} == $malc_tmp[0]->{avglen}} @malc_tmp; |
|
7
|
|
|
|
|
19
|
|
194
|
3
|
100
|
|
|
|
19
|
return $max_avglen_chunk[0]->{w1} if $#max_avglen_chunk == 0; |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# rule 3, smallest varience |
197
|
1
|
|
|
|
|
3
|
for (@max_len_chunk) { |
198
|
2
|
|
|
|
|
11
|
my $avg = $_->{avglen}; |
199
|
2
|
|
|
|
|
7
|
my @word = ($_->{w1},$_->{w2},$_->{w3}); |
200
|
2
|
50
|
|
|
|
4
|
pop @word unless $word[$#word]; |
201
|
2
|
|
|
|
|
4
|
my @len = map {length $_} @word; |
|
6
|
|
|
|
|
10
|
|
202
|
2
|
|
|
|
|
2
|
my $varience = sqrt ((sum (map {$_**2-$avg**2} @len))/(scalar @len)); |
|
6
|
|
|
|
|
97
|
|
203
|
2
|
|
|
|
|
7
|
$_->{varience} = $varience; |
204
|
|
|
|
|
|
|
} |
205
|
1
|
|
|
|
|
3
|
my @mvc_tmp = sort {$a->{varience} <=> $b->{varience}} @max_len_chunk; |
|
1
|
|
|
|
|
3
|
|
206
|
2
|
|
|
|
|
7
|
my @min_varience_chunk = |
207
|
1
|
|
|
|
|
2
|
grep {abs($_->{varience} - $mvc_tmp[0]->{varience})<0.01} @mvc_tmp; |
208
|
1
|
50
|
|
|
|
4
|
return $min_varience_chunk[0]->{w1} if $#min_varience_chunk == 0; |
209
|
|
|
|
|
|
|
# rule 4, check length one word and choose max freq of it |
210
|
1
|
|
|
|
|
2
|
for (@min_varience_chunk) { |
211
|
2
|
|
|
|
|
4
|
my $freq = 0; |
212
|
2
|
100
|
66
|
|
|
15
|
$freq += $dict{$_->{w1}} if length $_->{w1} == 1 and defined $dict{$_->{w1}}; |
213
|
2
|
100
|
66
|
|
|
12
|
$freq += $dict{$_->{w2}} if length $_->{w2} == 1 and defined $dict{$_->{w2}}; |
214
|
2
|
50
|
33
|
|
|
13
|
$freq += $dict{$_->{w3}} if length $_->{w3} == 1 and defined $dict{$_->{w3}}; |
215
|
2
|
|
|
|
|
4
|
$_->{freq} = $freq; |
216
|
|
|
|
|
|
|
} |
217
|
1
|
|
|
|
|
3
|
my @last = sort {$b->{freq} <=> $a->{freq}} @min_varience_chunk; |
|
1
|
|
|
|
|
4
|
|
218
|
1
|
|
|
|
|
8
|
return $last[0]->{w1}; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub _findChunk{ |
225
|
7
|
|
|
7
|
|
10
|
my $str = shift; |
226
|
7
|
|
|
|
|
7
|
my $index = 0; |
227
|
7
|
|
|
|
|
9
|
my @chunk; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
my @word1; |
230
|
|
|
|
|
|
|
# will fail if $str="" |
231
|
7
|
|
|
|
|
18
|
for (1..(length($str) - $index)){ |
232
|
54
|
|
|
|
|
89
|
my $substr = substr $str, $index, $_; |
233
|
54
|
100
|
66
|
|
|
241
|
push @word1, $substr if defined $dict{$substr} or $_==1; |
234
|
|
|
|
|
|
|
} |
235
|
7
|
|
|
|
|
14
|
foreach my $w1 (@word1){ |
236
|
13
|
|
|
|
|
26
|
my $l1 = length $w1; |
237
|
13
|
|
|
|
|
16
|
my $index = $index + $l1; |
238
|
13
|
|
|
|
|
12
|
my @word2; |
239
|
|
|
|
|
|
|
|
240
|
13
|
100
|
|
|
|
27
|
if (length($str) - $index == 0){ |
241
|
1
|
|
|
|
|
5
|
push @chunk, { |
242
|
|
|
|
|
|
|
w1 => $w1, |
243
|
|
|
|
|
|
|
w2 => undef, |
244
|
|
|
|
|
|
|
w3 => undef, |
245
|
|
|
|
|
|
|
len => $l1, |
246
|
|
|
|
|
|
|
avglen => $l1, |
247
|
|
|
|
|
|
|
}; |
248
|
1
|
|
|
|
|
4
|
next; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
12
|
|
|
|
|
23
|
for (1..(length($str) - $index)){ |
252
|
78
|
|
|
|
|
131
|
my $substr = substr $str, $index, $_; |
253
|
78
|
100
|
66
|
|
|
318
|
push @word2, $substr if defined $dict{$substr} or $_==1; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
12
|
|
|
|
|
22
|
foreach my $w2 (@word2){ |
257
|
19
|
|
|
|
|
28
|
my $l2 = length $w2; |
258
|
19
|
|
|
|
|
24
|
my $index = $index + $l2; |
259
|
|
|
|
|
|
|
|
260
|
19
|
100
|
|
|
|
43
|
if (length($str) - $index == 0){ |
261
|
2
|
|
|
|
|
10
|
push @chunk, { |
262
|
|
|
|
|
|
|
w1 => $w1, |
263
|
|
|
|
|
|
|
w2 => $w2, |
264
|
|
|
|
|
|
|
w3 => undef, |
265
|
|
|
|
|
|
|
len => $l1+$l2, |
266
|
|
|
|
|
|
|
avglen => ($l1+$l2)/2, |
267
|
|
|
|
|
|
|
}; |
268
|
2
|
|
|
|
|
10
|
next; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
17
|
|
|
|
|
27
|
for (1..(length($str) - $index)){ |
272
|
101
|
|
|
|
|
156
|
my $substr = substr $str, $index, $_; |
273
|
101
|
100
|
66
|
|
|
485
|
if (defined $dict{$substr} or $_==1){ |
274
|
28
|
|
|
|
|
31
|
my $w3 = $substr; |
275
|
28
|
|
|
|
|
37
|
my $l3 = length $w3; |
276
|
28
|
|
|
|
|
169
|
push @chunk, { |
277
|
|
|
|
|
|
|
w1 => $w1, |
278
|
|
|
|
|
|
|
w2 => $w2, |
279
|
|
|
|
|
|
|
w3 => $w3, |
280
|
|
|
|
|
|
|
len => $l1+$l2+$l3, |
281
|
|
|
|
|
|
|
avglen => ($l1+$l2+$l3)/3, |
282
|
|
|
|
|
|
|
}; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
7
|
|
|
|
|
29
|
return @chunk; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
1; |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
__DATA__ |