|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::ZH::MMSEG;  | 
| 
2
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
121361
 | 
 use strict;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
202
 | 
    | 
| 
3
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
22
 | 
 use warnings;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
107
 | 
    | 
| 
4
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
1393
 | 
 use utf8;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
    | 
| 
5
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
9405
 | 
 use Encode qw (is_utf8);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43377
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
324
 | 
    | 
| 
6
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
13646
 | 
 use encoding 'utf8';  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11596
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
    | 
| 
7
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
1897
 | 
 use List::Util qw(sum);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
378
 | 
    | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION=0.4005;  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require Exporter;  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA = qw(Exporter);  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT = qw(mmseg fmm word_freq);  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =encoding utf8  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Lingua::ZH::MMSEG Mandarin Chinese segmentation  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #!/usr/bin/perl  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     use utf8;  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     use Lingua::ZH::MMSEG;  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $zh_string="現代漢語的複合動詞可分三個結構語意關係來探討";  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @phrases = mmseg($zh_string);  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use MMSEG algorithm  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @phrases = fmm($zh_string);  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use Forward Maximum Matching algorithm  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     while (<>) {  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       chomp;  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       push @phrases, mmseg;  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } # mmseg and fmm will parse $_ automaticly  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     print $_, word_freq($_) for @phrases;  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # you can get phrase frequency by calling word_freq  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 A problem in computational analysis of Chinese text is that there are no word  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 boundaries in conventionally printed text. Since the word is such a fundamental  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 linguistic unit, it is necessary to identify words in Chinese text so that  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 higher-level analyses can be performed.   | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Lingua::ZH::MMSEG implements L  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 original developed by L. The whole module is  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 rewritten in pure Perl, and the phrase library is   | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<新酷音 forked from OpenFoundry|http://www.openfoundry.org/of/projects/436>.  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 INSTALL  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 To install this module, just type  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     cpanm Lingua::ZH::MMSEG  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 If you don't have cpanm,  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     curl -LO http://bit.ly/cpanm  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     chmod +x cpanm  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     sudo cp cpanm /usr/local/bin  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 FUNCTIONS  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 mmseg  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     @phrases = mmseg($zh_string);  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     @phrases = mmseg;   | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use $_ automatically  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C convert a mandarin Chinese string to a sequence of phrases using   | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L algorithm. If there were any  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 english containted in the input string, it simply parse the linked ascii code as  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 one phrase. For example:  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $_ = "這裡有中文Today is Wednesday.這邊又有中文 I go to school on Friday.";  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     print "$_\n" for mmseg;  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     這裡有  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     中文  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     Today is Wednesday.  | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     這邊  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     又有  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     中文  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      I go to school on Friday.  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The ascii characters are recognized by C[ -~]+/>.  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 fmm (Forward Maximum Matching)  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     @phrases = fmm($zh_string);  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     @phrases = fmm;   | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use $_ automatically  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C uses forward maximum matching (so called longest match principle) to  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 convert a mandarin Chinese string to a sequence of phrases. It uses the same  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 rule of C to deal with ascii string. The advantage of C is it has  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 lower complexity compare to C; the disadvantage is it cannot solve  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ambiguity when there is multiple way to seperate a string.  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 word_freq  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $freq = word_freq($phrase);  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $freq = word_freq;  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use $_ automatically  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C return the phrase frequency defined in L<新酷音|http://www.openfoundry.org/of/projects/436>.  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHOR  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Felix Ren-Chyan Chern (dryman) C<<  >>  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 LICENSE AND COPYRIGHT  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our %dict;  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 while (){  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   chomp;  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my ($phrase,$freq) = split;  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $dict{$phrase}=$freq;  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub word_freq {  | 
| 
132
 | 
0
 | 
  
  0
  
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
   my $string = $_[0] ? $_[0] : $_;  | 
| 
133
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $dict{$string};  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub mmseg {  | 
| 
137
 | 
1
 | 
  
 50
  
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
12
 | 
   my $string = $_[0] ? $_[0] : $_;  | 
| 
138
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
   my @phrases;  | 
| 
139
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
8
 | 
   die unless is_utf8($string);  | 
| 
140
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
   chomp ($string);  | 
| 
141
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
   for my $str (split (/([ -~]+)/, $string)) {  | 
| 
142
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     if ($str =~ /^[ -~]/) {  | 
| 
143
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       push @phrases, $str;  | 
| 
144
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       next;  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
146
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     while($str){  | 
| 
147
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
       my $word1 = &_mmseg($str);  | 
| 
148
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
       push @phrases, $word1;  | 
| 
149
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
       $str = substr $str, length $word1;  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
152
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   return @phrases;  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub fmm {  | 
| 
156
 | 
1
 | 
  
 50
  
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
14
 | 
   my $string = $_[0] ? $_[0] : $_;  | 
| 
157
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
   my @phrases;  | 
| 
158
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   die unless is_utf8($string);  | 
| 
159
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
   chomp ($string);  | 
| 
160
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
   for my $str (split (/([ -~])+/, $string)) {  | 
| 
161
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     if ($str =~ /^[ -~]/) {  | 
| 
162
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       push @phrases, $str,  | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       next;  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
165
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     while($str){  | 
| 
166
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
       for (reverse (1..(length $str))) {  | 
| 
167
 | 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85
 | 
         my $match = substr $str, 0, $_;  | 
| 
168
 | 
48
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
243
 | 
         if (defined $dict{$match} or $_==1){  | 
| 
169
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
           push @phrases, $match;  | 
| 
170
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
           $str = substr $str, $_;  | 
| 
171
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
           last;  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
176
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
   return @phrases;  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _mmseg {  | 
| 
181
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
12
 | 
   my $str = shift;  | 
| 
182
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
   my @chunk = &_findChunk($str);  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
184
 | 
7
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
21
 | 
   return $chunk[0]->{w1} if $#chunk == 0;  | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # rule 1, find max length chunks  | 
| 
187
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
   my @mlc_tmp = sort {$b->{len} <=> $a->{len}} @chunk;  | 
| 
 
 | 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
    | 
| 
188
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
   my @max_len_chunk = grep {$_->{len} == $mlc_tmp[0]->{len}} @mlc_tmp;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
    | 
| 
189
 | 
7
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
38
 | 
   return $max_len_chunk[0]->{w1} if $#max_len_chunk == 0;  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # rule 2, find max avg length chunks  | 
| 
192
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
   my @malc_tmp = sort {$b->{avglen} <=> $a->{avglen}} @max_len_chunk;  | 
| 
 
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
    | 
| 
193
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
   my @max_avglen_chunk = grep {$_->{avglen} == $malc_tmp[0]->{avglen}} @malc_tmp;  | 
| 
 
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
    | 
| 
194
 | 
3
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   return $max_avglen_chunk[0]->{w1} if $#max_avglen_chunk == 0;  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # rule 3, smallest varience  | 
| 
197
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
   for (@max_len_chunk) {  | 
| 
198
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
     my $avg = $_->{avglen};  | 
| 
199
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     my @word = ($_->{w1},$_->{w2},$_->{w3});  | 
| 
200
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     pop @word unless $word[$#word];  | 
| 
201
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my @len = map {length $_} @word;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
202
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my $varience = sqrt ((sum (map {$_**2-$avg**2} @len))/(scalar @len));  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
97
 | 
    | 
| 
203
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     $_->{varience} = $varience;  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
205
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
   my @mvc_tmp = sort {$a->{varience} <=> $b->{varience}} @max_len_chunk;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
206
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
   my @min_varience_chunk =   | 
| 
207
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     grep {abs($_->{varience} - $mvc_tmp[0]->{varience})<0.01} @mvc_tmp;  | 
| 
208
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4
 | 
   return $min_varience_chunk[0]->{w1} if $#min_varience_chunk == 0;  | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # rule 4, check length one word and choose max freq of it  | 
| 
210
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
   for (@min_varience_chunk) {  | 
| 
211
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my $freq = 0;  | 
| 
212
 | 
2
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
15
 | 
     $freq += $dict{$_->{w1}} if length $_->{w1} == 1 and defined $dict{$_->{w1}};  | 
| 
213
 | 
2
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
12
 | 
     $freq += $dict{$_->{w2}} if length $_->{w2} == 1 and defined $dict{$_->{w2}};  | 
| 
214
 | 
2
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
13
 | 
     $freq += $dict{$_->{w3}} if length $_->{w3} == 1 and defined $dict{$_->{w3}};  | 
| 
215
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     $_->{freq} = $freq;  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
217
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
   my @last = sort {$b->{freq} <=> $a->{freq}} @min_varience_chunk;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
    | 
| 
218
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
   return $last[0]->{w1};  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _findChunk{  | 
| 
225
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
10
 | 
   my $str = shift;  | 
| 
226
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
   my $index = 0;  | 
| 
227
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   my @chunk;  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @word1;  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # will fail if $str=""  | 
| 
231
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
   for (1..(length($str) - $index)){  | 
| 
232
 | 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
89
 | 
     my $substr = substr $str, $index, $_;  | 
| 
233
 | 
54
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
241
 | 
     push @word1, $substr if defined $dict{$substr} or $_==1;  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
235
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
   foreach my $w1 (@word1){  | 
| 
236
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
26
 | 
     my $l1 = length $w1;  | 
| 
237
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     my $index = $index + $l1;  | 
| 
238
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     my @word2;  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
240
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
27
 | 
     if (length($str) - $index == 0){  | 
| 
241
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
       push @chunk, {  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         w1 => $w1,  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         w2 => undef,  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         w3 => undef,  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         len => $l1,  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         avglen => $l1,  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       };  | 
| 
248
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
       next;  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
251
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     for (1..(length($str) - $index)){  | 
| 
252
 | 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
131
 | 
       my $substr = substr $str, $index, $_;  | 
| 
253
 | 
78
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
318
 | 
       push @word2, $substr if defined $dict{$substr} or $_==1;  | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
256
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
     foreach my $w2 (@word2){  | 
| 
257
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
       my $l2 = length $w2;  | 
| 
258
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
       my $index = $index + $l2;  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
260
 | 
19
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
43
 | 
       if (length($str) - $index == 0){  | 
| 
261
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         push @chunk, {  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           w1 => $w1,  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           w2 => $w2,  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           w3 => undef,  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           len => $l1+$l2,  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           avglen => ($l1+$l2)/2,  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         };  | 
| 
268
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         next;  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
271
 | 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
       for (1..(length($str) - $index)){  | 
| 
272
 | 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
156
 | 
         my $substr = substr $str, $index, $_;  | 
| 
273
 | 
101
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
485
 | 
         if (defined $dict{$substr} or $_==1){  | 
| 
274
 | 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
           my $w3 = $substr;  | 
| 
275
 | 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
           my $l3 = length $w3;  | 
| 
276
 | 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
169
 | 
           push @chunk, {  | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             w1 => $w1,  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             w2 => $w2,  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             w3 => $w3,  | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             len => $l1+$l2+$l3,  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             avglen => ($l1+$l2+$l3)/3,  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           };  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
287
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
   return @chunk;  | 
| 
288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __DATA__  |