| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Lingua::ZH::WordSegmenter; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 33019 | use warnings; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 77 |  | 
| 4 | 1 |  |  | 1 |  | 6 | use strict; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 38 |  | 
| 5 | 1 |  |  | 1 |  | 1039 | use Encode; | 
|  | 1 |  |  |  |  | 103582 |  | 
|  | 1 |  |  |  |  | 811 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | # Constructor new | 
| 8 |  |  |  |  |  |  | sub new { | 
| 9 | 0 |  |  | 0 | 1 |  | my $proto  = shift; | 
| 10 | 0 |  | 0 |  |  |  | my $class  = ref($proto) || $proto; | 
| 11 |  |  |  |  |  |  |  | 
| 12 | 0 |  |  |  |  |  | my $self = {}; | 
| 13 | 0 |  |  |  |  |  | bless($self, $class); | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | # Run initialisation code | 
| 16 | 0 |  |  |  |  |  | return $self->_init(@_); | 
| 17 |  |  |  |  |  |  | } | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | sub _init{ | 
| 20 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 21 |  |  |  |  |  |  |  | 
| 22 | 0 |  |  |  |  |  | $self->{dic_encoding} = 'gbk'; | 
| 23 | 0 |  |  |  |  |  | $self->{seperator} = ' '; | 
| 24 | 0 |  |  |  |  |  | $self->{verbose}=0; | 
| 25 |  |  |  |  |  |  |  | 
| 26 | 0 | 0 |  |  |  |  | if (@_ != 0) { # We are expecting our configuration to come as an anonymous hash | 
| 27 | 0 | 0 |  |  |  |  | if (ref $_[0] eq 'HASH') { | 
| 28 | 0 |  |  |  |  |  | my $hash=$_[0]; | 
| 29 | 0 |  |  |  |  |  | foreach my $key (keys %$hash) { | 
| 30 | 0 |  |  |  |  |  | $self->{lc($key)}=$hash->{$key}; | 
| 31 |  |  |  |  |  |  | } | 
| 32 |  |  |  |  |  |  | }else { # Using a more conventional named args | 
| 33 | 0 |  |  |  |  |  | my %args = @_; | 
| 34 | 0 |  |  |  |  |  | foreach my $key (keys %args) { | 
| 35 | 0 |  |  |  |  |  | $self->{lc($key)}=$args{$key}; | 
| 36 |  |  |  |  |  |  | } | 
| 37 |  |  |  |  |  |  | } | 
| 38 |  |  |  |  |  |  | } | 
| 39 |  |  |  |  |  |  |  | 
| 40 | 0 | 0 |  |  |  |  | if($self->{dic}){ | 
| 41 | 0 |  |  |  |  |  | my $FH; | 
| 42 | 0 | 0 |  |  |  |  | open $FH,$self->{dic} or die "Cant open file $self->{dic}, $!\n"; | 
| 43 | 0 |  |  |  |  |  | $self->_load_dic($FH); | 
| 44 | 0 |  |  |  |  |  | close $FH; | 
| 45 |  |  |  |  |  |  | }else{ | 
| 46 | 0 |  |  |  |  |  | $self->_load_dic(\*DATA); | 
| 47 |  |  |  |  |  |  | } | 
| 48 |  |  |  |  |  |  |  | 
| 49 | 0 |  |  |  |  |  | return $self; | 
| 50 |  |  |  |  |  |  | } | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | sub _dump_dic{ | 
| 53 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 54 | 0 |  |  |  |  |  | print "$_:$self->{headchar_maxlen}->{$_}:$self->{headchar_str}->{$_}\n" foreach keys %{$self->{headchar_maxlen}}; | 
|  | 0 |  |  |  |  |  |  | 
| 55 | 0 |  |  |  |  |  | print "$_:$self->{word_freq}->{$_}\n" foreach keys %{$self->{word_freq}}; | 
|  | 0 |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | sub _load_dic{ | 
| 59 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 60 | 0 |  |  |  |  |  | my $FH=shift; | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 | 0 |  |  |  |  |  | $self->{word_freq}={}; | 
| 64 | 0 |  |  |  |  |  | $self->{headchar_maxlen}={}; | 
| 65 |  |  |  |  |  |  |  | 
| 66 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 67 | 0 |  |  |  |  |  | print "loading dic ...\n"; | 
| 68 |  |  |  |  |  |  | } | 
| 69 |  |  |  |  |  |  |  | 
| 70 | 0 |  |  |  |  |  | while(my $line = <$FH>){ | 
| 71 | 0 |  |  |  |  |  | chomp $line; | 
| 72 | 0 |  |  |  |  |  | $line = decode($self->{dic_encoding},$line); | 
| 73 |  |  |  |  |  |  |  | 
| 74 | 0 |  |  |  |  |  | my ($word,$freq) = split(/\s+/,$line); | 
| 75 | 0 |  |  |  |  |  | my $len=length($word); | 
| 76 | 0 |  |  |  |  |  | $self->{word_freq}->{$word}=$freq; | 
| 77 |  |  |  |  |  |  |  | 
| 78 | 1 | 0 |  | 1 |  | 1124 | if($word =~ m!^(\p{Han})!){ | 
|  | 1 |  |  |  |  | 11 |  | 
|  | 1 |  |  |  |  | 14 |  | 
|  | 0 |  |  |  |  |  |  | 
| 79 | 0 |  |  |  |  |  | my $headchar=$1; | 
| 80 | 0 | 0 |  |  |  |  | if (not exists $self->{headchar_maxlen}->{$headchar}){ | 
| 81 | 0 |  |  |  |  |  | $self->{headchar_maxlen}->{$headchar} = $len; | 
| 82 | 0 |  |  |  |  |  | $self->{headchar_str}->{$headchar} = $word; | 
| 83 |  |  |  |  |  |  | }else{ | 
| 84 | 0 | 0 |  |  |  |  | if ($self->{headchar_maxlen}->{$headchar} < $len){ | 
| 85 | 0 |  |  |  |  |  | $self->{headchar_maxlen}->{$headchar} = $len; | 
| 86 | 0 |  |  |  |  |  | $self->{headchar_str}->{$headchar} = $word; | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  | } | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  | } | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | sub seg { | 
| 95 | 0 |  |  | 0 | 1 |  | my $self = shift; | 
| 96 | 0 |  |  |  |  |  | my $text = shift; | 
| 97 | 0 |  |  |  |  |  | my $encoding = shift; | 
| 98 |  |  |  |  |  |  |  | 
| 99 | 0 |  | 0 |  |  |  | $encoding ||= 'gbk'; | 
| 100 | 0 |  |  |  |  |  | $text = decode($encoding,$text); | 
| 101 |  |  |  |  |  |  |  | 
| 102 | 0 |  |  |  |  |  | my $result=""; | 
| 103 |  |  |  |  |  |  |  | 
| 104 | 0 |  |  |  |  |  | while($text){ | 
| 105 | 0 | 0 |  |  |  |  | if($text =~ s!^(\p{Han}+)!!){ | 
| 106 | 0 |  |  |  |  |  | my $han_str = $1; | 
| 107 | 0 |  |  |  |  |  | $result .= $self->_seg_zh($han_str).$self->{seperator}; | 
| 108 |  |  |  |  |  |  | }else{ | 
| 109 | 0 |  |  |  |  |  | $text =~ s!^([^\p{Han}]+)!!; | 
| 110 | 0 |  |  |  |  |  | my $str = $1; | 
| 111 | 0 |  |  |  |  |  | $str =~ s!\s+!$self->{seperator}!g; | 
| 112 | 0 |  |  |  |  |  | $result .= $str.$self->{seperator}; | 
| 113 |  |  |  |  |  |  | } | 
| 114 |  |  |  |  |  |  | } | 
| 115 | 0 |  |  |  |  |  | return $result; | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | sub _seg_zh{ | 
| 119 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 120 | 0 |  |  |  |  |  | my $input = shift; | 
| 121 | 0 |  |  |  |  |  | my $result=""; | 
| 122 |  |  |  |  |  |  |  | 
| 123 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 124 | 0 |  |  |  |  |  | print "Try to segment string $input\n"; | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 | 0 |  |  |  |  |  | my $len=length($input); | 
| 128 | 0 |  |  |  |  |  | my @arctable=(); | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  |  | 
| 131 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 132 | 0 |  |  |  |  |  | print "step0, initialize the arctable\n"; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 0 |  |  |  |  |  | for(my $i=0;$i<$len;$i++){ | 
| 136 | 0 |  |  |  |  |  | for(my $j=0;$j<$len;$j++){ | 
| 137 | 0 | 0 |  |  |  |  | if($i==$j){ | 
| 138 | 0 |  |  |  |  |  | $arctable[$i][$j]=1; | 
| 139 |  |  |  |  |  |  | }else{ | 
| 140 | 0 |  |  |  |  |  | $arctable[$i][$j]=-1; | 
| 141 |  |  |  |  |  |  | } | 
| 142 |  |  |  |  |  |  | } | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  |  | 
| 146 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 147 | 0 |  |  |  |  |  | print "step1: search for all possible arcs in the input string\n"; | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  |  | 
| 150 | 0 |  |  |  |  |  | my @chars = split('',$input); | 
| 151 |  |  |  |  |  |  |  | 
| 152 | 0 |  |  |  |  |  | for(my $i=0;$i<$len;$i++){ | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | #from this position, try to find all possible words led by this character | 
| 155 | 0 |  |  |  |  |  | my $possiblelen=$self->{headchar_maxlen}->{$chars[$i]}; | 
| 156 | 0 | 0 |  |  |  |  | $possiblelen=1 if (not defined $possiblelen); | 
| 157 |  |  |  |  |  |  |  | 
| 158 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 159 | 0 |  |  |  |  |  | print "\n$chars[$i]=$possiblelen\n"; | 
| 160 |  |  |  |  |  |  | } | 
| 161 |  |  |  |  |  |  |  | 
| 162 | 0 | 0 |  |  |  |  | if(($possiblelen+$i) > ($len-1)){ | 
| 163 | 0 |  |  |  |  |  | $possiblelen=$len-$i; | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | #all possible words with 2 or more characters | 
| 167 | 0 |  |  |  |  |  | while($possiblelen>=2){ | 
| 168 | 0 |  |  |  |  |  | my $substr = substr($input,$i,$possiblelen); | 
| 169 |  |  |  |  |  |  |  | 
| 170 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 171 | 0 |  |  |  |  |  | print "s=$substr,len=$possiblelen\n"; | 
| 172 |  |  |  |  |  |  | } | 
| 173 |  |  |  |  |  |  |  | 
| 174 | 0 | 0 |  |  |  |  | if($self->{word_freq}->{$substr}){ | 
| 175 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 176 | 0 |  |  |  |  |  | print "$substr found\n"; | 
| 177 |  |  |  |  |  |  | } | 
| 178 |  |  |  |  |  |  |  | 
| 179 | 0 |  |  |  |  |  | $arctable[$i][$i+$possiblelen-1]=$self->{word_freq}->{$substr}; | 
| 180 |  |  |  |  |  |  | } | 
| 181 | 0 |  |  |  |  |  | $possiblelen--; | 
| 182 |  |  |  |  |  |  | } | 
| 183 |  |  |  |  |  |  | } | 
| 184 |  |  |  |  |  |  |  | 
| 185 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 186 | 0 |  |  |  |  |  | for(my $i=0;$i<$len;$i++){ | 
| 187 | 0 |  |  |  |  |  | for(my $j=0;$j<$len;$j++){ | 
| 188 | 0 |  |  |  |  |  | print "  ",$arctable[$i][$j]; | 
| 189 |  |  |  |  |  |  | } | 
| 190 | 0 |  |  |  |  |  | print "\n"; | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  |  | 
| 194 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 195 | 0 |  |  |  |  |  | print "step2: from the arc table, try to find the best path as segmentation\n"; | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  |  | 
| 199 | 0 |  |  |  |  |  | my @lrlabel=(); | 
| 200 | 0 |  |  |  |  |  | my @rllabel=(); | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 0 |  |  |  |  |  | for(my $k=0;$k<$len;$k++){ | 
| 203 | 0 |  |  |  |  |  | $lrlabel[$k]=0; | 
| 204 | 0 |  |  |  |  |  | $rllabel[$k]=0; | 
| 205 |  |  |  |  |  |  | } | 
| 206 |  |  |  |  |  |  |  | 
| 207 | 0 |  |  |  |  |  | my $lrfreq=0; | 
| 208 | 0 |  |  |  |  |  | my $index=0; | 
| 209 |  |  |  |  |  |  |  | 
| 210 | 0 |  |  |  |  |  | while($index<$len){ | 
| 211 | 0 |  |  |  |  |  | my $endindex=$len-1; | 
| 212 | 0 |  |  |  |  |  | my $found=0; | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 0 |  | 0 |  |  |  | while((!$found)&&($endindex>=$index)){ | 
| 215 | 0 | 0 |  |  |  |  | if($arctable[$index][$endindex]!=-1){ | 
| 216 | 0 |  |  |  |  |  | $lrfreq+=log($arctable[$index][$endindex]); | 
| 217 | 0 |  |  |  |  |  | $found=1; | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  | else{ | 
| 220 | 0 |  |  |  |  |  | $endindex--; | 
| 221 |  |  |  |  |  |  | } | 
| 222 |  |  |  |  |  |  | } | 
| 223 |  |  |  |  |  |  |  | 
| 224 | 0 |  |  |  |  |  | $lrlabel[$endindex]=1; | 
| 225 | 0 |  |  |  |  |  | $index=$endindex+1; | 
| 226 |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  |  | 
| 228 | 0 |  |  |  |  |  | my $rlfreq=0; | 
| 229 | 0 |  |  |  |  |  | $index=$len-1; | 
| 230 |  |  |  |  |  |  |  | 
| 231 | 0 |  |  |  |  |  | while($index>=0){ | 
| 232 | 0 |  |  |  |  |  | my $startindex=0; | 
| 233 | 0 |  |  |  |  |  | my $found=0; | 
| 234 | 0 |  | 0 |  |  |  | while((!$found)&&($startindex<=$index)){ | 
| 235 | 0 | 0 |  |  |  |  | if($arctable[$startindex][$index]!=-1){ | 
| 236 | 0 |  |  |  |  |  | $found=1; | 
| 237 | 0 |  |  |  |  |  | $rlfreq+=log($arctable[$startindex][$index]); | 
| 238 |  |  |  |  |  |  | } | 
| 239 |  |  |  |  |  |  | else{ | 
| 240 | 0 |  |  |  |  |  | $startindex++; | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  | } | 
| 243 |  |  |  |  |  |  |  | 
| 244 | 0 |  |  |  |  |  | $rllabel[$startindex]=1; | 
| 245 | 0 |  |  |  |  |  | $index=$startindex-1; | 
| 246 |  |  |  |  |  |  | } | 
| 247 |  |  |  |  |  |  |  | 
| 248 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 249 | 0 |  |  |  |  |  | print "Step3: create result\n"; | 
| 250 |  |  |  |  |  |  | } | 
| 251 |  |  |  |  |  |  |  | 
| 252 | 0 | 0 |  |  |  |  | if($lrfreq>$rlfreq){ | 
| 253 | 0 |  |  |  |  |  | for(my $p=0;$p<$len;$p++){ | 
| 254 | 0 |  |  |  |  |  | $result .= $chars[$p]; | 
| 255 |  |  |  |  |  |  |  | 
| 256 | 0 | 0 |  |  |  |  | if($lrlabel[$p]==1){ | 
| 257 | 0 |  |  |  |  |  | $result .= $self->{seperator}; | 
| 258 |  |  |  |  |  |  | } | 
| 259 |  |  |  |  |  |  | } | 
| 260 |  |  |  |  |  |  | }else{ | 
| 261 | 0 |  |  |  |  |  | for(my $p=0;$p<$len;$p++){ | 
| 262 | 0 | 0 |  |  |  |  | if($rllabel[$p]==1){ | 
| 263 | 0 |  |  |  |  |  | $result .= $self->{seperator}; | 
| 264 |  |  |  |  |  |  | } | 
| 265 | 0 |  |  |  |  |  | $result .= $chars[$p]; | 
| 266 |  |  |  |  |  |  | } | 
| 267 |  |  |  |  |  |  | } | 
| 268 |  |  |  |  |  |  |  | 
| 269 | 0 | 0 |  |  |  |  | if($self->{verbose}){ | 
| 270 | 0 |  |  |  |  |  | print "result=$result\n"; | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  |  | 
| 273 | 0 |  |  |  |  |  | return $result; | 
| 274 |  |  |  |  |  |  | } | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  |  | 
| 278 |  |  |  |  |  |  | =head1 NAME | 
| 279 |  |  |  |  |  |  |  | 
| 280 |  |  |  |  |  |  | Lingua::ZH::WordSegmenter - Simplified Chinese Word Segmentation | 
| 281 |  |  |  |  |  |  |  | 
| 282 |  |  |  |  |  |  | =head1 VERSION | 
| 283 |  |  |  |  |  |  |  | 
| 284 |  |  |  |  |  |  | Version 0.01 | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | =cut | 
| 287 |  |  |  |  |  |  |  | 
| 288 |  |  |  |  |  |  | our $VERSION = '0.01'; | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | use Lingua::ZH::WordSegmenter; | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | my $segmenter = Lingua::ZH::WordSegmenter->new(); | 
| 295 |  |  |  |  |  |  | print encode('gbk', $segmenter->seg($_) ); | 
| 296 |  |  |  |  |  |  |  | 
| 297 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  | This is a Perl implementation of Simplified Chinese word segmentation. | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | The segmenter searches for the longest dictionary word at each | 
| 302 |  |  |  |  |  |  | position, scanning once from the left and once from the right, and | 
| 303 |  |  |  |  |  |  | keeps the segmentation whose word-frequency product is higher. | 
| 304 |  |  |  |  |  |  |  | 
| 305 |  |  |  |  |  |  | The original program is from the CPAN module Lingua::ZH::WordSegment | 
| 306 |  |  |  |  |  |  | (http://search.cpan.org/~chenyr/). I made the following changes: 1) made | 
| 307 |  |  |  |  |  |  | the interface object oriented; 2) converted the internal strings to utf8; | 
| 308 |  |  |  |  |  |  | 3) used Sogou's dictionary (http://www.sogou.com/labs/dl/w.html) as | 
| 309 |  |  |  |  |  |  | the default dictionary. | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  | =head1 METHODS | 
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | =over 4 | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | =item $segmenter = Lingua::ZH::WordSegmenter->new(%options) | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | This method constructs a new C<Lingua::ZH::WordSegmenter> object and | 
| 319 |  |  |  |  |  |  | returns it.  Key/value pair arguments may be provided to set up the | 
| 320 |  |  |  |  |  |  | initial state.  The options may also be given as a single hash reference; | 
| 321 |  |  |  |  |  |  | keys are lower-cased on input.  The following options are recognised: | 
| 322 |  |  |  |  |  |  |  | 
| 323 |  |  |  |  |  |  | KEY            PURPOSE                       DEFAULT | 
| 324 |  |  |  |  |  |  | -----------    -------------                 -------------------- | 
| 325 |  |  |  |  |  |  | dic            filename of the dic           sogou dic | 
| 326 |  |  |  |  |  |  | dic_encoding   encoding of the dic           "gbk" | 
| 327 |  |  |  |  |  |  | seperator      string to separate words      " " | 
| 328 |  |  |  |  |  |  | verbose        show the segment process      0 | 
| 329 |  |  |  |  |  |  |  | 
| 330 |  |  |  |  |  |  | =item $segmenter->seg($input,[$encoding]) | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | Segment an input string.  The encoding of the input may be given by the | 
| 333 |  |  |  |  |  |  | optional second parameter; it defaults to "gbk". | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | The result is returned as a decoded Perl character string, not as encoded bytes; encode it before output, as shown in the SYNOPSIS. | 
| 336 |  |  |  |  |  |  |  | 
| 337 |  |  |  |  |  |  | =back | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 340 |  |  |  |  |  |  |  | 
| 341 |  |  |  |  |  |  | L<Lingua::ZH::WordSegment> | 
| 342 |  |  |  |  |  |  |  | 
| 343 |  |  |  |  |  |  | =head1 AUTHOR | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | Zhang Jun, C<<  >> | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | =head1 COPYRIGHT & LICENSE | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | Copyright 2007 Zhang Jun, all rights reserved. | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  | This program is free software; you can redistribute it and/or modify it | 
| 352 |  |  |  |  |  |  | under the same terms as Perl itself. | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | =cut | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | 1; # End of Lingua::ZH::WordSegmenter | 
| 357 |  |  |  |  |  |  |  | 
| 358 |  |  |  |  |  |  | __DATA__ |