File Coverage

blib/lib/Lingua/ZH/WordSegmenter.pm

Criterion	Covered	Total	%
statement	12	159	7.5
branch	0	56	0.0
condition	0	11	0.0
subroutine	4	10	40.0
pod	2	2	100.0
total	18	238	7.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::ZH::WordSegmenter;
2
3	1			1		33019	use warnings;
	1					3
	1					77
4	1			1		6	use strict;
	1					4
	1					38
5	1			1		1039	use Encode;
	1					103582
	1					811
6
7							# Constructor new
8							sub new {
9	0			0	1		my $proto = shift;
10	0		0				my $class = ref($proto) \|\| $proto;
11
12	0						my $self = {};
13	0						bless($self, $class);
14
15							# Run initialisation code
16	0						return $self->_init(@_);
17							}
18
19							sub _init{
20	0			0			my $self = shift;
21
22	0						$self->{dic_encoding} = 'gbk';
23	0						$self->{seperator} = ' ';
24	0						$self->{verbose}=0;
25
26	0	0					if (@_ != 0) { # We are expecting our configuration to come as an anonymous hash
27	0	0					if (ref $_[0] eq 'HASH') {
28	0						my $hash=$_[0];
29	0						foreach my $key (keys %$hash) {
30	0						$self->{lc($key)}=$hash->{$key};
31							}
32							}else { # Using a more conventional named args
33	0						my %args = @_;
34	0						foreach my $key (keys %args) {
35	0						$self->{lc($key)}=$args{$key};
36							}
37							}
38							}
39
40	0	0					if($self->{dic}){
41	0						my $FH;
42	0	0					open $FH,$self->{dic} or die "Cant open file $self->{dic}, $!\n";
43	0						$self->_load_dic($FH);
44	0						close $FH;
45							}else{
46	0						$self->_load_dic(\*DATA);
47							}
48
49	0						return $self;
50							}
51
52							sub _dump_dic{
53	0			0			my $self = shift;
54	0						print "$_:$self->{headchar_maxlen}->{$_}:$self->{headchar_str}->{$_}\n" foreach keys %{$self->{headchar_maxlen}};
	0
55	0						print "$_:$self->{word_freq}->{$_}\n" foreach keys %{$self->{word_freq}};
	0
56							}
57
58							sub _load_dic{
59	0			0			my $self = shift;
60	0						my $FH=shift;
61
62
63	0						$self->{word_freq}={};
64	0						$self->{headchar_maxlen}={};
65
66	0	0					if($self->{verbose}){
67	0						print "loading dic ...\n";
68							}
69
70	0						while(my $line = <$FH>){
71	0						chomp $line;
72	0						$line = decode($self->{dic_encoding},$line);
73
74	0						my ($word,$freq) = split(/\s+/,$line);
75	0						my $len=length($word);
76	0						$self->{word_freq}->{$word}=$freq;
77
78	1	0		1		1124	if($word =~ m!^(\p{Han})!){
	1					11
	1					14
	0
79	0						my $headchar=$1;
80	0	0					if (not exists $self->{headchar_maxlen}->{$headchar}){
81	0						$self->{headchar_maxlen}->{$headchar} = $len;
82	0						$self->{headchar_str}->{$headchar} = $word;
83							}else{
84	0	0					if ($self->{headchar_maxlen}->{$headchar} < $len){
85	0						$self->{headchar_maxlen}->{$headchar} = $len;
86	0						$self->{headchar_str}->{$headchar} = $word;
87							}
88							}
89							}
90							}
91
92							}
93
94							sub seg {
95	0			0	1		my $self = shift;
96	0						my $text = shift;
97	0						my $encoding = shift;
98
99	0		0				$encoding \|\|= 'gbk';
100	0						$text = decode($encoding,$text);
101
102	0						my $result="";
103
104	0						while($text){
105	0	0					if($text =~ s!^(\p{Han}+)!!){
106	0						my $han_str = $1;
107	0						$result .= $self->_seg_zh($han_str).$self->{seperator};
108							}else{
109	0						$text =~ s!^([^\p{Han}]+)!!;
110	0						my $str = $1;
111	0						$str =~ s!\s+!$self->{seperator}!g;
112	0						$result .= $str.$self->{seperator};
113							}
114							}
115	0						return $result;
116							}
117
118							sub _seg_zh{
119	0			0			my $self = shift;
120	0						my $input = shift;
121	0						my $result="";
122
123	0	0					if($self->{verbose}){
124	0						print "Try to segment string $input\n";
125							}
126
127	0						my $len=length($input);
128	0						my @arctable=();
129
130
131	0	0					if($self->{verbose}){
132	0						print "step0, initialize the arctable\n";
133							}
134
135	0						for(my $i=0;$i<$len;$i++){
136	0						for(my $j=0;$j<$len;$j++){
137	0	0					if($i==$j){
138	0						$arctable[$i][$j]=1;
139							}else{
140	0						$arctable[$i][$j]=-1;
141							}
142							}
143							}
144
145
146	0	0					if($self->{verbose}){
147	0						print "step1: search for all possible arcs in the input string\n";
148							}
149
150	0						my @chars = split('',$input);
151
152	0						for(my $i=0;$i<$len;$i++){
153
154							#from this position, try to find all possible words led by this character
155	0						my $possiblelen=$self->{headchar_maxlen}->{$chars[$i]};
156	0	0					$possiblelen=1 if (not defined $possiblelen);
157
158	0	0					if($self->{verbose}){
159	0						print "\n$chars[$i]=$possiblelen\n";
160							}
161
162	0	0					if(($possiblelen+$i) > ($len-1)){
163	0						$possiblelen=$len-$i;
164							}
165
166							#all possible words with more than 2 characters
167	0						while($possiblelen>=2){
168	0						my $substr = substr($input,$i,$possiblelen);
169
170	0	0					if($self->{verbose}){
171	0						print "s=$substr,len=$possiblelen\n";
172							}
173
174	0	0					if($self->{word_freq}->{$substr}){
175	0	0					if($self->{verbose}){
176	0						print "$substr found\n";
177							}
178
179	0						$arctable[$i][$i+$possiblelen-1]=$self->{word_freq}->{$substr};
180							}
181	0						$possiblelen--;
182							}
183							}
184
185	0	0					if($self->{verbose}){
186	0						for(my $i=0;$i<$len;$i++){
187	0						for(my $j=0;$j<$len;$j++){
188	0						print " ",$arctable[$i][$j];
189							}
190	0						print "\n";
191							}
192							}
193
194	0	0					if($self->{verbose}){
195	0						print "step2: from the arc table, try to find the best path as segmentation\n";
196							}
197
198
199	0						my @lrlabel=();
200	0						my @rllabel=();
201
202	0						for(my $k=0;$k<$len;$k++){
203	0						$lrlabel[$k]=0;
204	0						$rllabel[$k]=0;
205							}
206
207	0						my $lrfreq=0;
208	0						my $index=0;
209
210	0						while($index<$len){
211	0						my $endindex=$len-1;
212	0						my $found=0;
213
214	0		0				while((!$found)&&($endindex>=$index)){
215	0	0					if($arctable[$index][$endindex]!=-1){
216	0						$lrfreq+=log($arctable[$index][$endindex]);
217	0						$found=1;
218							}
219							else{
220	0						$endindex--;
221							}
222							}
223
224	0						$lrlabel[$endindex]=1;
225	0						$index=$endindex+1;
226							}
227
228	0						my $rlfreq=0;
229	0						$index=$len-1;
230
231	0						while($index>=0){
232	0						my $startindex=0;
233	0						my $found=0;
234	0		0				while((!$found)&&($startindex<=$index)){
235	0	0					if($arctable[$startindex][$index]!=-1){
236	0						$found=1;
237	0						$rlfreq+=log($arctable[$startindex][$index]);
238							}
239							else{
240	0						$startindex++;
241							}
242							}
243
244	0						$rllabel[$startindex]=1;
245	0						$index=$startindex-1;
246							}
247
248	0	0					if($self->{verbose}){
249	0						print "Step3: create result\n";
250							}
251
252	0	0					if($lrfreq>$rlfreq){
253	0						for(my $p=0;$p<$len;$p++){
254	0						$result .= $chars[$p];
255
256	0	0					if($lrlabel[$p]==1){
257	0						$result .= $self->{seperator};
258							}
259							}
260							}else{
261	0						for(my $p=0;$p<$len;$p++){
262	0	0					if($rllabel[$p]==1){
263	0						$result .= $self->{seperator};
264							}
265	0						$result .= $chars[$p];
266							}
267							}
268
269	0	0					if($self->{verbose}){
270	0						print "result=$result\n";
271							}
272
273	0						return $result;
274							}
275
276
277
278							=head1 NAME
279
280							Lingua::ZH::WordSegmenter - Simplified Chinese Word Segmentation
281
282							=head1 VERSION
283
284							Version 0.01
285
286							=cut
287
288							our $VERSION = '0.01';
289
290							=head1 SYNOPSIS
291
292							use Lingua::ZH::WordSegmenter;
293
294							my $segmenter = Lingua::ZH::WordSegmenter->new();
295							print encode('gbk', $segmenter->seg($_) );
296
297							=head1 Description
298
299							This is a perl version of simplified Chinese word segmentation.
300
301							The algorithm for this segmenter is to search the longest word at each
302							point from both left and right directions, and choose the one with
303							higher frequency product.
304
305							The original program is from the CPAN module Lingua::ZH::WordSegment
306							(http://search.cpan.org/~chenyr/) I did the follwing changes: 1) make
307							the interface object oriented; 2) make the internal string into utf8;
308							3) using sogou's dictionary (http://www.sogou.com/labs/dl/w.html) as
309							the default dictionary.
310
311
312							=head1 METHODS
313
314							=over 4
315
316							=item $segmenter = Lingua::ZH::WordSegmenter->new(%opinions)
317
318							This method constructs a new C object and
319							returns it. Key/value pair arguments may be provided to set up the
320							initial state. The following options correspond to attribute methods
321							described below:
322
323							KEY PURPOSE DEFAULT
324							----------- ------------- --------------------
325							dic filename of the dic sogou dic
326							dic_encoding encoding of the dic "gbk"
327							seperator string to seperate wrods " "
328							verbose show the segment process 0
329
330							=item $segmenter->seg($input,[$encoding])
331
332							Segment a input string, you can specify the encoding by the optional
333							parameter.
334
335							The return result is encoded in utf8 format.
336
337							=back
338
339							=head1 SEE ALSO
340
341							L
342
343							=head1 AUTHOR
344
345							Zhang Jun, C<< >>
346
347							=head1 COPYRIGHT & LICENSE
348
349							Copyright 2007 Zhang Jun, all rights reserved.
350
351							This program is free software; you can redistribute it and/or modify it
352							under the same terms as Perl itself.
353
354							=cut
355
356							1; # End of Lingua::ZH::WordSegmenter
357
358							__DATA__