|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Word::Segmenter::Chinese::Lite;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
78774
 | 
 use 5.008008;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
141
 | 
    | 
| 
4
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
17
 | 
 use strict;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
126
 | 
    | 
| 
5
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
16
 | 
 use warnings;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
115
 | 
    | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
3014
 | 
 use Encode;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52358
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
338
 | 
    | 
| 
8
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
2029
 | 
 use Word::Segmenter::Chinese::Lite::Dict qw(wscl_get_dict_default);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2640
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require Exporter;  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA     = qw(Exporter);  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT  = qw(wscl_seg wscl_set_mode);  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '0.08';  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $WSCL_MODE = 'dict';  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our %WSCL_DICT;  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub wscl_set_mode {  | 
| 
19
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
692
 | 
     my $mode = shift;  | 
| 
20
 | 
2
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
20
 | 
     if ( $mode eq 'dict' or $mode eq 'obigram' or $mode eq 'unigram' ) {  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $WSCL_MODE = $mode;  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
23
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     return 0;  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub wscl_seg {  | 
| 
27
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
  
1
  
 | 
2212
 | 
     my $str = shift;  | 
| 
28
 | 
6
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
18
 | 
     if ( $WSCL_MODE eq 'dict' ) {  | 
| 
29
 | 
4
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
16
 | 
         %WSCL_DICT = wscl_get_dict_default() unless defined $WSCL_DICT{'1'};  | 
| 
30
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
         return wscl_seg_dict($str);  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
32
 | 
2
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
25
 | 
     if ( $WSCL_MODE eq 'obigram' ) {  | 
| 
33
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
         return wscl_seg_obigram($str);  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
35
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     if ( $WSCL_MODE eq 'unigram' ) {  | 
| 
36
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         return wscl_seg_unigram($str);  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
38
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return 0;  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub wscl_seg_unigram {  | 
| 
42
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
0
  
 | 
2
 | 
     my $w = shift;  | 
| 
43
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my @r = map { $_ = encode( 'utf8', $_ ) } split //, decode( 'utf8', $w );  | 
| 
 
 | 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
261
 | 
    | 
| 
44
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
     return @r;  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub wscl_seg_obigram {  | 
| 
48
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
0
  
 | 
2
 | 
     my $w = shift;  | 
| 
49
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
     my @r;  | 
| 
50
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     for ( 0 .. length( decode( 'utf8', $w ) ) ) {  | 
| 
51
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
         my $tmp = encode( 'utf8', substr( decode( 'utf8', $w ), $_, 2 ) );  | 
| 
52
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
404
 | 
         push @r, $tmp;  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
54
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     return @r;  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub wscl_seg_dict {  | 
| 
58
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
0
  
 | 
53
 | 
     my $string = shift;  | 
| 
59
 | 
4
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
25
 | 
     my $real_max_length = shift || 9;  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
61
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     my $line = decode( 'utf8', $string );  | 
| 
62
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
170
 | 
     my $len = length($line);  | 
| 
63
 | 
4
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
19
 | 
     return 0 if !$len or $len <= 0;  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
65
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my @result;  | 
| 
66
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
     my @eng = $line =~ /[A-Za-z0-9\-\_\:\.]+/g;  | 
| 
67
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     unshift @result, @eng;  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
69
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     while ( length($line) >= 1 ) {  | 
| 
70
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
58
 | 
         for ( 0 .. $real_max_length - 1 ) {  | 
| 
71
 | 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
304
 | 
             my $len = $real_max_length - $_;  | 
| 
72
 | 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
397
 | 
             my $w = substr( $line, $_ - $real_max_length );  | 
| 
73
 | 
330
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
773
 | 
             if ( defined $WSCL_DICT{$len}{$w} ) {  | 
| 
74
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
                 unshift @result, encode( 'utf8', $w );  | 
| 
75
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
327
 | 
                 $line =  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                   substr( $line, 0, length($line) - ( $real_max_length - $_ ) );  | 
| 
77
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
41
 | 
                 last;  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
80
 | 
314
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
511
 | 
             if ( $_ == $real_max_length - 1 ) {  | 
| 
81
 | 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
                 $line = substr( $line, 0, length($line) - 1 );  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
85
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
     return @result;  | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |