File Coverage

blib/lib/Word/Segmenter/Chinese/Lite.pm

Criterion	Covered	Total	%
statement	56	57	98.2
branch	13	16	81.2
condition	6	11	54.5
subroutine	10	10	100.0
pod	2	5	40.0
total	87	99	87.8

line	stmt	bran	cond	sub	pod	time	code
1							package Word::Segmenter::Chinese::Lite;
2
3	3			3		78774	use 5.008008;
	3					11
	3					141
4	3			3		17	use strict;
	3					6
	3					126
5	3			3		16	use warnings;
	3					12
	3					115
6
7	3			3		3014	use Encode;
	3					52358
	3					338
8	3			3		2029	use Word::Segmenter::Chinese::Lite::Dict qw(wscl_get_dict_default);
	3					12
	3					2640
9
10							require Exporter;
11							our @ISA = qw(Exporter);
12							our @EXPORT = qw(wscl_seg wscl_set_mode);
13							our $VERSION = '0.08';
14
15							our $WSCL_MODE = 'dict';
16							our %WSCL_DICT;
17
18							sub wscl_set_mode {
19	2			2	1	692	my $mode = shift;
20	2	50	66			20	if ( $mode eq 'dict' or $mode eq 'obigram' or $mode eq 'unigram' ) {
			66
21	2					3	$WSCL_MODE = $mode;
22							}
23	2					5	return 0;
24							}
25
26							sub wscl_seg {
27	6			6	1	2212	my $str = shift;
28	6	100				18	if ( $WSCL_MODE eq 'dict' ) {
29	4	100				16	%WSCL_DICT = wscl_get_dict_default() unless defined $WSCL_DICT{'1'};
30	4					16	return wscl_seg_dict($str);
31							}
32	2	100				25	if ( $WSCL_MODE eq 'obigram' ) {
33	1					4	return wscl_seg_obigram($str);
34							}
35	1	50				4	if ( $WSCL_MODE eq 'unigram' ) {
36	1					3	return wscl_seg_unigram($str);
37							}
38	0					0	return 0;
39							}
40
41							sub wscl_seg_unigram {
42	1			1	0	2	my $w = shift;
43	1					6	my @r = map { $_ = encode( 'utf8', $_ ) } split //, decode( 'utf8', $w );
	10					261
44	1					24	return @r;
45							}
46
47							sub wscl_seg_obigram {
48	1			1	0	2	my $w = shift;
49	1					1	my @r;
50	1					4	for ( 0 .. length( decode( 'utf8', $w ) ) ) {
51	11					45	my $tmp = encode( 'utf8', substr( decode( 'utf8', $w ), $_, 2 ) );
52	11					404	push @r, $tmp;
53							}
54	1					9	return @r;
55							}
56
57							sub wscl_seg_dict {
58	4			4	0	53	my $string = shift;
59	4		50			25	my $real_max_length = shift \|\| 9;
60
61	4					16	my $line = decode( 'utf8', $string );
62	4					170	my $len = length($line);
63	4	50	33			19	return 0 if !$len or $len <= 0;
64
65	4					6	my @result;
66	4					29	my @eng = $line =~ /[A-Za-z0-9\-\_\:\.]+/g;
67	4					9	unshift @result, @eng;
68
69	4					8	while ( length($line) >= 1 ) {
70	39					58	for ( 0 .. $real_max_length - 1 ) {
71	330					304	my $len = $real_max_length - $_;
72	330					397	my $w = substr( $line, $_ - $real_max_length );
73	330	100				773	if ( defined $WSCL_DICT{$len}{$w} ) {
74	16					40	unshift @result, encode( 'utf8', $w );
75	16					327	$line =
76							substr( $line, 0, length($line) - ( $real_max_length - $_ ) );
77	16					41	last;
78							}
79
80	314	100				511	if ( $_ == $real_max_length - 1 ) {
81	23					67	$line = substr( $line, 0, length($line) - 1 );
82							}
83							}
84							}
85	4					30	return @result;
86							}
87
88							1;
89							__END__