File Coverage

blib/lib/Text/Util/Chinese.pm

Criterion	Covered	Total	%
statement	108	108	100.0
branch	26	32	81.2
condition	27	31	87.1
subroutine	16	16	100.0
pod	4	6	66.6
total	181	193	93.7

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Util::Chinese;
2	6			6		10087	use strict;
	6					41
	6					167
3	6			6		72	use warnings;
	6					14
	6					143
4	6			6		31	use utf8;
	6					11
	6					24
5
6	6			6		206	use Exporter 5.57 'import';
	6					172
	6					269
7	6			6		6003	use Unicode::UCD qw(charscript);
	6					321356
	6					801
8
9							our $VERSION = '0.06';
10							our @EXPORT_OK = qw(sentence_iterator phrase_iterator extract_presuf extract_words tokenize_by_script);
11
12	6			6		58	use List::Util qw(uniq pairmap);
	6					12
	6					1599
13
14							sub grep_iterator {
15	2			2	0	109	my ($iter, $cb) = @_;
16							return sub {
17	14			14		40	local $_;
18	14					18	do {
19	19					55	$_ = $iter->();
20	19	100				70	return undef unless defined($_);
21							} while (! $cb->());
22	12					42	return $_;
23							}
24	2					14	}
25
26							sub phrase_iterator {
27	2			2	1	139	my ($input_iter, $opts) = @_;
28	2					5	my @phrases;
29							return sub {
30	1403		100	1403		343352	while(! @phrases && defined(my $text = $input_iter->())) {
31							@phrases = grep {
32	692	100	100			22727	(! /\A\s+\z/) && (! /\p{General_Category=Punctuation}/) && /\p{Han}/
	3085					18748
33							} split / ( \r?\n \| \p{General_Category: Other_Punctuation} )+ /x, $text;
34							}
35	1403					3186	return shift @phrases;
36							}
37	2					14	}
38
39							sub sentence_iterator {
40	3			3	0	130488	my ($input_iter, $opts) = @_;
41	3					6	my @sentences;
42							return sub {
43	773		100	773		187122	while(! @sentences && defined(my $text = $input_iter->())) {
44	692					12164	@sentences = grep { !/\A\s+\z/ } ($text =~
	771					5464
45							m/(
46							(?:
47							[^\p{General_Category: Open_Punctuation}\p{General_Category: Close_Punctuation}]+?
48							\| .? \p{General_Category: Open_Punctuation} .? \p{General_Category: Close_Punctuation} .*?
49							)
50							(?: \z \| [\n\?\!。？！]+ )
51							)/gx);
52							}
53	773					1813	return shift @sentences;
54							}
55	3					16	}
56
57							sub extract_presuf {
58	1			1	1	110	my ($input_iter, $output_cb, $opts) = @_;
59
60	1					6	my %stats;
61							my %extracted;
62	1		50			6	my $threshold = $opts->{threshold} \|\| 9; # an arbitrary choice.
63	1		50			11	my $lengths = $opts->{lengths} \|\| [2,3];
64	1					10	my $text;
65
66							my $phrase_iter = grep_iterator(
67							phrase_iterator( $input_iter ),
68	7			7		39	sub { /\A\p{Han}+\z/ }
69	1					5	);
70	1					4	while (my $phrase = $phrase_iter->()) {
71	7					16	for my $len ( @$lengths ) {
72	14					34	my $re = '\p{Han}{' . $len . '}';
73	14	100	66			202	next unless length($phrase) >= $len * 2 && $phrase =~ /\A($re) .* ($re)\z/x;
74	12					1997	my ($prefix, $suffix) = ($1, $2);
75	12	50				48	$stats{prefix}{$prefix}++ unless $extracted{$prefix};
76	12	50				36	$stats{suffix}{$suffix}++ unless $extracted{$suffix};
77
78	12					24	for my $x ($prefix, $suffix) {
79	24	100	66			147	if (! $extracted{$x}
			100
			100
			100
80							&& $stats{prefix}{$x}
81							&& $stats{suffix}{$x}
82							&& $stats{prefix}{$x} > $threshold
83							&& $stats{suffix}{$x} > $threshold
84							) {
85	1					5	$extracted{$x} = 1;
86	1					8	delete $stats{prefix}{$x};
87	1					3	delete $stats{suffix}{$x};
88
89	1					6	$output_cb->($x, \%extracted);
90							}
91							}
92							}
93							}
94
95
96	1					16	return \%extracted;
97							}
98
99							sub extract_words {
100	1			1	1	100	my ($input_iter) = @_;
101
102	1					2	my (%lcontext, %rcontext);
103
104	1					6	while( my $txt = $input_iter->() ) {
105	7					110	my @phrase = split /\P{Letter}/, $txt;
106	7					17	for (@phrase) {
107	9	50				50	next unless /\A\p{Han}+\z/;
108
109	9					30	my @c = split("", $_);
110
111	9					23	for my $i (0..$#c) {
112	67	100				119	if ($i > 0) {
113	58					144	$lcontext{$c[$i]}{$c[$i-1]}++;
114	58					90	for my $n (2,3) {
115	116	100				206	if ($i >= $n) {
116	89					205	my $tok = join('', @c[ ($i-$n+1) .. $i] );
117	89	50				185	if (length($tok) > 1) {
118	89					237	$lcontext{ $tok }{$c[$i - $n]}++;
119							}
120							}
121							}
122							}
123	67	100				152	if ($i < $#c) {
124	58					158	$rcontext{$c[$i]}{$c[$i+1]}++;
125	58					87	for my $n (2,3) {
126	116	100				271	if ($i + $n <= $#c) {
127	89					177	my $tok = join('', @c[$i .. ($i+$n-1)]);
128	89	50				181	if (length($tok) > 1) {
129	89					330	$rcontext{ $tok }{ $c[$i+$n] }++;
130							}
131							}
132							}
133							}
134							}
135							}
136							}
137
138	1					80	my @tokens = uniq((keys %lcontext), (keys %rcontext));
139	1					12	my @words;
140	1					4	my $threshold = 5;
141	1					2	for my $x (@tokens) {
142	148	100	100			180	next unless ($threshold <= (keys %{$lcontext{$x}}) && $threshold <= (keys %{$rcontext{$x}}));
	148					354
	2					9
143	1					3	push @words, $x;
144							}
145
146	1					62	return \@words;
147							}
148
149							sub tokenize_by_script {
150	1			1	1	93	my ($str) = @_;
151	1					2	my @tokens;
152	1					11	my @chars = grep { defined($_) } split "", $str;
	41					68
153	1	50				6	return () unless @chars;
154
155	1					3	my $t = shift(@chars);
156	1					9	my $s = charscript(ord($t));
157	1					15241	while(my $char = shift @chars) {
158	40					82	my $_s = charscript(ord($char));
159	40	100				2681	if ($_s eq $s) {
160	29					70	$t .= $char;
161							}
162							else {
163	11					22	push @tokens, $t;
164	11					15	$s = $_s;
165	11					27	$t = $char;
166							}
167							}
168	1					7	push @tokens, $t;
169	1					3	return grep { ! /\A\s*\z/u } @tokens;
	12					55
170							}
171
172							1;
173
174							__END__