File Coverage

blib/lib/Lingua/RU/OpenCorpora/Tokenizer.pm

Criterion	Covered	Total	%
statement	123	143	86.0
branch	74	120	61.6
condition	27	56	48.2
subroutine	31	35	88.5
pod	3	3	100.0
total	258	357	72.2

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::RU::OpenCorpora::Tokenizer;
2
3	2			2		86983	use utf8;
	2					15
	2					13
4	2			2		60	use strict;
	2					3
	2					68
5	2			2		10	use warnings;
	2					8
	2					62
6
7	2			2		114883	use Unicode::Normalize;
	2					10247
	2					209
8	2			2		1881	use Lingua::RU::OpenCorpora::Tokenizer::List;
	2					7
	2					62
9	2			2		1487	use Lingua::RU::OpenCorpora::Tokenizer::Vectors;
	2					6
	2					3504
10
11							our $VERSION = 0.06;
12
13							sub new {
14	1			1	1	28	my $class = shift;
15
16	1					5	my $self = bless {@_}, $class;
17	1					6	$self->_init;
18
19	1					6	$self;
20							}
21
22							sub tokens {
23	0			0	1	0	my($self, $text, $options) = @_;
24
25	0	0				0	$options = {} unless defined $options;
26	0					0	$options->{want_tokens} = 1;
27	0	0				0	$options->{threshold} = 1 unless defined $options->{threshold};
28
29	0					0	$self->_do_tokenize($text, $options);
30
31	0					0	$self->{tokens};
32							}
33
34							sub tokens_bounds {
35	6			6	1	28269	my($self, $text, $options) = @_;
36
37	6	50				39	$options = {} unless defined $options;
38	6					22	$options->{want_tokens} = 0;
39
40	6					28	$self->_do_tokenize($text, $options);
41
42	6					36	$self->{bounds};
43							}
44
45							sub _do_tokenize {
46	6			6		19	my($self, $text, $options) = @_;
47
48							# workaround for The Unicode Bug
49							# see https://metacpan.org/module/perlunicode#The-Unicode-Bug
50	6					17	utf8::upgrade($text);
51
52							# normalize Unicode to prevent decomposed characters to be processed separately
53	6					152	$text = NFC($text);
54
55	6					128	my $chars = $self->{chars} = [split //, $text];
56	6					56	$self->{bounds} = [];
57	6					36	$self->{tokens} = [];
58
59	6					14	my $token;
60	6					13	for(my $i = 0; $i <= $#{ $chars }; $i++) {
	170					476
61	164	100				1003	my $ctx = {
62							char => $chars->[$i],
63							prevchar => $i ? $chars->[$i - 1] : '',
64							nextchar => $chars->[$i + 1],
65							nnextchar => $chars->[$i + 2],
66							pos => $i,
67							};
68							not defined $ctx->{$_} and $ctx->{$_} = ''
69	164		100			869	for qw(nextchar nnextchar);
70
71	164					385	$self->_get_char_sequences($ctx);
72	164					327	$self->_vectorize($ctx);
73
74	164					580	my $coeff = $self->{vectors}->in_list($ctx->{vector});
75	164	50				392	$coeff = 0.5 unless defined $coeff;
76
77	164	50				328	if($options->{want_tokens}) {
78	0					0	$token .= $chars->[$i];
79
80	0	0	0			0	if(
81	0					0	$coeff >= $options->{threshold}
82							or $ctx->{pos} == $#{ $chars }
83							)
84							{
85	0					0	$token =~ s{^\s+\|\s+$}{}g;
86	0	0				0	push @{ $self->{tokens} }, $token if $token;
	0					0
87	0					0	$token = '';
88							}
89							}
90							else {
91	164	100				684	if($coeff) {
92	29					33	push @{ $self->{bounds} }, [$ctx->{pos}, $coeff];
	29					193
93							}
94							}
95							}
96							}
97
98							sub _get_char_sequences {
99	164			164		227	my($self, $ctx) = @_;
100
101	164					253	my $seq = my $seq_left = my $seq_right = '';
102	164					179	my $spacer = '';
103
104	164	100	100			952	if(
105							$ctx->{nextchar} =~ m\|([-./?=:&"!+()])\|
106							or $ctx->{char} =~ m\|([-./?=:&"!+()])\|
107							)
108							{
109	6					15	$spacer = $1;
110							}
111
112	164	100				347	if(length $spacer) {
113							# go left
114	6					25	for(my $i = $ctx->{pos}; $i >= 0; $i--) {
115	55					85	my $ch = $self->{chars}[$i];
116
117	55		33			124	my $case1 = !!(
118							_is_hyphen($spacer)
119							and (
120							_is_cyr($ch)
121							or _is_hyphen($ch)
122							or _is_single_quote($ch)
123							)
124							);
125	55		66			96	my $case2 = !!(
126							not _is_hyphen($spacer)
127							and not _is_space($ch)
128							);
129
130	55	100	66			181	if($case1 or $case2) {
131	49					84	$seq_left = $ch . $seq_left;
132							}
133							else {
134	6					13	last;
135							}
136
137	49	100				226	$seq_left = substr $seq_left, 0, -1
138							if substr($seq_left, -1) eq $spacer;
139							}
140
141							# go right
142	6					16	for(my $i = $ctx->{pos} + 1; $i <= $#{ $self->{chars} }; $i++) {
	9					30
143	3					7	my $ch = $self->{chars}[$i];
144
145	3		33			8	my $case1 = !!(
146							_is_hyphen($spacer)
147							and (
148							_is_cyr($ch)
149							or _is_hyphen($ch)
150							or _is_single_quote($ch)
151							)
152							);
153	3		33			8	my $case2 = !!(
154							not _is_hyphen($spacer)
155							and not _is_space($ch)
156							);
157
158	3	50	33			15	if($case1 or $case2) {
159	3					6	$seq_right .= $ch;
160							}
161							else {
162	0					0	last;
163							}
164
165	3	50				20	$seq_right = substr $seq_right, 0, 1
166							if substr($seq_right, -1) eq $spacer;
167							}
168
169	6					16	$seq = join '', $seq_left, $spacer, $seq_right;
170							}
171
172	164					296	$ctx->{spacer} = $spacer;
173	164					273	$ctx->{seq} = $seq;
174	164					360	$ctx->{seq_left} = $seq_left;
175	164					248	$ctx->{seq_right} = $seq_right;
176
177	164					289	return;
178							}
179
180							sub _vectorize {
181	164					671	my $ckey = join ',', _is_hyphen($_[1]->{spacer}),
182	164			164		357	@{$_[1]}{qw(spacer prevchar char nextchar nnextchar seq_left seq seq_right)};
183	164		66			910	$_[1]->{vector} = $_[0]->{_vectors_cache}{$ckey} \|\|= $_[0]->_do_vectorize($_[1]);
184
185	164					233	return;
186							}
187
188							sub _do_vectorize {
189	153			153		198	my($self, $ctx) = @_;
190
191	153					260	my $spacer = !!length $ctx->{spacer};
192	153		66			302	my $spacer_is_hyphen = $spacer && _is_hyphen($ctx->{spacer});
193
194	153	50	66			310	my @bits = (
		50	66
		100	100
		100
		50
		100
195							_char_class($ctx->{char}),
196							_char_class($ctx->{nextchar}),
197							_is_digit($ctx->{prevchar}),
198							_is_digit($ctx->{nnextchar}),
199							$spacer_is_hyphen
200							? _is_dict_seq($self->{hyphens}, $ctx->{seq})
201							: 0,
202							$spacer_is_hyphen
203							? _is_suffix($ctx->{seq_right})
204							: 0,
205							_is_same_pm($ctx->{char}, $ctx->{nextchar}),
206							($spacer and not $spacer_is_hyphen)
207							? _looks_like_url($ctx->{seq}, $ctx->{seq_right})
208							: 0,
209							($spacer and not $spacer_is_hyphen)
210							? _is_exception_seq($self->{exceptions}, $ctx->{seq})
211							: 0,
212							$spacer_is_hyphen
213							? _is_prefix($self->{prefixes}, $ctx->{seq_left})
214							: 0,
215							(_is_colon($ctx->{spacer}) and !!length $ctx->{seq_right})
216							? _looks_like_time($ctx->{seq_left}, $ctx->{seq_right})
217							: 0,
218							);
219
220	153					1244	oct join '', '0b', @bits;
221							}
222
223							sub _init {
224	1			1		1	my $self = shift;
225
226	1					3	for(qw(exceptions prefixes hyphens)) {
227	3					44	my $list = Lingua::RU::OpenCorpora::Tokenizer::List->new(
228							$_,
229							{
230							data_dir => $self->{data_dir},
231							},
232							);
233	3					24	$self->{$_} = $list;
234							}
235
236	1					21	my $vectors = Lingua::RU::OpenCorpora::Tokenizer::Vectors->new({
237							data_dir => $self->{data_dir},
238							});
239	1					7	$self->{vectors} = $vectors;
240
241	1					3	return;
242							}
243
244	21	100		21		92	sub _is_pmark { $_[0] =~ /^[,?!";«»]$/ ? 1 : 0 }
245
246	2	100		2		12	sub _is_latin { $_[0] =~ /^\p{Latin}$/ ? 1 : 0 }
	2			16		3
	2					42
	16					115
247
248	306	100		306		1574	sub _is_cyr { $_[0] =~ /^\p{Cyrillic}$/ ? 1 : 0 }
249
250	322	50		322		1212	sub _is_digit { $_[0] =~ /^[0-9]$/ ? 1 : 0 }
251
252	8	50		8		37	sub _is_bracket1 { $_[0] =~ /^[\[({<]$/ ? 1 : 0 }
253
254	8	50		8		39	sub _is_bracket2 { $_[0] =~ /^[\])}>]$/ ? 1 : 0 }
255
256	0	0		0		0	sub _is_suffix { $_[0] =~ /^(?:то\|таки\|с\|ка\|де)$/ ? 1 : 0 }
257
258	119	100		119		492	sub _is_space { $_[0] =~ /^\s$/ ? 1 : 0 }
259
260	302	50		302		790	sub _is_hyphen { $_[0] eq '-' ? 1 : 0 }
261
262	25	100		25		112	sub _is_dot { $_[0] eq '.' ? 1 : 0 }
263
264	8	50		8		41	sub _is_single_quote { $_[0] eq "'" ? 1 : 0 }
265
266	8	50		8		29	sub _is_slash { $_[0] eq '/' ? 1 : 0 }
267
268	161	100		161		750	sub _is_colon { $_[0] eq ':' ? 1 : 0 }
269
270	153	100		153		966	sub _is_same_pm { $_[0] eq $_[1] ? 1 : 0 }
271
272	0	0		0		0	sub _is_prefix { $_[0]->in_list($_[1]) ? 1 : 0 }
273
274							sub _is_dict_seq {
275	0	0	0	0		0	return 0 if not $_[1] or substr $_[1], 0, 1 eq '-';
276
277	0	0				0	$_[0]->in_list($_[1]) ? 1 : 0;
278							}
279
280							sub _is_exception_seq {
281	6			6		12	my $seq = $_[1]; # need a copy
282
283	6	50				36	return 1 if $_[0]->in_list($seq);
284
285	6	50				5427	return 0 unless $seq =~ /^\W\|\W$/;
286
287	6					19	$seq =~ s/^\W+//;
288	6	50				24	return 1 if $_[0]->in_list($seq);
289
290	6					38	while($seq =~ s/\W$//) {
291	9	50				30	return 1 if $_[0]->in_list($seq);
292							}
293
294	6					21	0;
295							}
296
297							sub _looks_like_url {
298	6	100		6		34	return 0 unless !!length $_[1];
299	3	50				12	return 0 if length $_[0] < 5;
300	3	50				12	return 0 if substr $_[0], 0, 1 eq '.';
301
302	3	50	33			62	$_[0] =~ m{^\W*https?://?}
			33
303							or $_[0] =~ m{^\W*www\.}
304							or $_[0] =~ m<.\.(?:[a-z]{2,3}\|ру\|рф)\W*$>i
305							or return 0;
306
307	0					0	1;
308							}
309
310							sub _looks_like_time {
311	1			1		2	my($seq_left, $seq_right) = @_; # need copies
312
313	1					7	$seq_left =~ s/^[^0-9]{1,2}//;
314	1					7	$seq_right =~ s/[^0-9]+$//;
315
316	1	50	33			13	return 0 if $seq_left !~ /^[0-9]{1,2}$/
317							or $seq_right !~ /^[0-9]{2}$/;
318
319	0	0	0			0	($seq_left < 24 and $seq_right < 60)
320							? 1
321							: 0;
322							}
323
324							sub _char_class {
325	306	100		306		525	_is_cyr($_[0]) ? '0001' :
		50
		50
		50
		50
		100
		50
		50
		100
		100
		100
		100
326							_is_space($_[0]) ? '0010' :
327							_is_dot($_[0]) ? '0011' :
328							_is_pmark($_[0]) ? '0100' :
329							_is_hyphen($_[0]) ? '0101' :
330							_is_digit($_[0]) ? '0110' :
331							_is_latin($_[0]) ? '0111' :
332							_is_bracket1($_[0]) ? '1000' :
333							_is_bracket2($_[0]) ? '1001' :
334							_is_single_quote($_[0]) ? '1010' :
335							_is_slash($_[0]) ? '1011' :
336							_is_colon($_[0]) ? '1100' : '0000';
337							}
338
339							1;
340
341							__END__