| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::RU::OpenCorpora::Tokenizer; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
86983
|
use utf8; |
|
|
2
|
|
|
|
|
15
|
|
|
|
2
|
|
|
|
|
13
|
|
|
4
|
2
|
|
|
2
|
|
60
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
68
|
|
|
5
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
|
2
|
|
|
|
|
8
|
|
|
|
2
|
|
|
|
|
62
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
114883
|
use Unicode::Normalize; |
|
|
2
|
|
|
|
|
10247
|
|
|
|
2
|
|
|
|
|
209
|
|
|
8
|
2
|
|
|
2
|
|
1881
|
use Lingua::RU::OpenCorpora::Tokenizer::List; |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
62
|
|
|
9
|
2
|
|
|
2
|
|
1487
|
use Lingua::RU::OpenCorpora::Tokenizer::Vectors; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
3504
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = 0.06; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
|
14
|
1
|
|
|
1
|
1
|
28
|
my $class = shift; |
|
15
|
|
|
|
|
|
|
|
|
16
|
1
|
|
|
|
|
5
|
my $self = bless {@_}, $class; |
|
17
|
1
|
|
|
|
|
6
|
$self->_init; |
|
18
|
|
|
|
|
|
|
|
|
19
|
1
|
|
|
|
|
6
|
$self; |
|
20
|
|
|
|
|
|
|
} |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub tokens { |
|
23
|
0
|
|
|
0
|
1
|
0
|
my($self, $text, $options) = @_; |
|
24
|
|
|
|
|
|
|
|
|
25
|
0
|
0
|
|
|
|
0
|
$options = {} unless defined $options; |
|
26
|
0
|
|
|
|
|
0
|
$options->{want_tokens} = 1; |
|
27
|
0
|
0
|
|
|
|
0
|
$options->{threshold} = 1 unless defined $options->{threshold}; |
|
28
|
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
0
|
$self->_do_tokenize($text, $options); |
|
30
|
|
|
|
|
|
|
|
|
31
|
0
|
|
|
|
|
0
|
$self->{tokens}; |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub tokens_bounds { |
|
35
|
6
|
|
|
6
|
1
|
28269
|
my($self, $text, $options) = @_; |
|
36
|
|
|
|
|
|
|
|
|
37
|
6
|
50
|
|
|
|
39
|
$options = {} unless defined $options; |
|
38
|
6
|
|
|
|
|
22
|
$options->{want_tokens} = 0; |
|
39
|
|
|
|
|
|
|
|
|
40
|
6
|
|
|
|
|
28
|
$self->_do_tokenize($text, $options); |
|
41
|
|
|
|
|
|
|
|
|
42
|
6
|
|
|
|
|
36
|
$self->{bounds}; |
|
43
|
|
|
|
|
|
|
} |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub _do_tokenize { |
|
46
|
6
|
|
|
6
|
|
19
|
my($self, $text, $options) = @_; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# workaround for The Unicode Bug |
|
49
|
|
|
|
|
|
|
# see https://metacpan.org/module/perlunicode#The-Unicode-Bug |
|
50
|
6
|
|
|
|
|
17
|
utf8::upgrade($text); |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# normalize Unicode to prevent decomposed characters to be processed separately |
|
53
|
6
|
|
|
|
|
152
|
$text = NFC($text); |
|
54
|
|
|
|
|
|
|
|
|
55
|
6
|
|
|
|
|
128
|
my $chars = $self->{chars} = [split //, $text]; |
|
56
|
6
|
|
|
|
|
56
|
$self->{bounds} = []; |
|
57
|
6
|
|
|
|
|
36
|
$self->{tokens} = []; |
|
58
|
|
|
|
|
|
|
|
|
59
|
6
|
|
|
|
|
14
|
my $token; |
|
60
|
6
|
|
|
|
|
13
|
for(my $i = 0; $i <= $#{ $chars }; $i++) { |
|
|
170
|
|
|
|
|
476
|
|
|
61
|
164
|
100
|
|
|
|
1003
|
my $ctx = { |
|
62
|
|
|
|
|
|
|
char => $chars->[$i], |
|
63
|
|
|
|
|
|
|
prevchar => $i ? $chars->[$i - 1] : '', |
|
64
|
|
|
|
|
|
|
nextchar => $chars->[$i + 1], |
|
65
|
|
|
|
|
|
|
nnextchar => $chars->[$i + 2], |
|
66
|
|
|
|
|
|
|
pos => $i, |
|
67
|
|
|
|
|
|
|
}; |
|
68
|
|
|
|
|
|
|
not defined $ctx->{$_} and $ctx->{$_} = '' |
|
69
|
164
|
|
100
|
|
|
869
|
for qw(nextchar nnextchar); |
|
70
|
|
|
|
|
|
|
|
|
71
|
164
|
|
|
|
|
385
|
$self->_get_char_sequences($ctx); |
|
72
|
164
|
|
|
|
|
327
|
$self->_vectorize($ctx); |
|
73
|
|
|
|
|
|
|
|
|
74
|
164
|
|
|
|
|
580
|
my $coeff = $self->{vectors}->in_list($ctx->{vector}); |
|
75
|
164
|
50
|
|
|
|
392
|
$coeff = 0.5 unless defined $coeff; |
|
76
|
|
|
|
|
|
|
|
|
77
|
164
|
50
|
|
|
|
328
|
if($options->{want_tokens}) { |
|
78
|
0
|
|
|
|
|
0
|
$token .= $chars->[$i]; |
|
79
|
|
|
|
|
|
|
|
|
80
|
0
|
0
|
0
|
|
|
0
|
if( |
|
81
|
0
|
|
|
|
|
0
|
$coeff >= $options->{threshold} |
|
82
|
|
|
|
|
|
|
or $ctx->{pos} == $#{ $chars } |
|
83
|
|
|
|
|
|
|
) |
|
84
|
|
|
|
|
|
|
{ |
|
85
|
0
|
|
|
|
|
0
|
$token =~ s{^\s+|\s+$}{}g; |
|
86
|
0
|
0
|
|
|
|
0
|
push @{ $self->{tokens} }, $token if $token; |
|
|
0
|
|
|
|
|
0
|
|
|
87
|
0
|
|
|
|
|
0
|
$token = ''; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
else { |
|
91
|
164
|
100
|
|
|
|
684
|
if($coeff) { |
|
92
|
29
|
|
|
|
|
33
|
push @{ $self->{bounds} }, [$ctx->{pos}, $coeff]; |
|
|
29
|
|
|
|
|
193
|
|
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
} |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub _get_char_sequences { |
|
99
|
164
|
|
|
164
|
|
227
|
my($self, $ctx) = @_; |
|
100
|
|
|
|
|
|
|
|
|
101
|
164
|
|
|
|
|
253
|
my $seq = my $seq_left = my $seq_right = ''; |
|
102
|
164
|
|
|
|
|
179
|
my $spacer = ''; |
|
103
|
|
|
|
|
|
|
|
|
104
|
164
|
100
|
100
|
|
|
952
|
if( |
|
105
|
|
|
|
|
|
|
$ctx->{nextchar} =~ m|([-./?=:&"!+()])| |
|
106
|
|
|
|
|
|
|
or $ctx->{char} =~ m|([-./?=:&"!+()])| |
|
107
|
|
|
|
|
|
|
) |
|
108
|
|
|
|
|
|
|
{ |
|
109
|
6
|
|
|
|
|
15
|
$spacer = $1; |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
164
|
100
|
|
|
|
347
|
if(length $spacer) { |
|
113
|
|
|
|
|
|
|
# go left |
|
114
|
6
|
|
|
|
|
25
|
for(my $i = $ctx->{pos}; $i >= 0; $i--) { |
|
115
|
55
|
|
|
|
|
85
|
my $ch = $self->{chars}[$i]; |
|
116
|
|
|
|
|
|
|
|
|
117
|
55
|
|
33
|
|
|
124
|
my $case1 = !!( |
|
118
|
|
|
|
|
|
|
_is_hyphen($spacer) |
|
119
|
|
|
|
|
|
|
and ( |
|
120
|
|
|
|
|
|
|
_is_cyr($ch) |
|
121
|
|
|
|
|
|
|
or _is_hyphen($ch) |
|
122
|
|
|
|
|
|
|
or _is_single_quote($ch) |
|
123
|
|
|
|
|
|
|
) |
|
124
|
|
|
|
|
|
|
); |
|
125
|
55
|
|
66
|
|
|
96
|
my $case2 = !!( |
|
126
|
|
|
|
|
|
|
not _is_hyphen($spacer) |
|
127
|
|
|
|
|
|
|
and not _is_space($ch) |
|
128
|
|
|
|
|
|
|
); |
|
129
|
|
|
|
|
|
|
|
|
130
|
55
|
100
|
66
|
|
|
181
|
if($case1 or $case2) { |
|
131
|
49
|
|
|
|
|
84
|
$seq_left = $ch . $seq_left; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
else { |
|
134
|
6
|
|
|
|
|
13
|
last; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
49
|
100
|
|
|
|
226
|
$seq_left = substr $seq_left, 0, -1 |
|
138
|
|
|
|
|
|
|
if substr($seq_left, -1) eq $spacer; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# go right |
|
142
|
6
|
|
|
|
|
16
|
for(my $i = $ctx->{pos} + 1; $i <= $#{ $self->{chars} }; $i++) { |
|
|
9
|
|
|
|
|
30
|
|
|
143
|
3
|
|
|
|
|
7
|
my $ch = $self->{chars}[$i]; |
|
144
|
|
|
|
|
|
|
|
|
145
|
3
|
|
33
|
|
|
8
|
my $case1 = !!( |
|
146
|
|
|
|
|
|
|
_is_hyphen($spacer) |
|
147
|
|
|
|
|
|
|
and ( |
|
148
|
|
|
|
|
|
|
_is_cyr($ch) |
|
149
|
|
|
|
|
|
|
or _is_hyphen($ch) |
|
150
|
|
|
|
|
|
|
or _is_single_quote($ch) |
|
151
|
|
|
|
|
|
|
) |
|
152
|
|
|
|
|
|
|
); |
|
153
|
3
|
|
33
|
|
|
8
|
my $case2 = !!( |
|
154
|
|
|
|
|
|
|
not _is_hyphen($spacer) |
|
155
|
|
|
|
|
|
|
and not _is_space($ch) |
|
156
|
|
|
|
|
|
|
); |
|
157
|
|
|
|
|
|
|
|
|
158
|
3
|
50
|
33
|
|
|
15
|
if($case1 or $case2) { |
|
159
|
3
|
|
|
|
|
6
|
$seq_right .= $ch; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
|
|
|
|
|
|
else { |
|
162
|
0
|
|
|
|
|
0
|
last; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
3
|
50
|
|
|
|
20
|
$seq_right = substr $seq_right, 0, 1 |
|
166
|
|
|
|
|
|
|
if substr($seq_right, -1) eq $spacer; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
|
|
169
|
6
|
|
|
|
|
16
|
$seq = join '', $seq_left, $spacer, $seq_right; |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
164
|
|
|
|
|
296
|
$ctx->{spacer} = $spacer; |
|
173
|
164
|
|
|
|
|
273
|
$ctx->{seq} = $seq; |
|
174
|
164
|
|
|
|
|
360
|
$ctx->{seq_left} = $seq_left; |
|
175
|
164
|
|
|
|
|
248
|
$ctx->{seq_right} = $seq_right; |
|
176
|
|
|
|
|
|
|
|
|
177
|
164
|
|
|
|
|
289
|
return; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub _vectorize { |
|
181
|
164
|
|
|
|
|
671
|
my $ckey = join ',', _is_hyphen($_[1]->{spacer}), |
|
182
|
164
|
|
|
164
|
|
357
|
@{$_[1]}{qw(spacer prevchar char nextchar nnextchar seq_left seq seq_right)}; |
|
183
|
164
|
|
66
|
|
|
910
|
$_[1]->{vector} = $_[0]->{_vectors_cache}{$ckey} ||= $_[0]->_do_vectorize($_[1]); |
|
184
|
|
|
|
|
|
|
|
|
185
|
164
|
|
|
|
|
233
|
return; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub _do_vectorize { |
|
189
|
153
|
|
|
153
|
|
198
|
my($self, $ctx) = @_; |
|
190
|
|
|
|
|
|
|
|
|
191
|
153
|
|
|
|
|
260
|
my $spacer = !!length $ctx->{spacer}; |
|
192
|
153
|
|
66
|
|
|
302
|
my $spacer_is_hyphen = $spacer && _is_hyphen($ctx->{spacer}); |
|
193
|
|
|
|
|
|
|
|
|
194
|
153
|
50
|
66
|
|
|
310
|
my @bits = ( |
|
|
|
50
|
66
|
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
_char_class($ctx->{char}), |
|
196
|
|
|
|
|
|
|
_char_class($ctx->{nextchar}), |
|
197
|
|
|
|
|
|
|
_is_digit($ctx->{prevchar}), |
|
198
|
|
|
|
|
|
|
_is_digit($ctx->{nnextchar}), |
|
199
|
|
|
|
|
|
|
$spacer_is_hyphen |
|
200
|
|
|
|
|
|
|
? _is_dict_seq($self->{hyphens}, $ctx->{seq}) |
|
201
|
|
|
|
|
|
|
: 0, |
|
202
|
|
|
|
|
|
|
$spacer_is_hyphen |
|
203
|
|
|
|
|
|
|
? _is_suffix($ctx->{seq_right}) |
|
204
|
|
|
|
|
|
|
: 0, |
|
205
|
|
|
|
|
|
|
_is_same_pm($ctx->{char}, $ctx->{nextchar}), |
|
206
|
|
|
|
|
|
|
($spacer and not $spacer_is_hyphen) |
|
207
|
|
|
|
|
|
|
? _looks_like_url($ctx->{seq}, $ctx->{seq_right}) |
|
208
|
|
|
|
|
|
|
: 0, |
|
209
|
|
|
|
|
|
|
($spacer and not $spacer_is_hyphen) |
|
210
|
|
|
|
|
|
|
? _is_exception_seq($self->{exceptions}, $ctx->{seq}) |
|
211
|
|
|
|
|
|
|
: 0, |
|
212
|
|
|
|
|
|
|
$spacer_is_hyphen |
|
213
|
|
|
|
|
|
|
? _is_prefix($self->{prefixes}, $ctx->{seq_left}) |
|
214
|
|
|
|
|
|
|
: 0, |
|
215
|
|
|
|
|
|
|
(_is_colon($ctx->{spacer}) and !!length $ctx->{seq_right}) |
|
216
|
|
|
|
|
|
|
? _looks_like_time($ctx->{seq_left}, $ctx->{seq_right}) |
|
217
|
|
|
|
|
|
|
: 0, |
|
218
|
|
|
|
|
|
|
); |
|
219
|
|
|
|
|
|
|
|
|
220
|
153
|
|
|
|
|
1244
|
oct join '', '0b', @bits; |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
sub _init { |
|
224
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
|
225
|
|
|
|
|
|
|
|
|
226
|
1
|
|
|
|
|
3
|
for(qw(exceptions prefixes hyphens)) { |
|
227
|
3
|
|
|
|
|
44
|
my $list = Lingua::RU::OpenCorpora::Tokenizer::List->new( |
|
228
|
|
|
|
|
|
|
$_, |
|
229
|
|
|
|
|
|
|
{ |
|
230
|
|
|
|
|
|
|
data_dir => $self->{data_dir}, |
|
231
|
|
|
|
|
|
|
}, |
|
232
|
|
|
|
|
|
|
); |
|
233
|
3
|
|
|
|
|
24
|
$self->{$_} = $list; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
1
|
|
|
|
|
21
|
my $vectors = Lingua::RU::OpenCorpora::Tokenizer::Vectors->new({ |
|
237
|
|
|
|
|
|
|
data_dir => $self->{data_dir}, |
|
238
|
|
|
|
|
|
|
}); |
|
239
|
1
|
|
|
|
|
7
|
$self->{vectors} = $vectors; |
|
240
|
|
|
|
|
|
|
|
|
241
|
1
|
|
|
|
|
3
|
return; |
|
242
|
|
|
|
|
|
|
} |
|
243
|
|
|
|
|
|
|
|
|
244
|
21
|
100
|
|
21
|
|
92
|
sub _is_pmark { $_[0] =~ /^[,?!";«»]$/ ? 1 : 0 } |
|
245
|
|
|
|
|
|
|
|
|
246
|
2
|
100
|
|
2
|
|
12
|
sub _is_latin { $_[0] =~ /^\p{Latin}$/ ? 1 : 0 } |
|
|
2
|
|
|
16
|
|
3
|
|
|
|
2
|
|
|
|
|
42
|
|
|
|
16
|
|
|
|
|
115
|
|
|
247
|
|
|
|
|
|
|
|
|
248
|
306
|
100
|
|
306
|
|
1574
|
sub _is_cyr { $_[0] =~ /^\p{Cyrillic}$/ ? 1 : 0 } |
|
249
|
|
|
|
|
|
|
|
|
250
|
322
|
50
|
|
322
|
|
1212
|
sub _is_digit { $_[0] =~ /^[0-9]$/ ? 1 : 0 } |
|
251
|
|
|
|
|
|
|
|
|
252
|
8
|
50
|
|
8
|
|
37
|
sub _is_bracket1 { $_[0] =~ /^[\[({<]$/ ? 1 : 0 } |
|
253
|
|
|
|
|
|
|
|
|
254
|
8
|
50
|
|
8
|
|
39
|
sub _is_bracket2 { $_[0] =~ /^[\])}>]$/ ? 1 : 0 } |
|
255
|
|
|
|
|
|
|
|
|
256
|
0
|
0
|
|
0
|
|
0
|
sub _is_suffix { $_[0] =~ /^(?:то|таки|с|ка|де)$/ ? 1 : 0 } |
|
257
|
|
|
|
|
|
|
|
|
258
|
119
|
100
|
|
119
|
|
492
|
sub _is_space { $_[0] =~ /^\s$/ ? 1 : 0 } |
|
259
|
|
|
|
|
|
|
|
|
260
|
302
|
50
|
|
302
|
|
790
|
sub _is_hyphen { $_[0] eq '-' ? 1 : 0 } |
|
261
|
|
|
|
|
|
|
|
|
262
|
25
|
100
|
|
25
|
|
112
|
sub _is_dot { $_[0] eq '.' ? 1 : 0 } |
|
263
|
|
|
|
|
|
|
|
|
264
|
8
|
50
|
|
8
|
|
41
|
sub _is_single_quote { $_[0] eq "'" ? 1 : 0 } |
|
265
|
|
|
|
|
|
|
|
|
266
|
8
|
50
|
|
8
|
|
29
|
sub _is_slash { $_[0] eq '/' ? 1 : 0 } |
|
267
|
|
|
|
|
|
|
|
|
268
|
161
|
100
|
|
161
|
|
750
|
sub _is_colon { $_[0] eq ':' ? 1 : 0 } |
|
269
|
|
|
|
|
|
|
|
|
270
|
153
|
100
|
|
153
|
|
966
|
sub _is_same_pm { $_[0] eq $_[1] ? 1 : 0 } |
|
271
|
|
|
|
|
|
|
|
|
272
|
0
|
0
|
|
0
|
|
0
|
sub _is_prefix { $_[0]->in_list($_[1]) ? 1 : 0 } |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub _is_dict_seq { |
|
275
|
0
|
0
|
0
|
0
|
|
0
|
return 0 if not $_[1] or substr $_[1], 0, 1 eq '-'; |
|
276
|
|
|
|
|
|
|
|
|
277
|
0
|
0
|
|
|
|
0
|
$_[0]->in_list($_[1]) ? 1 : 0; |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub _is_exception_seq { |
|
281
|
6
|
|
|
6
|
|
12
|
my $seq = $_[1]; # need a copy |
|
282
|
|
|
|
|
|
|
|
|
283
|
6
|
50
|
|
|
|
36
|
return 1 if $_[0]->in_list($seq); |
|
284
|
|
|
|
|
|
|
|
|
285
|
6
|
50
|
|
|
|
5427
|
return 0 unless $seq =~ /^\W|\W$/; |
|
286
|
|
|
|
|
|
|
|
|
287
|
6
|
|
|
|
|
19
|
$seq =~ s/^\W+//; |
|
288
|
6
|
50
|
|
|
|
24
|
return 1 if $_[0]->in_list($seq); |
|
289
|
|
|
|
|
|
|
|
|
290
|
6
|
|
|
|
|
38
|
while($seq =~ s/\W$//) { |
|
291
|
9
|
50
|
|
|
|
30
|
return 1 if $_[0]->in_list($seq); |
|
292
|
|
|
|
|
|
|
} |
|
293
|
|
|
|
|
|
|
|
|
294
|
6
|
|
|
|
|
21
|
0; |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub _looks_like_url { |
|
298
|
6
|
100
|
|
6
|
|
34
|
return 0 unless !!length $_[1]; |
|
299
|
3
|
50
|
|
|
|
12
|
return 0 if length $_[0] < 5; |
|
300
|
3
|
50
|
|
|
|
12
|
return 0 if substr $_[0], 0, 1 eq '.'; |
|
301
|
|
|
|
|
|
|
|
|
302
|
3
|
50
|
33
|
|
|
62
|
$_[0] =~ m{^\W*https?://?} |
|
|
|
|
33
|
|
|
|
|
|
303
|
|
|
|
|
|
|
or $_[0] =~ m{^\W*www\.} |
|
304
|
|
|
|
|
|
|
or $_[0] =~ m<.\.(?:[a-z]{2,3}|ру|рф)\W*$>i |
|
305
|
|
|
|
|
|
|
or return 0; |
|
306
|
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
0
|
1; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub _looks_like_time { |
|
311
|
1
|
|
|
1
|
|
2
|
my($seq_left, $seq_right) = @_; # need copies |
|
312
|
|
|
|
|
|
|
|
|
313
|
1
|
|
|
|
|
7
|
$seq_left =~ s/^[^0-9]{1,2}//; |
|
314
|
1
|
|
|
|
|
7
|
$seq_right =~ s/[^0-9]+$//; |
|
315
|
|
|
|
|
|
|
|
|
316
|
1
|
50
|
33
|
|
|
13
|
return 0 if $seq_left !~ /^[0-9]{1,2}$/ |
|
317
|
|
|
|
|
|
|
or $seq_right !~ /^[0-9]{2}$/; |
|
318
|
|
|
|
|
|
|
|
|
319
|
0
|
0
|
0
|
|
|
0
|
($seq_left < 24 and $seq_right < 60) |
|
320
|
|
|
|
|
|
|
? 1 |
|
321
|
|
|
|
|
|
|
: 0; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
sub _char_class { |
|
325
|
306
|
100
|
|
306
|
|
525
|
_is_cyr($_[0]) ? '0001' : |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
_is_space($_[0]) ? '0010' : |
|
327
|
|
|
|
|
|
|
_is_dot($_[0]) ? '0011' : |
|
328
|
|
|
|
|
|
|
_is_pmark($_[0]) ? '0100' : |
|
329
|
|
|
|
|
|
|
_is_hyphen($_[0]) ? '0101' : |
|
330
|
|
|
|
|
|
|
_is_digit($_[0]) ? '0110' : |
|
331
|
|
|
|
|
|
|
_is_latin($_[0]) ? '0111' : |
|
332
|
|
|
|
|
|
|
_is_bracket1($_[0]) ? '1000' : |
|
333
|
|
|
|
|
|
|
_is_bracket2($_[0]) ? '1001' : |
|
334
|
|
|
|
|
|
|
_is_single_quote($_[0]) ? '1010' : |
|
335
|
|
|
|
|
|
|
_is_slash($_[0]) ? '1011' : |
|
336
|
|
|
|
|
|
|
_is_colon($_[0]) ? '1100' : '0000'; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
1; |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
__END__ |