|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::RU::OpenCorpora::Tokenizer::List;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
36177
 | 
 use utf8;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
    | 
| 
4
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
91
 | 
 use strict;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
95
 | 
    | 
| 
5
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
15
 | 
 use warnings;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
158
 | 
    | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = 0.06;  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
4930
 | 
 use IO::File;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57365
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
600
 | 
    | 
| 
10
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
33
 | 
 use File::Spec;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
153
 | 
    | 
| 
11
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
21
 | 
 use Carp qw(croak);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
165
 | 
    | 
| 
12
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
29548
 | 
 use Encode qw(decode);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
94180
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
526
 | 
    | 
| 
13
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
6531
 | 
 use IO::Uncompress::Gunzip;  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
162860
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
379
 | 
    | 
| 
14
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
11737
 | 
 use File::ShareDir qw(dist_dir);  | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21097
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2144
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
0
 | 
 sub data_version { 0.05 }  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
19
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
  
1
  
 | 
29323
 | 
     my($class, $name, $args) = @_;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
39
 | 
     croak "List name unspecified" unless defined $name;  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
8
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
42
 | 
     $args             ||= {};  | 
| 
24
 | 
8
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
62
 | 
     $args->{data_dir} ||= dist_dir('Lingua-RU-OpenCorpora-Tokenizer');  | 
| 
25
 | 
8
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
1011
 | 
     $args->{root_url} ||= 'http://opencorpora.org/files/export/tokenizer_data';  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
     my $self = bless {  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         %$args,  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         name => $name,  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }, $class;  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
32
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
     $self->_load;  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
34
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
719
 | 
     $self;  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
37
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
  
1
  
 | 
8220
 | 
 sub in_list { exists $_[0]->{data}{lc $_[1]} }  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
9
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
    | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _load {  | 
| 
40
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
 
 | 
16
 | 
     my $self = shift;  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
42
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
     my $fn = $self->_path;  | 
| 
43
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
89
 | 
     my $fh = IO::Uncompress::Gunzip->new($fn) or die "$fn: $IO::Uncompress::Gunzip::GunzipError";  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19167
 | 
     chomp($self->{version} = $fh->getline);  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
589
 | 
     my @data = map decode('utf-8', lc), $fh->getlines;  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # workaround for The Unicode Bug  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # see https://metacpan.org/module/perlunicode#The-Unicode-Bug  | 
| 
51
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4934224
 | 
     utf8::upgrade($_) for @data;  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
53
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106
 | 
     $self->_parse_list(\@data);  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
55
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
130
 | 
     $fh->close;  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5233
 | 
     return;  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _update {  | 
| 
61
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
     my($self, $new_data) = @_;  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
63
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $fn = $self->_path;  | 
| 
64
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $fh = IO::File->new($fn, '>') or croak "$fn: $!";  | 
| 
65
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $fh->binmode;  | 
| 
66
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $fh->print($new_data);  | 
| 
67
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $fh->close;  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
69
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $self->_load;  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _parse_list {  | 
| 
74
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
17
 | 
     my($self, $list) = @_;  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8650
 | 
     chomp @$list;  | 
| 
77
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
271
 | 
     $self->{data} = +{ map {$_,undef} @$list };  | 
| 
 
 | 
62676
 | 
 
 | 
 
 | 
 
 | 
 
 | 
131764
 | 
    | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
79
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4535
 | 
     return;  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _path {  | 
| 
83
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
 
 | 
13
 | 
     my $self = shift;  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
138
 | 
     File::Spec->catfile($self->{data_dir}, "$self->{name}.gz");  | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _url {  | 
| 
89
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
     my($self, $mode) = @_;  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
0
 | 
     $mode ||= 'file';  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
93
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $url = join '/', $self->{root_url}, $self->data_version, $self->{name};  | 
| 
94
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     if($mode eq 'file') {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
95
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $url .= '.gz';  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif($mode eq 'version') {  | 
| 
98
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $url .= '.latest';  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
101
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $url;  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |