line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# $Id:$ |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
56600
|
use 5.010; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
76
|
|
6
|
2
|
|
|
2
|
|
11
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
147
|
|
7
|
2
|
|
|
2
|
|
8
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
85
|
|
8
|
2
|
|
|
2
|
|
10
|
use utf8; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
14
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
package Geo::Names::Russian; |
12
|
|
|
|
|
|
|
BEGIN { |
13
|
2
|
|
|
2
|
|
164
|
$Geo::Names::Russian::VERSION = '0.02'; |
14
|
|
|
|
|
|
|
} |
15
|
|
|
|
|
|
|
# ABSTRACT: parse and split russian geographical names |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Geo::Names::Russian - parse and split russian geographical names |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 VERSION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
version 0.02 |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Geo::Names::Russian qw{ :all }; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
for my $street ( @streetnames ) |
31
|
|
|
|
|
|
|
$count{ streetname_keystring( $street ) } ++; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=cut |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
2
|
|
|
2
|
|
10
|
use base qw{ Exporter }; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
367
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
our @EXPORT_OK = qw{ |
40
|
|
|
|
|
|
|
streetname_split |
41
|
|
|
|
|
|
|
streetname_keystring |
42
|
|
|
|
|
|
|
housenumber_keystring |
43
|
|
|
|
|
|
|
}; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
46
|
|
|
|
|
|
|
all => \@EXPORT_OK, |
47
|
|
|
|
|
|
|
); |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
2
|
|
|
2
|
|
1785
|
use List::MoreUtils qw{ any first_index }; |
|
2
|
|
|
|
|
2952
|
|
|
2
|
|
|
|
|
745
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
my @statuses = ( |
56
|
|
|
|
|
|
|
[ 'улица' => 'ул' ], |
57
|
|
|
|
|
|
|
[ 'переулок' => 'пер(?:еул)?|пу' ], |
58
|
|
|
|
|
|
|
[ 'проспект' => 'пр(?:-к?т|осп)' ], |
59
|
|
|
|
|
|
|
[ 'проезд' => 'пр(?:-з?д)?' ], |
60
|
|
|
|
|
|
|
[ 'площадь' => 'пл' ], |
61
|
|
|
|
|
|
|
[ 'шоссе' => 'ш' ], |
62
|
|
|
|
|
|
|
[ 'тупик' => 'туп' ], |
63
|
|
|
|
|
|
|
[ 'бульвар' => 'б(?:ул|ульв|-р)' ], |
64
|
|
|
|
|
|
|
[ 'набережная' => 'наб' ], |
65
|
|
|
|
|
|
|
[ 'аллея' => 'ал' ], |
66
|
|
|
|
|
|
|
[ 'мост' => 'м' ], |
67
|
|
|
|
|
|
|
[ 'тракт' => 'тр' ], |
68
|
|
|
|
|
|
|
[ 'просек' => 'прос' ], |
69
|
|
|
|
|
|
|
[ 'линия' => 'лин' ], |
70
|
|
|
|
|
|
|
[ 'дорога' => 'дор' ], |
71
|
|
|
|
|
|
|
[ 'квартал' => 'кв(?:арт|-?л)?' ], |
72
|
|
|
|
|
|
|
[ 'микрорайон' => 'мк?рн?' ], |
73
|
|
|
|
|
|
|
[ 'территория' => 'тер' ], |
74
|
|
|
|
|
|
|
[ 'посёлок' => 'пос(?:[её]лок)?|р?п'], |
75
|
|
|
|
|
|
|
[ 'городок' => 'гор' ], |
76
|
|
|
|
|
|
|
[ 'станция' => 'ст(?:анц)?' ], |
77
|
|
|
|
|
|
|
[ 'хутор' => 'х(?:ут)' ], |
78
|
|
|
|
|
|
|
[ 'разъезд' => 'р(?:аз)-д|рзд' ], |
79
|
|
|
|
|
|
|
[ 'казарма' => 'каз' ], |
80
|
|
|
|
|
|
|
[ 'парк' => 'парк' ], |
81
|
|
|
|
|
|
|
[ 'деревня' => 'дер' ], |
82
|
|
|
|
|
|
|
); |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
for my $rec ( @statuses ) { |
85
|
|
|
|
|
|
|
$rec->[1] = qr{ ^ (?: $rec->[0] | $rec->[1] ) \.? $ }ixms; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
my @addition_words = ( |
90
|
|
|
|
|
|
|
[ 'Ниж' => 'Н\.?|НИЖН(?:\.|ИЙ|ЕЕ|ЯЯ)' ], |
91
|
|
|
|
|
|
|
[ 'Сред' => 'СР\.?|СРЕДН(?:\.|ИЙ|ЕЕ|ЯЯ)'], |
92
|
|
|
|
|
|
|
[ 'Верх' => 'В\.?|ВЕРХН?(?:\.|ИЙ|ЕЕ|ЯЯ)'], |
93
|
|
|
|
|
|
|
[ 'Стар' => 'СТ\.?|СТАР(?:ЫЙ|АЯ|ОЕ|\.)' ], |
94
|
|
|
|
|
|
|
[ 'Нов' => 'НОВ(?:АЯ|ЫЙ|ОЕ|\.)' ], |
95
|
|
|
|
|
|
|
[ 'Мал' => 'М\.?|МАЛ(?:АЯ|ЫЙ|ОЕ|\.)' ], |
96
|
|
|
|
|
|
|
[ 'Бол' => 'Б\.?|БОЛЬШ(?:АЯ|ОЙ|ОЕ|\.)' ], |
97
|
|
|
|
|
|
|
); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
for my $rec ( @addition_words ) { |
100
|
|
|
|
|
|
|
$rec->[1] = qr{ ^ (?: $rec->[0] \.? | $rec->[1] ) $ }ixms; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
my @prof_words = qw{ |
105
|
|
|
|
|
|
|
академика профессора |
106
|
|
|
|
|
|
|
архитектора |
107
|
|
|
|
|
|
|
композитора |
108
|
|
|
|
|
|
|
летчика лётчика |
109
|
|
|
|
|
|
|
милиционера |
110
|
|
|
|
|
|
|
командира комиссара политрука |
111
|
|
|
|
|
|
|
снайпера танкиста |
112
|
|
|
|
|
|
|
сержанта лейтенанта капитана |
113
|
|
|
|
|
|
|
подполковника полковника |
114
|
|
|
|
|
|
|
адмирала генерала маршала |
115
|
|
|
|
|
|
|
}; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head1 FUNCTIONS |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 streetname_split |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Splits streetname into meaningful parts |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
my $street = '2-я Тверская-Ямская ул.'; |
124
|
|
|
|
|
|
|
my ( $status, $name, $addition, $number, $km ) = streetname_split( $street ); |
125
|
|
|
|
|
|
|
# ( 'улица', 'Тверская-Ямская', '', '2-я', '' ) |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub streetname_split { |
130
|
|
|
|
|
|
|
|
131
|
25
|
|
|
25
|
1
|
33235
|
my ( $name ) = @_; |
132
|
25
|
50
|
|
|
|
70
|
return unless $name; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
2
|
|
|
2
|
|
14
|
$name =~ s/ ( No? | № ) \s* (?=\d) /$1/gix; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
40
|
|
|
25
|
|
|
|
|
411
|
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# делим строку на слова |
138
|
25
|
|
|
|
|
303
|
my @words = grep {$_} split / \s+ | (?<=[.,]) \s* /x, $name; |
|
75
|
|
|
|
|
160
|
|
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# прицепляем инициалы к фамилии |
141
|
25
|
|
|
|
|
102
|
for my $i ( reverse 0 .. $#words-1 ) { |
142
|
50
|
100
|
|
|
|
225
|
next unless $words[$i] =~ / ^ \p{IsUpper} \.? $ /xms; |
143
|
7
|
100
|
66
|
|
|
72
|
next if $words[$i+1] && $words[$i+1] =~ / (?: [ыи]й | я ) $ /xms; # тут вряд ли инициалы |
144
|
5
|
50
|
|
|
|
27
|
$words[$i] .= q{.} unless $words[$i] =~ /\.$/xms; |
145
|
5
|
|
|
|
|
23
|
$words[$i] .= splice @words, $i+1, 1; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
# и переносим их в конец |
148
|
25
|
|
|
|
|
53
|
for my $word ( @words ) { |
149
|
70
|
|
|
|
|
200
|
$word =~ s/ ^ ( (?:\p{Alpha}\.)+ ) ( \p{Alpha}{3,} ) $ /$2 $1/xms; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# вытаскиваем километр |
153
|
25
|
|
|
|
|
46
|
my $km = q{}; |
154
|
25
|
|
|
|
|
103
|
my $km_re = qr{ km | км | километр }xi; |
155
|
25
|
|
|
|
|
90
|
for my $i ( 1 .. $#words ) { |
156
|
45
|
100
|
100
|
|
|
610
|
if ( @words > 2 && $words[$i-1] =~ / ^ \d+ (?: -? \S{1,3} )? $ /xms && $words[$i] =~ / ^ $km_re $ /xms ) { |
|
|
|
100
|
|
|
|
|
157
|
3
|
|
|
|
|
18
|
$km = join q{ }, splice @words, $i-1, 2; |
158
|
3
|
|
|
|
|
9
|
last; |
159
|
|
|
|
|
|
|
} |
160
|
42
|
100
|
|
|
|
537
|
if ( my @res = $words[$i] =~ / ^ (\d+) ($km_re) $ /xms ) { |
161
|
1
|
|
|
|
|
4
|
$km = join q{ }, @res; |
162
|
1
|
|
|
|
|
3
|
splice @words, $i, 1; |
163
|
1
|
|
|
|
|
3
|
last; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# вычленяем статус |
168
|
25
|
|
|
|
|
39
|
my $status; |
169
|
25
|
100
|
|
|
|
61
|
if ( @words > 1 ) { |
170
|
24
|
|
|
|
|
47
|
for my $rec ( @statuses ) { |
171
|
378
|
|
|
871
|
|
1415
|
my $i = first_index { $_ =~ $rec->[1] } @words; |
|
871
|
|
|
|
|
4385
|
|
172
|
378
|
100
|
|
|
|
1382
|
next if $i < 0; |
173
|
12
|
|
|
|
|
26
|
$status = $rec->[0]; |
174
|
12
|
|
|
|
|
30
|
splice @words, $i, 1; |
175
|
12
|
|
|
|
|
47
|
last; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# вытаскиваем правильно заданный номер |
180
|
25
|
|
|
|
|
47
|
my $number = q{}; |
181
|
25
|
|
|
|
|
73
|
for my $i ( 0 .. $#words ) { |
182
|
42
|
100
|
|
|
|
99
|
last if @words < 2; |
183
|
|
|
|
|
|
|
next unless |
184
|
37
|
100
|
100
|
|
|
597
|
( !$status || $status =~ /[аяь]$/ixms ) && $words[$i] =~ m{ ^ \d{1,2} -? а?я $ }ixms # женский род |
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
33
|
|
|
|
|
185
|
|
|
|
|
|
|
|| $status && $status =~ /[е]$/ixms && $words[$i] =~ m{ ^ \d{1,2} -? о?е $ }ixms # средний |
186
|
|
|
|
|
|
|
|| $status && $status =~ /[^еаяь]$/ixms && $words[$i] =~ m{ ^ \d{1,2} -? [ыио]?й $ }ixms; # мужской |
187
|
5
|
|
|
|
|
20
|
$number = lc $words[$i]; |
188
|
5
|
|
|
|
|
11
|
splice @words, $i, 1; |
189
|
5
|
|
|
|
|
8
|
last; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# пробуем вытащить кривой номер |
193
|
25
|
100
|
100
|
|
|
246
|
if ( @words > 1 && !$number |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
194
|
|
|
|
|
|
|
&& $words[0] =~ / ^ \d{1,2} $ /xms |
195
|
|
|
|
|
|
|
&& $words[-1] =~ / (?: [ая]я | [ыи]й ) $ /xms ) { |
196
|
2
|
|
|
|
|
7
|
$number = shift @words; |
197
|
|
|
|
|
|
|
} |
198
|
25
|
50
|
100
|
|
|
157
|
if ( @words > 1 && !$number |
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
199
|
|
|
|
|
|
|
&& $words[-1] =~ / ^ \d{1,2} $ /xms |
200
|
|
|
|
|
|
|
&& $words[-2] =~ / (?: [ая]я | [ыи]й ) $ /xms ) { |
201
|
0
|
|
|
|
|
0
|
$number = pop @words; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# вытаскиваем вспомогательные имена |
205
|
25
|
|
|
|
|
35
|
my @additions; |
206
|
25
|
|
|
|
|
57
|
for my $i ( reverse 0 .. $#words ) { |
207
|
44
|
100
|
|
|
|
150
|
last if @words <= 1; |
208
|
33
|
100
|
|
207
|
|
153
|
next unless any { $words[$i] =~ $_->[1] } @addition_words; |
|
207
|
|
|
|
|
1031
|
|
209
|
6
|
|
|
|
|
33
|
push @additions, splice @words, $i, 1; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# и профессию |
213
|
25
|
|
|
36
|
|
137
|
my $i = first_index { my $w = lc $_; any { $w eq $_ } @prof_words } @words; |
|
36
|
|
|
|
|
411
|
|
|
36
|
|
|
|
|
12626
|
|
|
716
|
|
|
|
|
709
|
|
214
|
25
|
100
|
100
|
|
|
240
|
if ( @words > 1 && $i >= 0 ) { |
215
|
2
|
|
|
|
|
10
|
unshift @additions, splice @words, $i, 1; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# статус по дефолту, если не нашли |
219
|
25
|
100
|
66
|
|
|
293
|
$status ||= $words[-1] =~ /[ыи]й $/ix ? 'переулок' : 'улица'; |
220
|
|
|
|
|
|
|
|
221
|
25
|
|
|
|
|
205
|
return( $status, join( q{ }, @words ), join( q{ }, @additions ), $number, $km ); |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head2 streetname_keystring |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Returns unified keystring for street |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
my $street = '2-й пр. Марьиной Рощи'; |
230
|
|
|
|
|
|
|
my $key = streetname_keystring( $street ); |
231
|
|
|
|
|
|
|
# 'МАРЬИНОЙ РОЩИ 2 ПРОЕЗД' |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub streetname_keystring { |
236
|
12
|
|
|
12
|
1
|
8032
|
my ($street, $suburb) = @_; |
237
|
|
|
|
|
|
|
|
238
|
12
|
|
50
|
|
|
63
|
( $suburb ||= q{} ) =~ s/ ^ (дер|г|пос) [\.\s]+ //ix; |
239
|
12
|
50
|
|
|
|
30
|
return uc $suburb unless $street; |
240
|
|
|
|
|
|
|
|
241
|
12
|
|
|
|
|
27
|
my ( $status, $name, $addition, $number, $km ) = streetname_split( $street ); |
242
|
|
|
|
|
|
|
|
243
|
12
|
|
100
|
|
|
70
|
( $number ||= q{} ) =~ s/ - \S+ //xms; |
244
|
12
|
|
|
|
|
527
|
$name =~ s/ (?<=\d) - \S+ //xms; |
245
|
12
|
|
|
|
|
32
|
$name =~ s/ ^ (?: No? | № ) (?=\d) //xms; |
246
|
12
|
|
|
|
|
100
|
$name =~ s/ (?<=\p{IsAlpha}) \s+ (?=\p{IsUpper}\.) /_/gxms; |
247
|
12
|
|
|
|
|
23
|
$name =~ s/ ^ им (?: ени )? \.? \s+ (?= \S ) //xms; |
248
|
|
|
|
|
|
|
|
249
|
12
|
100
|
|
|
|
41
|
if ( my ($n) = $km =~ / ^ (\d+) /xms ) { |
250
|
2
|
|
|
|
|
5
|
$km = "${n}км"; |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
12
|
100
|
100
|
|
|
50
|
if ( $addition ||= q{} ) { |
254
|
4
|
|
|
|
|
11
|
for my $rec ( @addition_words ) { |
255
|
16
|
100
|
|
|
|
110
|
next unless $addition =~ $rec->[1]; |
256
|
3
|
|
|
|
|
10
|
$addition = $rec->[0]; |
257
|
3
|
|
|
|
|
6
|
last; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
12
|
|
|
|
|
145
|
my $result = uc join( q{ }, $name, $addition, $status ); |
262
|
12
|
100
|
100
|
|
|
7252
|
$result .= uc " $number" if $number || $km || $suburb; |
|
|
|
66
|
|
|
|
|
263
|
12
|
100
|
66
|
|
|
52
|
$result .= uc " $km" if $km || $suburb; |
264
|
12
|
50
|
|
|
|
25
|
$result .= uc " $suburb" if $suburb; |
265
|
12
|
|
|
|
|
114
|
$result =~ s/Ё/Е/gi; |
266
|
12
|
|
|
|
|
46
|
return $result; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 housenumber_keystring |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Returns unified keystring for house |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
my $house = '1А к3 с5'; |
274
|
|
|
|
|
|
|
my $key = housenumber_keystring( $house ); |
275
|
|
|
|
|
|
|
# '1АК3С5' |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=cut |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub housenumber_keystring { |
280
|
|
|
|
|
|
|
|
281
|
0
|
|
|
0
|
1
|
|
my ( $name ) = @_; |
282
|
|
|
|
|
|
|
|
283
|
0
|
0
|
|
|
|
|
return q{} if $name eq q{}; |
284
|
|
|
|
|
|
|
|
285
|
0
|
|
|
|
|
|
my $key = uc $name; |
286
|
0
|
|
|
|
|
|
$key =~ s/[\. ]//g; |
287
|
0
|
|
|
|
|
|
$key =~ s/КОРП(?:УС)/К/gi; |
288
|
0
|
|
|
|
|
|
$key =~ s/СТР/С/gi; |
289
|
0
|
|
|
|
|
|
$key =~ s/k/К/gi; |
290
|
0
|
|
|
|
|
|
$key =~ s/c/С/gi; |
291
|
|
|
|
|
|
|
|
292
|
0
|
|
|
|
|
|
$key =~ s/^вл?\.?\s*(\d.*)/$1/i; |
293
|
|
|
|
|
|
|
|
294
|
0
|
|
|
|
|
|
return $key; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
1; |