line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::RU::Detect; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
21047
|
use vars qw ($VERSION); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
73
|
|
4
|
|
|
|
|
|
|
$VERSION = '1.1'; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
7
|
1
|
|
|
1
|
|
1125
|
use utf8; |
|
1
|
|
|
|
|
14
|
|
|
1
|
|
|
|
|
5
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
11
|
|
|
|
|
|
|
our @EXPORT_OK = qw(detect_enc); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my %patterns = ( |
14
|
|
|
|
|
|
|
'-|'. |
15
|
|
|
|
|
|
|
'UTF-8.KOI8-R,CP1251.KOI8-R,CP1251.UTF-8', '\b(?:[А-Я][а-х]+|[а-я]{3,})\b', |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
'UTF-8.CP1251,KOI8-R.UTF-8|'. |
18
|
|
|
|
|
|
|
'UTF-8.KOI8-R,CP1251.UTF-8', '\b(?:[а-я][А-Я]+|[А-Я]{3,})\b', |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
'UTF-8.ISO-8859-5,KOI8-R.UTF-8|'. |
21
|
|
|
|
|
|
|
'UTF-8.KOI8-R,ISO-8859-5.UTF-8|'. |
22
|
|
|
|
|
|
|
'UTF-8.CP1251,ISO-8859-5.UTF-8', '[а-я]+[А-Я]+[а-я]+|[А-Я]+[а-я]+[А-Я]+', |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
'UTF-8.ISO-8859-5,CP1251.UTF-8|'. |
25
|
|
|
|
|
|
|
'UTF-8.CP866,CP1251.UTF-8', '[а-яёїџђѓќў]+[√ЄЁЎЇ№]{1,2}[а-яџђѓќўёї]+\b', |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
'UTF-7.UTF-8', '(?:^|\s)\+B[B-F][0-9a-zA-Z]+', |
28
|
|
|
|
|
|
|
'UTF-7.CP866', '(?:^|\s)\+JWg[a-zA-Z]+', |
29
|
|
|
|
|
|
|
'UTF-8.ISO-8859-1,UTF-7.ISO-8859-1', '(?:^|\s)\+AN[a-zA-Z]+', |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
'UTF-8.CP1251,KOI8-R.CP866', 'є.¶|¶.¶', |
32
|
|
|
|
|
|
|
'UTF-8.CP1252,KOI8-R.CP866', 'º.º|¶.º', |
33
|
|
|
|
|
|
|
'UTF-8.CP1251,UTF-8.CP866', 'в•Ё|•›в•', |
34
|
|
|
|
|
|
|
'UTF-8.KOI8-R,UTF-8.CP866', 'Б∙╗|Б∙.Б∙.', |
35
|
|
|
|
|
|
|
'UTF-8.ISO-8859-1', 'к|пÐ', |
36
|
|
|
|
|
|
|
'UTF-8.CP1251,UTF-8.ISO-8859-1', 'ГђВ|°Г‘', |
37
|
|
|
|
|
|
|
'UTF-8.KOI8-R,UTF-8.ISO-8859-1', 'ц░б', |
38
|
|
|
|
|
|
|
'UTF-8.CP1251,UTF-8.UTF-16', 'н‚.н‚|н†.н†', |
39
|
|
|
|
|
|
|
'UTF-8.KOI8-R,UTF-8.UTF-16', '╬М|╪М|╣М|╫М', |
40
|
|
|
|
|
|
|
'UTF-8.CP1251,CP866.UTF-8', '®.+Ґ|«Ё|®¤®|«Ґ', |
41
|
|
|
|
|
|
|
'UTF-8.CP1251,UTF-8.KOI8-R', 'Рїв|СЏ', |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
'UTF-8.ISO-8859-1,ISO-8859-5.UTF-8', '[ýëåêòðèôèêàöèÿãóáåðíèé]+[ÝËÅÊÒÐÈÔÈÊÀÖÈŸÃÓÁÅÐÍÈÉ]+[ýëåêòðèôèêàöèÿãóáåðíèé]+|[ÝËÅÊÒÐÈÔÈÊÀÖÈŸÃÓÁÅÐÍÈÉ]+[ýëåêòðèôèêàöèÿãóáåðíèé]+[ÝËÅÊÒÐÈÔÈÊÀÖÈŸÃÓÁÅÐÍÈÉ]+', |
44
|
|
|
|
|
|
|
'UTF-8.CP1252,CP1251.UTF-8', '[ýëåêòðèôèêàöèÿãóáåðíèé]{3,}', |
45
|
|
|
|
|
|
|
'UTF-8.CP1252,KOI8-R.UTF-8', '[ÀÏÎËÜÇÎÂÀÍÈÅÑ]{2,}', |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
'UTF-8.CP866,KOI8-R.UTF-8', '[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]{4,}', |
48
|
|
|
|
|
|
|
'UTF-8.KOI8-R,CP866.UTF-8', '[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+[А-Я]+[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+[А-Я]+[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+', |
49
|
|
|
|
|
|
|
'UTF-8.CP866,ISO-8859-5.UTF-8', '[а-ор-я]+[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+[а-ор-я]+[╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+', |
50
|
|
|
|
|
|
|
'UTF-8.KOI8-R', '[пя][╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩][пя]|п.я', |
51
|
|
|
|
|
|
|
'UTF-8.CP866', '[▒▓╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+[А-Я][▒▓╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+[А-Я][▒▓╟╧╫╣║╔╙═╔╥╧╦┴╤╠┼╦╘╥╔╞╔╦┴├╔╤└╓╬┘╚ ╟╒┬┼╥╬╔╩]+', |
52
|
|
|
|
|
|
|
); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my %ambiguities = ( |
55
|
|
|
|
|
|
|
'-' => 'на|ст|ни|но|ан|ов|ко|то|ен|ле|ел|ра|ре|ис|по|ом|ро|ет|ва|та|ос|те|де|ль|ер|он|ть|за|ск|от|ли|ат|ол|об|ар|од|ие|го|пр|ри|мо|ам|сл|тр|не|ор|ла|ал|ит', |
56
|
|
|
|
|
|
|
'UTF-8.KOI8-R,CP1251.KOI8-R,CP1251.UTF-8' => 'лч|ъп|лу|лм|чл|ма|им|пм|ел|йе|ей|оч|ое|уъ|нм|мк|ом|еп|ач|пч|мъ|пе|де|йщ|ео|мл|пщ|цч|ъи|мп|йу|чп|мй|мю|чо|мд|уе|жм|но|оу|км|чк|ъй|по|ле|мо|йч|чй|уп', |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
'UTF-8.CP1251,KOI8-R.UTF-8' => 'ОБ|УФ|ОЙ|ОП|БО|ПЧ|ЛП|ФП|ЕО|МЕ|ЕМ|ТБ|ТЕ|ЙУ|РП|ПН|ТП|ЕФ|ЧБ|ФБ|ПУ|ФЕ|ДЕ|МШ|ЕТ|ПО|ФШ|ЪБ|УЛ|ПФ|МЙ|БФ|ПМ|ПВ|БТ|ПД|ЙЕ|ЗП|РТ|ТЙ|НП|БН|УМ|ФТ|ОЕ|ПТ|МБ|БМ|ЙФ', |
59
|
|
|
|
|
|
|
'UTF-8.KOI8-R,CP1251.UTF-8' => 'МЮ|ЯР|МХ|МН|ЮМ|НБ|ЙН|РН|ЕМ|КЕ|ЕК|ПЮ|ПЕ|ХЯ|ОН|НЛ|ПН|ЕР|БЮ|РЮ|НЯ|РЕ|ДЕ|КЭ|ЕП|НМ|РЭ|ГЮ|ЯЙ|НР|КХ|ЮР|НК|НА|ЮП|НД|ХЕ|ЦН|ОП|ПХ|ЛН|ЮЛ|ЯК|РП|МЕ|НП|КЮ|ЮК|ХР', |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
'UTF-8.ISO-8859-5,KOI8-R.UTF-8' => 'ЮС|гд|ЮЩ|ЮЯ|СЮ|Яз|ЫЯ|дЯ|ХЮ|ЬХ|ХЬ|вС|вХ|Щг|аЯ|ЯЭ|вЯ|Хд|зС|дС|Яг|дХ|ФХ|Ьи|Хв|ЯЮ|ди|кС|гЫ|Яд|ЬЩ|Сд|ЯЬ|ЯТ|Св|ЯФ|ЩХ|ЧЯ|ав|вЩ|ЭЯ|СЭ|гЬ|дв|ЮХ|Яв|ЬС|СЬ|Щд', |
62
|
|
|
|
|
|
|
'UTF-8.KOI8-R,ISO-8859-5.UTF-8' => 'щп|АБ|щь|щч|пщ|чр|зч|Бч|ущ|шу|уш|Юп|Юу|ьА|ъч|чэ|Юч|уБ|рп|Бп|чА|Бу|ту|шЛ|уЮ|чщ|БЛ|вп|Аз|чБ|шь|пБ|чш|чя|пЮ|чт|ьу|сч|ъЮ|Юь|эч|пэ|Аш|БЮ|щу|чЮ|шп|пш|ьБ', |
63
|
|
|
|
|
|
|
'UTF-8.CP1251,ISO-8859-5.UTF-8' => 'ЭР|бв|ЭШ|ЭЮ|РЭ|ЮТ|ЪЮ|вЮ|ХЭ|ЫХ|ХЫ|аР|аХ|Шб|ЯЮ|ЮЬ|аЮ|Хв|ТР|вР|Юб|вХ|ФХ|Ым|Ха|ЮЭ|вм|ЧР|бЪ|Юв|ЫШ|Рв|ЮЫ|ЮС|Ра|ЮФ|ШХ|УЮ|Яа|аШ|ЬЮ|РЬ|бЫ|ва|ЭХ|Юа|ЫР|РЫ|Шв', |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
'UTF-8.ISO-8859-5,CP1251.UTF-8' => 'эр|ёђ|эш|эю|рэ|ют|ъю|ђю|хэ|ых|хы|№р|№х|шё|яю|юь|№ю|хђ|тр|ђр|юё|ђх|фх|ыќ|х№|юэ|ђќ|чр|ёъ|юђ|ыш|рђ|юы|юс|р№|юф|шх|ую|я№|№ш|ью|рь|ёы|ђ№|эх|ю№|ыр|ры|шђ', |
66
|
|
|
|
|
|
|
'UTF-8.CP866,CP1251.UTF-8' => 'эр|ёЄ|эш|эю|рэ|ют|ъю|Єю|хэ|ых|хы|Ёр|Ёх|шё|яю|юь|Ёю|хЄ|тр|Єр|юё|Єх|фх|ы№|хЁ|юэ|Є№|чр|ёъ|юЄ|ыш|рЄ|юы|юс|рЁ|юф|шх|ую|яЁ|Ёш|ью|рь|ёы|ЄЁ|эх|юЁ|ыр|ры|шЄ', |
67
|
|
|
|
|
|
|
); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub detect_enc { |
70
|
0
|
|
|
0
|
0
|
|
my $string = shift; |
71
|
|
|
|
|
|
|
|
72
|
0
|
|
|
|
|
|
my %variants = (); |
73
|
0
|
|
|
|
|
|
for my $path (sort keys %patterns) { |
74
|
0
|
|
|
|
|
|
$variants{$path} = () = $string =~ /$patterns{$path}/g; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
0
|
0
|
|
|
|
|
my $path = scalar keys %variants ? (sort {$variants{$a} <=> $variants{$b}} keys %variants)[-1] : ''; |
|
0
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
|
79
|
0
|
0
|
|
|
|
|
$path = remove_ambiguity($path, $string) if $path =~ m{\|}; |
80
|
|
|
|
|
|
|
|
81
|
0
|
|
|
|
|
|
return make_list($path); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub remove_ambiguity { |
85
|
0
|
|
|
0
|
0
|
|
my $paths = shift; |
86
|
0
|
|
|
|
|
|
my $text = shift; |
87
|
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
|
my @paths = split m{\|}, $paths; |
89
|
0
|
|
|
|
|
|
my %stats = (); |
90
|
0
|
|
|
|
|
|
for my $path (@paths) { |
91
|
0
|
|
|
|
|
|
$stats{$path} = () = $text =~ /$ambiguities{$path}/g; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
0
|
0
|
|
|
|
|
return scalar keys %stats ? (sort {$stats{$a} <=> $stats{$b}} keys %stats)[-1] : $paths[0]; |
|
0
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub make_list { |
98
|
0
|
|
|
0
|
0
|
|
my $path = shift; |
99
|
|
|
|
|
|
|
|
100
|
0
|
|
|
|
|
|
my @ret; |
101
|
|
|
|
|
|
|
|
102
|
0
|
|
|
|
|
|
for my $pair (split /,/, $path) { |
103
|
0
|
|
|
|
|
|
my ($from, $to) = split /\./, $pair; |
104
|
0
|
0
|
|
|
|
|
push @ret, [$from, $to] unless $from eq '-'; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
0
|
|
|
|
|
|
return @ret; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
1; |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
__END__ |