line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Guess; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
15907
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
4
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
29
|
|
5
|
|
|
|
|
|
|
require 5.008; |
6
|
1
|
|
|
1
|
|
4
|
use Carp; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
70
|
|
7
|
1
|
|
|
1
|
|
383
|
use File::Spec::Functions 'catfile'; |
|
1
|
|
|
|
|
554
|
|
|
1
|
|
|
|
|
50
|
|
8
|
1
|
|
|
1
|
|
485
|
use Unicode::Normalize qw/NFC/; |
|
1
|
|
|
|
|
1576
|
|
|
1
|
|
|
|
|
86
|
|
9
|
1
|
|
|
1
|
|
807
|
use Unicode::UCD 'charinfo'; |
|
1
|
|
|
|
|
42032
|
|
|
1
|
|
|
|
|
1851
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our $MAX = 300; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon indonesian |
16
|
|
|
|
|
|
|
hawaiian welsh latin swahili/; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene Turkish Hungarian |
19
|
|
|
|
|
|
|
Azeri Lithuanian Estonian/; |
20
|
|
|
|
|
|
|
our @ACCENTED_LATIN = (qw/Albanian Spanish French German Dutch Italian Danish |
21
|
|
|
|
|
|
|
Icelandic Norwegian Swedish Finnish Latvian Portuguese |
22
|
|
|
|
|
|
|
/, @EXOTIC_LATIN); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek Mongolian |
27
|
|
|
|
|
|
|
Serbian Macedonian Bulgarian Kyrgyz/; |
28
|
|
|
|
|
|
|
our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/; |
29
|
|
|
|
|
|
|
our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi Nepali |
30
|
|
|
|
|
|
|
Sanskrit/; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati Oriya |
33
|
|
|
|
|
|
|
Tamil Telugu Kannada Malayalam Sinhala Thai Lao Tibetan |
34
|
|
|
|
|
|
|
Burmese Georgian Mongolian/; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub new |
37
|
|
|
|
|
|
|
{ |
38
|
1
|
|
|
1
|
1
|
14
|
my ($class, %params) = @_; |
39
|
1
|
50
|
|
|
|
3
|
if (! $params{modeldir}) { |
40
|
1
|
|
|
|
|
3
|
my $md = __FILE__; |
41
|
1
|
|
|
|
|
8
|
$md =~ s!\.pm$!/train!; |
42
|
1
|
|
|
|
|
3
|
$params{modeldir} = $md; |
43
|
|
|
|
|
|
|
} |
44
|
1
|
50
|
|
|
|
4
|
unless (exists $params{modeldir}) { |
45
|
0
|
|
|
|
|
0
|
croak "Must provide a model directory"; |
46
|
|
|
|
|
|
|
} |
47
|
1
|
50
|
|
|
|
34
|
unless (-d $params{modeldir}) { |
48
|
0
|
|
|
|
|
0
|
croak "Model directory '$params{modeldir}' does not exist"; |
49
|
|
|
|
|
|
|
} |
50
|
1
|
|
|
|
|
5
|
my $self = bless { %params }, $class; |
51
|
1
|
|
|
|
|
3
|
return $self; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub guess |
56
|
|
|
|
|
|
|
{ |
57
|
28
|
|
|
28
|
1
|
29
|
my ($self, $string) = @_; |
58
|
28
|
100
|
|
|
|
67
|
unless (defined $self->{models}) { |
59
|
1
|
|
|
|
|
4
|
$self->load_models(); |
60
|
|
|
|
|
|
|
} |
61
|
28
|
|
|
|
|
53
|
my @runs = find_runs($string); |
62
|
28
|
|
|
|
|
29
|
my @langs; |
63
|
|
|
|
|
|
|
my %scripts; |
64
|
28
|
|
|
|
|
31
|
for my $run (@runs) { |
65
|
54
|
|
|
|
|
81
|
$scripts{$run->[1]}++; |
66
|
|
|
|
|
|
|
} |
67
|
28
|
|
|
|
|
85
|
return $self->identify ($string, %scripts); |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub simple_guess |
71
|
|
|
|
|
|
|
{ |
72
|
28
|
|
|
28
|
1
|
384
|
my ($self, $string) = @_; |
73
|
28
|
|
|
|
|
64
|
my $got = $self->guess ($string); |
74
|
28
|
|
|
|
|
176
|
return $got->[0]{name}; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub load_models |
78
|
|
|
|
|
|
|
{ |
79
|
1
|
|
|
1
|
0
|
1
|
my ($self) = @_; |
80
|
|
|
|
|
|
|
|
81
|
1
|
50
|
|
|
|
42
|
opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!"; |
82
|
1
|
|
|
|
|
2
|
my %models; |
83
|
1
|
|
|
|
|
28
|
while (my $f = readdir $dh) { |
84
|
57
|
100
|
|
|
|
173
|
unless ($f =~ /\.train$/) { |
85
|
2
|
|
|
|
|
5
|
next; |
86
|
|
|
|
|
|
|
} |
87
|
55
|
|
|
|
|
130
|
my ($name) = $f =~ m|(.*)\.|; |
88
|
55
|
|
|
|
|
252
|
my $path = catfile ($self->{modeldir}, $f); |
89
|
1
|
50
|
|
1
|
|
6
|
open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!"; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
5
|
|
|
55
|
|
|
|
|
1519
|
|
90
|
55
|
|
|
|
|
9584
|
my %model; |
91
|
55
|
|
|
|
|
761
|
while (my $line = <$fh>) { |
92
|
16500
|
|
|
|
|
14831
|
chomp $line; |
93
|
16500
|
|
|
|
|
27251
|
my ($k, $v) = $line =~ m|(.{3})\s+(.*)|; |
94
|
16500
|
50
|
|
|
|
18677
|
unless (defined $k) { |
95
|
0
|
|
|
|
|
0
|
next; |
96
|
|
|
|
|
|
|
} |
97
|
16500
|
|
|
|
|
36258
|
$model{$k} = $v; |
98
|
|
|
|
|
|
|
} |
99
|
55
|
|
|
|
|
673
|
$models{$name} = \%model; |
100
|
|
|
|
|
|
|
} |
101
|
1
|
|
|
|
|
20
|
$self->{models} = \%models; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub find_runs |
105
|
|
|
|
|
|
|
{ |
106
|
28
|
|
|
28
|
0
|
918
|
my ($raw) = @_; |
107
|
28
|
|
|
|
|
404
|
my @chars = split m//, $raw; |
108
|
28
|
|
|
|
|
32
|
my $prev = ''; |
109
|
28
|
|
|
|
|
25
|
my @c; |
110
|
|
|
|
|
|
|
my @runs; |
111
|
0
|
|
|
|
|
0
|
my @run_types; |
112
|
28
|
|
|
|
|
40
|
my $current_run = -1; |
113
|
|
|
|
|
|
|
|
114
|
28
|
|
|
|
|
39
|
for my $c (@chars) { |
115
|
2852
|
|
|
|
|
3833
|
my $is_alph = $c =~ /[[:alpha:]]/o; |
116
|
2852
|
|
|
|
|
2506
|
my $inf = get_charinfo ($c); |
117
|
2852
|
100
|
100
|
|
|
7149
|
if ($is_alph and ! ($inf->{block} eq $prev)) { |
118
|
227
|
|
|
|
|
183
|
$prev = $inf->{block}; |
119
|
227
|
|
|
|
|
237
|
@c = (); |
120
|
227
|
|
|
|
|
144
|
$current_run++; |
121
|
227
|
|
|
|
|
180
|
$run_types[$current_run] = $prev; |
122
|
|
|
|
|
|
|
} |
123
|
2852
|
|
|
|
|
2241
|
push @c, $c; |
124
|
2852
|
100
|
|
|
|
3035
|
if ($current_run > -1) { |
125
|
2848
|
|
|
|
|
1612
|
push @{ $runs[$current_run] }, $c; |
|
2848
|
|
|
|
|
3472
|
|
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
28
|
|
|
|
|
57
|
my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types); |
130
|
28
|
|
|
|
|
35
|
my $counter = 0; |
131
|
28
|
|
|
|
|
25
|
my @result; |
132
|
28
|
|
|
|
|
31
|
for my $r (@$newruns) { |
133
|
54
|
|
|
|
|
72
|
push @result, [ $r, $newtypes->[$counter]]; |
134
|
54
|
|
|
|
|
49
|
$counter++; |
135
|
|
|
|
|
|
|
} |
136
|
28
|
|
|
|
|
296
|
return @result; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Cached lookups from charinfo |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
my %cache; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Look up characters using charinfo, but with a cache to save repeated |
144
|
|
|
|
|
|
|
# lookups. |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub get_charinfo |
147
|
|
|
|
|
|
|
{ |
148
|
2852
|
|
|
2852
|
0
|
2139
|
my ($char) = @_; |
149
|
2852
|
|
|
|
|
2240
|
my $known = $cache{$char}; |
150
|
2852
|
100
|
|
|
|
3077
|
if ($known) { |
151
|
2657
|
|
|
|
|
2240
|
return $known; |
152
|
|
|
|
|
|
|
} |
153
|
195
|
|
|
|
|
411
|
my $inf = charinfo (ord ($char)); |
154
|
195
|
|
|
|
|
225095
|
$cache{$char} = $inf; |
155
|
195
|
|
|
|
|
230
|
return $inf; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub reconcile_latin |
159
|
|
|
|
|
|
|
{ |
160
|
28
|
|
|
28
|
0
|
31
|
my ($runs, $types) = @_; |
161
|
28
|
|
|
|
|
70
|
my @types = @$types; |
162
|
28
|
|
|
|
|
18
|
my (@new_runs, @new_types); |
163
|
28
|
|
|
|
|
25
|
my $last_type = ''; |
164
|
|
|
|
|
|
|
|
165
|
28
|
|
|
|
|
28
|
my $upgrade; |
166
|
28
|
100
|
|
|
|
52
|
if (has_supplemental_latin (@$types)) { |
167
|
10
|
|
|
|
|
13
|
$upgrade = 'Accented Latin'; |
168
|
|
|
|
|
|
|
} |
169
|
28
|
100
|
|
|
|
45
|
if (has_extended_latin (@$types)) { |
170
|
7
|
|
|
|
|
8
|
$upgrade = 'Exotic Latin' ; |
171
|
|
|
|
|
|
|
} |
172
|
28
|
50
|
|
|
|
52
|
if (has_latin_extended_additional (@$types)) { |
173
|
0
|
|
|
|
|
0
|
$upgrade = 'Superfreak Latin'; |
174
|
|
|
|
|
|
|
} |
175
|
28
|
100
|
|
|
|
51
|
unless ($upgrade) { |
176
|
16
|
|
|
|
|
31
|
return ($runs, $types); |
177
|
|
|
|
|
|
|
} |
178
|
12
|
|
|
|
|
16
|
my $run_count = -1; |
179
|
12
|
|
|
|
|
15
|
for my $r (@$runs) { |
180
|
211
|
|
|
|
|
134
|
my $type = shift @types; |
181
|
211
|
100
|
|
|
|
294
|
if ($type =~ /Latin/) { |
182
|
198
|
|
|
|
|
116
|
$type = $upgrade; |
183
|
|
|
|
|
|
|
} |
184
|
211
|
100
|
|
|
|
230
|
unless ($type eq $last_type) { |
185
|
38
|
|
|
|
|
23
|
$run_count++; |
186
|
|
|
|
|
|
|
} |
187
|
211
|
|
|
|
|
116
|
push @{$new_runs[$run_count]}, @$r; |
|
211
|
|
|
|
|
344
|
|
188
|
211
|
|
|
|
|
161
|
$new_types[$run_count] = $type; |
189
|
211
|
|
|
|
|
164
|
$last_type = $type; |
190
|
|
|
|
|
|
|
} |
191
|
12
|
|
|
|
|
28
|
return (\@new_runs, \@new_types); |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub has_extended_latin |
196
|
|
|
|
|
|
|
{ |
197
|
28
|
|
|
28
|
0
|
49
|
my (@types) = @_; |
198
|
28
|
|
|
|
|
29
|
return scalar grep { /Latin Extended-A/ } @types; |
|
227
|
|
|
|
|
213
|
|
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
sub has_supplemental_latin |
202
|
|
|
|
|
|
|
{ |
203
|
28
|
|
|
28
|
0
|
53
|
my (@types) = @_; |
204
|
28
|
|
|
|
|
39
|
return scalar grep { /Latin-1 Supplement/ } @types; |
|
227
|
|
|
|
|
247
|
|
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub has_latin_extended_additional |
208
|
|
|
|
|
|
|
{ |
209
|
28
|
|
|
28
|
0
|
46
|
my (@types) = @_; |
210
|
28
|
|
|
|
|
36
|
return scalar grep { /Latin Extended Additional/ } @types; |
|
227
|
|
|
|
|
285
|
|
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
sub identify |
215
|
|
|
|
|
|
|
{ |
216
|
28
|
|
|
28
|
0
|
50
|
my ($self, $sample, %scripts) = @_; |
217
|
|
|
|
|
|
|
# Check for Korean |
218
|
28
|
50
|
33
|
|
|
157
|
if (exists $scripts{'Hangul Syllables'} || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
219
|
|
|
|
|
|
|
exists $scripts{'Hangul Jamo'} || |
220
|
|
|
|
|
|
|
exists $scripts{'Hangul Compatibility Jamo'} || |
221
|
|
|
|
|
|
|
exists $scripts{'Hangul'}) { |
222
|
0
|
|
|
|
|
0
|
return [{ name =>'korean', score => 1 }]; |
223
|
|
|
|
|
|
|
} |
224
|
28
|
100
|
|
|
|
42
|
if (exists $scripts{'Greek and Coptic'}) { |
225
|
|
|
|
|
|
|
|
226
|
1
|
|
|
|
|
13
|
return [{ name =>'greek', score => 1 }]; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
27
|
50
|
33
|
|
|
97
|
if (exists $scripts{'Katakana'} || |
|
|
|
33
|
|
|
|
|
230
|
|
|
|
|
|
|
exists $scripts{'Hiragana'} || |
231
|
|
|
|
|
|
|
exists $scripts{'Katakana Phonetic Extensions'}) { |
232
|
0
|
|
|
|
|
0
|
return [{ name =>'japanese', score => 1 }]; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
27
|
50
|
66
|
|
|
151
|
if (exists $scripts{'CJK Unified Ideographs'} || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
237
|
|
|
|
|
|
|
exists $scripts{'Bopomofo'} || |
238
|
|
|
|
|
|
|
exists $scripts{'Bopomofo Extended'} || |
239
|
|
|
|
|
|
|
exists $scripts{'KangXi Radicals'} || |
240
|
|
|
|
|
|
|
exists $scripts{'Arabic Presentation Forms-A'}) { |
241
|
1
|
|
|
|
|
7
|
return [{ name => 'chinese', score => 1 }]; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
26
|
100
|
|
|
|
51
|
if (exists $scripts{'Cyrillic'}) { |
245
|
7
|
|
|
|
|
19
|
return $self->check ($sample, @CYRILLIC); |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
19
|
50
|
66
|
|
|
63
|
if (exists $scripts{'Arabic'} || |
|
|
|
33
|
|
|
|
|
250
|
|
|
|
|
|
|
exists $scripts{'Arabic Presentation Forms-A'} || |
251
|
|
|
|
|
|
|
exists $scripts{'Arabic Presentation Forms-B'}) { |
252
|
1
|
|
|
|
|
2
|
return $self->check ($sample, @ARABIC); |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
18
|
50
|
|
|
|
33
|
if (exists $scripts{'Devanagari'}) { |
256
|
0
|
|
|
|
|
0
|
return $self->check ($sample, @DEVANAGARI); |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Try languages with unique scripts |
261
|
|
|
|
|
|
|
|
262
|
18
|
|
|
|
|
27
|
for my $s (@SINGLETONS) { |
263
|
307
|
100
|
|
|
|
371
|
if (exists $scripts{$s}) { |
264
|
1
|
|
|
|
|
7
|
return [{ name => lc ($s), score => 1 }]; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
17
|
50
|
|
|
|
28
|
if (exists $scripts{'Superfreak Latin'}) { |
269
|
0
|
|
|
|
|
0
|
return [{ name => 'vietnamese', score => 1 }]; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
17
|
100
|
|
|
|
24
|
if (exists $scripts{'Exotic Latin'}) { |
273
|
7
|
|
|
|
|
19
|
return $self->check ($sample, @EXOTIC_LATIN); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
10
|
100
|
|
|
|
16
|
if (exists $scripts{'Accented Latin'}) { |
277
|
5
|
|
|
|
|
12
|
return $self->check ($sample, @ACCENTED_LATIN); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
|
281
|
5
|
50
|
|
|
|
11
|
if (exists $scripts{'Basic Latin'}) { |
282
|
5
|
|
|
|
|
28
|
return $self->check ($sample, @ALL_LATIN); |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
0
|
|
|
|
|
0
|
return [{ name => "unknown script: '". (join ", ", keys %scripts)."'", score => 1}]; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
sub check |
291
|
|
|
|
|
|
|
{ |
292
|
25
|
|
|
25
|
0
|
89
|
my ($self, $sample, @langs) = @_; |
293
|
25
|
|
|
|
|
50
|
my $mod = __make_model ($sample); |
294
|
25
|
|
|
|
|
40
|
my $num_tri = scalar keys %$mod; |
295
|
25
|
|
|
|
|
18
|
my %scores; |
296
|
25
|
|
|
|
|
43
|
for my $key (@langs) { |
297
|
504
|
|
|
|
|
447
|
my $l = lc ($key); |
298
|
504
|
100
|
|
|
|
798
|
unless (exists $self->{models}{$l}) { |
299
|
10
|
|
|
|
|
13
|
next; |
300
|
|
|
|
|
|
|
} |
301
|
494
|
|
|
|
|
545
|
my $score = __distance ($mod, $self->{models}{$l}); |
302
|
494
|
|
|
|
|
626
|
$scores{$l} = $score; |
303
|
|
|
|
|
|
|
} |
304
|
25
|
|
|
|
|
115
|
my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores; |
|
1429
|
|
|
|
|
974
|
|
305
|
25
|
|
|
|
|
35
|
my @out; |
306
|
25
|
|
50
|
|
|
44
|
$num_tri ||=1; |
307
|
25
|
|
|
|
|
23
|
for my $s (@sorted) { |
308
|
439
|
|
|
|
|
341
|
my $norm = $scores{$s}/$num_tri; |
309
|
439
|
|
|
|
|
584
|
push @out, { name => $s , score => int ($norm) }; |
310
|
|
|
|
|
|
|
} |
311
|
25
|
|
|
|
|
600
|
return [splice (@out, 0, 4)]; |
312
|
|
|
|
|
|
|
|
313
|
0
|
0
|
|
|
|
0
|
if (@sorted) { |
314
|
0
|
|
|
|
|
0
|
return splice (@sorted, 0, 4); |
315
|
0
|
|
|
|
|
0
|
my @all; |
316
|
0
|
|
|
|
|
0
|
my $firstscore = $scores{$sorted[0]}; |
317
|
0
|
|
|
|
|
0
|
while (my $next = shift @sorted) { |
318
|
0
|
0
|
|
|
|
0
|
unless ($scores{$next} == $firstscore) { |
319
|
0
|
|
|
|
|
0
|
last; |
320
|
|
|
|
|
|
|
} |
321
|
0
|
|
|
|
|
0
|
push @all, $next; |
322
|
|
|
|
|
|
|
} |
323
|
0
|
|
|
|
|
0
|
return join ', ', @all; |
324
|
|
|
|
|
|
|
} |
325
|
0
|
|
|
|
|
0
|
return { name => 'unknown'. (join ' ', @langs), score => 1 }; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
sub __distance |
330
|
|
|
|
|
|
|
{ |
331
|
494
|
|
|
494
|
|
380
|
my ($m1, $m2) = @_; |
332
|
494
|
|
|
|
|
307
|
my $dist =0; |
333
|
494
|
|
|
|
|
3669
|
for my $k (keys %$m1) { |
334
|
47837
|
100
|
|
|
|
51699
|
$dist += (exists $m2->{$k} ? abs($m2->{$k} - $m1->{$k}) : $MAX); |
335
|
|
|
|
|
|
|
} |
336
|
494
|
|
|
|
|
1835
|
return $dist; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
sub __make_model |
341
|
|
|
|
|
|
|
{ |
342
|
25
|
|
|
25
|
|
27
|
my ($content) = @_; |
343
|
25
|
|
|
|
|
17
|
my %trigrams; |
344
|
25
|
|
|
|
|
450
|
$content = NFC ($content); # normal form C |
345
|
25
|
|
|
|
|
408
|
$content =~ s/[^[:alpha:]']/ /g; |
346
|
25
|
|
|
|
|
93
|
for (my $i = 0; $i < length ($content) - 2; $i++) { |
347
|
2664
|
|
|
|
|
3000
|
my $tri = lc (substr ($content, $i, 3)); |
348
|
2664
|
|
|
|
|
7771
|
$trigrams{$tri}++; |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
my @sorted = sort { $trigrams{$b} == $trigrams{$a} ? |
352
|
|
|
|
|
|
|
$a cmp $b : |
353
|
13102
|
100
|
|
|
|
13397
|
$trigrams{$b} <=> $trigrams{$a} } |
354
|
25
|
|
|
|
|
299
|
grep { !/\s\s/o } keys %trigrams; |
|
2445
|
|
|
|
|
2299
|
|
355
|
25
|
|
|
|
|
324
|
my @trimmed = splice (@sorted, 0, 300); |
356
|
25
|
|
|
|
|
89
|
my $counter = 0; |
357
|
25
|
|
|
|
|
14
|
my %res; |
358
|
25
|
|
|
|
|
31
|
for my $t (@trimmed) { |
359
|
2397
|
|
|
|
|
1995
|
$res{$t} = $counter++; |
360
|
|
|
|
|
|
|
} |
361
|
25
|
|
|
|
|
277
|
return \%res; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
1; |