line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Guess; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
142860
|
use strict; |
|
2
|
|
|
|
|
12
|
|
|
2
|
|
|
|
|
61
|
|
4
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
72
|
|
5
|
|
|
|
|
|
|
require 5.008; |
6
|
2
|
|
|
2
|
|
11
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
144
|
|
7
|
2
|
|
|
2
|
|
1016
|
use File::Spec::Functions 'catfile'; |
|
2
|
|
|
|
|
1801
|
|
|
2
|
|
|
|
|
159
|
|
8
|
2
|
|
|
2
|
|
1208
|
use Unicode::Normalize qw/NFC/; |
|
2
|
|
|
|
|
4481
|
|
|
2
|
|
|
|
|
156
|
|
9
|
2
|
|
|
2
|
|
3210
|
use Unicode::UCD 'charinfo'; |
|
2
|
|
|
|
|
102593
|
|
|
2
|
|
|
|
|
166
|
|
10
|
2
|
|
|
2
|
|
945
|
use JSON::Parse 'read_json'; |
|
2
|
|
|
|
|
2524
|
|
|
2
|
|
|
|
|
5144
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.03'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Maximum distance, used by __distance. |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $MAX = 300; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon |
19
|
|
|
|
|
|
|
indonesian hawaiian welsh latin swahili/; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene |
22
|
|
|
|
|
|
|
Turkish Hungarian Azeri Lithuanian Estonian/; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @ACCENTED_LATIN = (qw/Albanian Spanish French German Dutch Italian |
25
|
|
|
|
|
|
|
Danish Icelandic Norwegian Swedish Finnish |
26
|
|
|
|
|
|
|
Latvian Portuguese /, @EXOTIC_LATIN); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek |
31
|
|
|
|
|
|
|
Mongolian Serbian Macedonian Bulgarian Kyrgyz/; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi |
36
|
|
|
|
|
|
|
Nepali Sanskrit/; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati |
39
|
|
|
|
|
|
|
Oriya Tamil Telugu Kannada Malayalam Sinhala |
40
|
|
|
|
|
|
|
Thai Lao Tibetan Burmese Georgian Mongolian/; |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
my $dir = __FILE__; |
43
|
|
|
|
|
|
|
$dir =~ s!\.pm$!!; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
my $lang2codes = read_json ("$dir/lang.json"); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub make_ret |
48
|
|
|
|
|
|
|
{ |
49
|
477
|
|
|
477
|
0
|
757
|
my ($lang, $score) = @_; |
50
|
477
|
|
|
|
|
582
|
my %ret; |
51
|
477
|
|
|
|
|
743
|
$lang = lc $lang; |
52
|
477
|
|
|
|
|
879
|
my $codes = $lang2codes->{$lang}; |
53
|
477
|
100
|
|
|
|
821
|
if ($codes) { |
54
|
416
|
|
|
|
|
887
|
$ret{code2} = $codes->[0]; |
55
|
416
|
|
|
|
|
699
|
$ret{code3} = $codes->[1]; |
56
|
|
|
|
|
|
|
} |
57
|
477
|
|
|
|
|
677
|
$ret{score} = $score; |
58
|
477
|
|
|
|
|
652
|
$ret{name} = $lang; |
59
|
477
|
|
|
|
|
1061
|
return \%ret; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub new |
64
|
|
|
|
|
|
|
{ |
65
|
2
|
|
|
2
|
1
|
936
|
my ($class, %params) = @_; |
66
|
2
|
50
|
|
|
|
12
|
if (! $params{modeldir}) { |
67
|
2
|
|
|
|
|
8
|
my $md = "$dir/train"; |
68
|
2
|
|
|
|
|
8
|
$params{modeldir} = $md; |
69
|
|
|
|
|
|
|
} |
70
|
2
|
50
|
|
|
|
53
|
if (! -d $params{modeldir}) { |
71
|
0
|
|
|
|
|
0
|
croak "Model directory '$params{modeldir}' does not exist"; |
72
|
|
|
|
|
|
|
} |
73
|
2
|
|
|
|
|
13
|
my $self = bless { %params }, $class; |
74
|
2
|
|
|
|
|
10
|
return $self; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub guess |
79
|
|
|
|
|
|
|
{ |
80
|
29
|
|
|
29
|
1
|
55
|
my ($self, $string) = @_; |
81
|
29
|
100
|
|
|
|
87
|
unless (defined $self->{models}) { |
82
|
2
|
|
|
|
|
15
|
$self->load_models (); |
83
|
|
|
|
|
|
|
} |
84
|
29
|
|
|
|
|
80
|
my @runs = find_runs($string); |
85
|
29
|
|
|
|
|
47
|
my %scripts; |
86
|
29
|
|
|
|
|
46
|
for my $run (@runs) { |
87
|
55
|
|
|
|
|
122
|
$scripts{$run->[1]}++; |
88
|
|
|
|
|
|
|
} |
89
|
29
|
|
|
|
|
126
|
return $self->identify ($string, %scripts); |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub simple_guess |
93
|
|
|
|
|
|
|
{ |
94
|
28
|
|
|
28
|
1
|
17452
|
my ($self, $string) = @_; |
95
|
28
|
|
|
|
|
69
|
my $got = $self->guess ($string); |
96
|
28
|
|
|
|
|
348
|
return $got->[0]{name}; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub load_models |
100
|
|
|
|
|
|
|
{ |
101
|
2
|
|
|
2
|
0
|
5
|
my ($self) = @_; |
102
|
2
|
50
|
|
|
|
99
|
opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!"; |
103
|
2
|
|
|
|
|
7
|
my %models; |
104
|
2
|
|
|
|
|
139
|
while (my $f = readdir $dh) { |
105
|
114
|
100
|
|
|
|
602
|
unless ($f =~ /\.train$/) { |
106
|
4
|
|
|
|
|
17
|
next; |
107
|
|
|
|
|
|
|
} |
108
|
110
|
|
|
|
|
440
|
my ($name) = $f =~ m|(.*)\.|; |
109
|
110
|
|
|
|
|
723
|
my $path = catfile ($self->{modeldir}, $f); |
110
|
110
|
50
|
|
|
|
4375
|
open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!"; |
111
|
110
|
|
|
|
|
6162
|
my %model; |
112
|
110
|
|
|
|
|
3141
|
while (my $line = <$fh>) { |
113
|
33000
|
|
|
|
|
52834
|
chomp $line; |
114
|
33000
|
|
|
|
|
103301
|
my ($k, $v) = $line =~ m|(.{3})\s+(.*)|; |
115
|
33000
|
50
|
|
|
|
63813
|
unless (defined $k) { |
116
|
0
|
|
|
|
|
0
|
next; |
117
|
|
|
|
|
|
|
} |
118
|
33000
|
|
|
|
|
111371
|
$model{$k} = $v; |
119
|
|
|
|
|
|
|
} |
120
|
110
|
|
|
|
|
2440
|
$models{$name} = \%model; |
121
|
|
|
|
|
|
|
} |
122
|
2
|
|
|
|
|
131
|
$self->{models} = \%models; |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub find_runs |
126
|
|
|
|
|
|
|
{ |
127
|
29
|
|
|
29
|
0
|
54
|
my ($raw) = @_; |
128
|
29
|
|
|
|
|
514
|
my @chars = split m//, $raw; |
129
|
29
|
|
|
|
|
59
|
my $prev = ''; |
130
|
29
|
|
|
|
|
69
|
my @c; |
131
|
|
|
|
|
|
|
my @runs; |
132
|
29
|
|
|
|
|
0
|
my @run_types; |
133
|
29
|
|
|
|
|
40
|
my $current_run = -1; |
134
|
|
|
|
|
|
|
|
135
|
29
|
|
|
|
|
61
|
for my $c (@chars) { |
136
|
3082
|
|
|
|
|
8091
|
my $is_alph = $c =~ /[[:alpha:]]/o; |
137
|
3082
|
|
|
|
|
4831
|
my $inf = get_charinfo ($c); |
138
|
3082
|
100
|
100
|
|
|
8674
|
if ($is_alph and ! ($inf->{block} eq $prev)) { |
139
|
228
|
|
|
|
|
372
|
$prev = $inf->{block}; |
140
|
228
|
|
|
|
|
397
|
@c = (); |
141
|
228
|
|
|
|
|
285
|
$current_run++; |
142
|
228
|
|
|
|
|
372
|
$run_types[$current_run] = $prev; |
143
|
|
|
|
|
|
|
} |
144
|
3082
|
|
|
|
|
4905
|
push @c, $c; |
145
|
3082
|
100
|
|
|
|
4797
|
if ($current_run > -1) { |
146
|
3078
|
|
|
|
|
3787
|
push @{ $runs[$current_run] }, $c; |
|
3078
|
|
|
|
|
6332
|
|
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
29
|
|
|
|
|
84
|
my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types); |
151
|
29
|
|
|
|
|
60
|
my $counter = 0; |
152
|
29
|
|
|
|
|
37
|
my @result; |
153
|
29
|
|
|
|
|
53
|
for my $r (@$newruns) { |
154
|
55
|
|
|
|
|
108
|
push @result, [ $r, $newtypes->[$counter]]; |
155
|
55
|
|
|
|
|
89
|
$counter++; |
156
|
|
|
|
|
|
|
} |
157
|
29
|
|
|
|
|
407
|
return @result; |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# Cached lookups from charinfo |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my %cache; |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# Look up characters using charinfo, but with a cache to save repeated |
165
|
|
|
|
|
|
|
# lookups. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub get_charinfo |
168
|
|
|
|
|
|
|
{ |
169
|
3082
|
|
|
3082
|
0
|
4892
|
my ($char) = @_; |
170
|
3082
|
|
|
|
|
4587
|
my $known = $cache{$char}; |
171
|
3082
|
100
|
|
|
|
5050
|
if ($known) { |
172
|
2853
|
|
|
|
|
4258
|
return $known; |
173
|
|
|
|
|
|
|
} |
174
|
229
|
|
|
|
|
629
|
my $inf = charinfo (ord ($char)); |
175
|
229
|
|
|
|
|
678835
|
$cache{$char} = $inf; |
176
|
229
|
|
|
|
|
502
|
return $inf; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub reconcile_latin |
180
|
|
|
|
|
|
|
{ |
181
|
29
|
|
|
29
|
0
|
66
|
my ($runs, $types) = @_; |
182
|
29
|
|
|
|
|
87
|
my @types = @$types; |
183
|
29
|
|
|
|
|
46
|
my (@new_runs, @new_types); |
184
|
29
|
|
|
|
|
45
|
my $last_type = ''; |
185
|
|
|
|
|
|
|
|
186
|
29
|
|
|
|
|
39
|
my $upgrade; |
187
|
29
|
100
|
|
|
|
64
|
if (has_supplemental_latin (@$types)) { |
188
|
10
|
|
|
|
|
22
|
$upgrade = 'Accented Latin'; |
189
|
|
|
|
|
|
|
} |
190
|
29
|
100
|
|
|
|
71
|
if (has_extended_latin (@$types)) { |
191
|
7
|
|
|
|
|
13
|
$upgrade = 'Exotic Latin' ; |
192
|
|
|
|
|
|
|
} |
193
|
29
|
50
|
|
|
|
67
|
if (has_latin_extended_additional (@$types)) { |
194
|
0
|
|
|
|
|
0
|
$upgrade = 'Superfreak Latin'; |
195
|
|
|
|
|
|
|
} |
196
|
29
|
100
|
|
|
|
403
|
unless ($upgrade) { |
197
|
17
|
|
|
|
|
49
|
return ($runs, $types); |
198
|
|
|
|
|
|
|
} |
199
|
12
|
|
|
|
|
22
|
my $run_count = -1; |
200
|
12
|
|
|
|
|
24
|
for my $r (@$runs) { |
201
|
211
|
|
|
|
|
269
|
my $type = shift @types; |
202
|
211
|
100
|
|
|
|
434
|
if ($type =~ /Latin/) { |
203
|
198
|
|
|
|
|
257
|
$type = $upgrade; |
204
|
|
|
|
|
|
|
} |
205
|
211
|
100
|
|
|
|
333
|
unless ($type eq $last_type) { |
206
|
38
|
|
|
|
|
49
|
$run_count++; |
207
|
|
|
|
|
|
|
} |
208
|
211
|
|
|
|
|
245
|
push @{$new_runs[$run_count]}, @$r; |
|
211
|
|
|
|
|
521
|
|
209
|
211
|
|
|
|
|
337
|
$new_types[$run_count] = $type; |
210
|
211
|
|
|
|
|
323
|
$last_type = $type; |
211
|
|
|
|
|
|
|
} |
212
|
12
|
|
|
|
|
41
|
return (\@new_runs, \@new_types); |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub has_extended_latin |
217
|
|
|
|
|
|
|
{ |
218
|
29
|
|
|
29
|
0
|
59
|
my (@types) = @_; |
219
|
29
|
|
|
|
|
50
|
return scalar grep { /Latin Extended-A/ } @types; |
|
228
|
|
|
|
|
379
|
|
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub has_supplemental_latin |
223
|
|
|
|
|
|
|
{ |
224
|
29
|
|
|
29
|
0
|
73
|
my (@types) = @_; |
225
|
29
|
|
|
|
|
53
|
return scalar grep { /Latin-1 Supplement/ } @types; |
|
228
|
|
|
|
|
434
|
|
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub has_latin_extended_additional |
229
|
|
|
|
|
|
|
{ |
230
|
29
|
|
|
29
|
0
|
71
|
my (@types) = @_; |
231
|
29
|
|
|
|
|
42
|
return scalar grep { /Latin Extended Additional/ } @types; |
|
228
|
|
|
|
|
361
|
|
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub identify |
237
|
|
|
|
|
|
|
{ |
238
|
57
|
|
|
57
|
0
|
8736
|
my ($self, $sample, %scripts) = @_; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Check for Korean |
241
|
|
|
|
|
|
|
|
242
|
57
|
50
|
33
|
|
|
375
|
if (exists $scripts{'Hangul Syllables'} || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
243
|
|
|
|
|
|
|
exists $scripts{'Hangul Jamo'} || |
244
|
|
|
|
|
|
|
exists $scripts{'Hangul Compatibility Jamo'} || |
245
|
|
|
|
|
|
|
exists $scripts{'Hangul'}) { |
246
|
0
|
|
|
|
|
0
|
return [make_ret ('korean', 1)]; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
57
|
100
|
|
|
|
115
|
if (exists $scripts{'Greek and Coptic'}) { |
250
|
1
|
|
|
|
|
3
|
return [make_ret ('greek', 1)]; |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
56
|
50
|
33
|
|
|
250
|
if (exists $scripts{'Katakana'} || |
|
|
|
33
|
|
|
|
|
254
|
|
|
|
|
|
|
exists $scripts{'Hiragana'} || |
255
|
|
|
|
|
|
|
exists $scripts{'Katakana Phonetic Extensions'}) { |
256
|
0
|
|
|
|
|
0
|
return [make_ret ('japanese', 1)]; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
56
|
50
|
66
|
|
|
287
|
if (exists $scripts{'CJK Unified Ideographs'} || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
260
|
|
|
|
|
|
|
exists $scripts{'Bopomofo'} || |
261
|
|
|
|
|
|
|
exists $scripts{'Bopomofo Extended'} || |
262
|
|
|
|
|
|
|
exists $scripts{'KangXi Radicals'}) { |
263
|
1
|
|
|
|
|
5
|
return [make_ret ('chinese', 1)]; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
55
|
100
|
|
|
|
105
|
if (exists $scripts{'Cyrillic'}) { |
267
|
7
|
|
|
|
|
36
|
return $self->check ($sample, @CYRILLIC); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
48
|
50
|
66
|
|
|
172
|
if (exists $scripts{'Arabic'} || |
|
|
|
33
|
|
|
|
|
271
|
|
|
|
|
|
|
exists $scripts{'Arabic Presentation Forms-A'} || |
272
|
|
|
|
|
|
|
exists $scripts{'Arabic Presentation Forms-B'}) { |
273
|
1
|
|
|
|
|
5
|
return $self->check ($sample, @ARABIC); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
47
|
50
|
|
|
|
91
|
if (exists $scripts{'Devanagari'}) { |
277
|
0
|
|
|
|
|
0
|
return $self->check ($sample, @DEVANAGARI); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# Try languages with unique scripts |
281
|
|
|
|
|
|
|
|
282
|
47
|
|
|
|
|
92
|
for my $s (@SINGLETONS) { |
283
|
829
|
100
|
|
|
|
1398
|
if (exists $scripts{$s}) { |
284
|
1
|
|
|
|
|
4
|
return [make_ret (lc ($s), 1)]; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
46
|
50
|
|
|
|
79
|
if (exists $scripts{'Superfreak Latin'}) { |
289
|
0
|
|
|
|
|
0
|
return [make_ret ('vietnamese', 1)]; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
46
|
100
|
|
|
|
80
|
if (exists $scripts{'Exotic Latin'}) { |
293
|
7
|
|
|
|
|
20
|
return $self->check ($sample, @EXOTIC_LATIN); |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
39
|
100
|
|
|
|
72
|
if (exists $scripts{'Accented Latin'}) { |
297
|
5
|
|
|
|
|
18
|
return $self->check ($sample, @ACCENTED_LATIN); |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
34
|
100
|
|
|
|
60
|
if (exists $scripts{'Basic Latin'}) { |
301
|
6
|
|
|
|
|
28
|
return $self->check ($sample, @ALL_LATIN); |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
28
|
|
|
|
|
150
|
return [{ name => "unknown script: '". (join ", ", keys %scripts)."'", |
305
|
|
|
|
|
|
|
score => 1}]; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub check |
309
|
|
|
|
|
|
|
{ |
310
|
26
|
|
|
26
|
0
|
120
|
my ($self, $sample, @langs) = @_; |
311
|
26
|
|
|
|
|
59
|
my $mod = __make_model ($sample); |
312
|
26
|
|
|
|
|
73
|
my $num_tri = scalar keys %$mod; |
313
|
26
|
|
|
|
|
38
|
my %scores; |
314
|
26
|
|
|
|
|
44
|
for my $key (@langs) { |
315
|
550
|
|
|
|
|
874
|
my $l = lc ($key); |
316
|
550
|
100
|
|
|
|
1235
|
unless (exists $self->{models}{$l}) { |
317
|
10
|
|
|
|
|
16
|
next; |
318
|
|
|
|
|
|
|
} |
319
|
540
|
|
|
|
|
1026
|
my $score = __distance ($mod, $self->{models}{$l}); |
320
|
540
|
|
|
|
|
1148
|
$scores{$l} = $score; |
321
|
|
|
|
|
|
|
} |
322
|
26
|
|
|
|
|
193
|
my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores; |
|
1560
|
|
|
|
|
2069
|
|
323
|
26
|
|
|
|
|
65
|
my @out; |
324
|
26
|
|
50
|
|
|
65
|
$num_tri ||=1; |
325
|
26
|
|
|
|
|
52
|
for my $s (@sorted) { |
326
|
474
|
|
|
|
|
797
|
my $norm = $scores{$s}/$num_tri; |
327
|
474
|
|
|
|
|
797
|
push @out, make_ret ($s, $norm); |
328
|
|
|
|
|
|
|
} |
329
|
26
|
|
|
|
|
39
|
my $total = 0.0; |
330
|
26
|
|
|
|
|
51
|
for (@out) { |
331
|
|
|
|
|
|
|
$total += $_->{score} |
332
|
474
|
|
|
|
|
649
|
} |
333
|
26
|
|
|
|
|
36
|
for (@out) { |
334
|
474
|
|
|
|
|
614
|
$_->{score} /= $total; |
335
|
|
|
|
|
|
|
} |
336
|
26
|
|
|
|
|
730
|
return \@out; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub __distance |
340
|
|
|
|
|
|
|
{ |
341
|
540
|
|
|
540
|
|
901
|
my ($m1, $m2) = @_; |
342
|
540
|
|
|
|
|
646
|
my $dist = 0; |
343
|
540
|
|
|
|
|
5578
|
for my $k (keys %$m1) { |
344
|
55703
|
100
|
|
|
|
94124
|
$dist += (exists $m2->{$k} ? abs($m2->{$k} - $m1->{$k}) : $MAX); |
345
|
|
|
|
|
|
|
} |
346
|
540
|
|
|
|
|
2886
|
return $dist; |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
sub __make_model |
350
|
|
|
|
|
|
|
{ |
351
|
26
|
|
|
26
|
|
47
|
my ($content) = @_; |
352
|
26
|
|
|
|
|
39
|
my %trigrams; |
353
|
26
|
|
|
|
|
556
|
$content = NFC ($content); # normal form C |
354
|
|
|
|
|
|
|
# Substitute all non-word characters with spaces |
355
|
26
|
|
|
|
|
603
|
$content =~ s/[^[:alpha:]']/ /g; |
356
|
26
|
|
|
|
|
133
|
for (my $i = 0; $i < length ($content) - 2; $i++) { |
357
|
2892
|
|
|
|
|
5665
|
my $tri = lc (substr ($content, $i, 3)); |
358
|
2892
|
|
|
|
|
12495
|
$trigrams{$tri}++; |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
my @sorted = sort { $trigrams{$b} == $trigrams{$a} ? |
362
|
|
|
|
|
|
|
$a cmp $b : |
363
|
14195
|
100
|
|
|
|
23076
|
$trigrams{$b} <=> $trigrams{$a} } |
364
|
26
|
|
|
|
|
399
|
grep { !/\s\s/o } keys %trigrams; |
|
2625
|
|
|
|
|
4730
|
|
365
|
26
|
|
|
|
|
273
|
my @trimmed = splice (@sorted, 0, 300); |
366
|
26
|
|
|
|
|
58
|
my $counter = 0; |
367
|
26
|
|
|
|
|
36
|
my %res; |
368
|
26
|
|
|
|
|
101
|
for my $t (@trimmed) { |
369
|
2568
|
|
|
|
|
3843
|
$res{$t} = $counter++; |
370
|
|
|
|
|
|
|
} |
371
|
26
|
|
|
|
|
339
|
return \%res; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
1; |