line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Guess; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
21651
|
use strict; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
52
|
|
4
|
1
|
|
|
1
|
|
10
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
58
|
|
5
|
|
|
|
|
|
|
require 5.008; |
6
|
1
|
|
|
1
|
|
9
|
use Carp; |
|
1
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
122
|
|
7
|
1
|
|
|
1
|
|
630
|
use File::Spec::Functions 'catfile'; |
|
1
|
|
|
|
|
1271
|
|
|
1
|
|
|
|
|
89
|
|
8
|
1
|
|
|
1
|
|
621
|
use Unicode::Normalize qw/NFC/; |
|
1
|
|
|
|
|
1795
|
|
|
1
|
|
|
|
|
70
|
|
9
|
1
|
|
|
1
|
|
743
|
use Unicode::UCD 'charinfo'; |
|
1
|
|
|
|
|
42100
|
|
|
1
|
|
|
|
|
1670
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.02';

# Maximum distance, used by __distance: the amount added to the
# distance for each trigram of the sample which is missing from the
# language model.

our $MAX = 300;

# The lists below name the candidate languages for each script.  "check"
# lower-cases each name and uses it to look up the model of that
# language.

# Candidates which are tried only as part of @ALL_LATIN.

our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon
                      indonesian hawaiian welsh latin swahili/;

# Candidates tried when the text contains "Latin Extended-A" characters.

our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene
                       Turkish Hungarian Azeri Lithuanian Estonian/;

# Candidates tried when "reconcile_latin" labels the text "Accented
# Latin".  This list includes all of @EXOTIC_LATIN.

our @ACCENTED_LATIN = (qw/Albanian Spanish French German Dutch Italian
                          Danish Icelandic Norwegian Swedish Finnish
                          Latvian Portuguese /, @EXOTIC_LATIN);

# Candidates tried when the text is identified as "Basic Latin".  Since
# @ACCENTED_LATIN already contains @EXOTIC_LATIN, repeated names are
# removed (keeping the first occurrence) so that no language is scored
# twice.

our @ALL_LATIN = do {
    my %seen;
    grep { ! $seen{$_}++ } (@BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN);
};

# Candidates for text in the Cyrillic script.

our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek
                   Mongolian Serbian Macedonian Bulgarian Kyrgyz/;

# Candidates for text in the Arabic script.

our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/;

# Candidates for text in the Devanagari script.

our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi
                     Nepali Sanskrit/;

# Scripts which identify a single language.  "identify" compares these
# names with the Unicode block names found in the text, and returns the
# lower-cased name as the language, so each entry has to be spelt
# exactly like the Unicode block ("Gurmukhi" was formerly misspelt and
# so could never match).
# NOTE(review): Unicode calls the Burmese block "Myanmar", so "Burmese"
# probably never matches either; changing it would also change the
# language name returned, so it is left alone here - confirm.

our @SINGLETONS = qw/Armenian Hebrew Bengali Gurmukhi Greek Gujarati
                     Oriya Tamil Telugu Kannada Malayalam Sinhala
                     Thai Lao Tibetan Burmese Georgian Mongolian/;
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Constructor.  Takes a list of key/value parameters, of which only
# "modeldir", the directory containing the model files, is examined
# here.  If "modeldir" is missing or false, it defaults to the path of
# this file with the final ".pm" replaced by "/train".  Croaks if the
# directory does not exist.  Every parameter is copied into the
# returned object.

sub new
{
    my $class = shift;
    my %params = @_;
    unless ($params{modeldir}) {
        # Default: the "train" directory alongside this module.
        ($params{modeldir} = __FILE__) =~ s!\.pm$!/train!;
    }
    -d $params{modeldir}
        or croak "Model directory '$params{modeldir}' does not exist";
    return bless { %params }, $class;
}
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Guess the language of $string.  The models are loaded on the first
# call.  "find_runs" divides the string into runs, each tagged with a
# script name; the set of script names found is then given to
# "identify", and whatever "identify" returns is returned to the
# caller.

sub guess
{
    my ($self, $string) = @_;
    $self->load_models () if ! defined $self->{models};
    # Count the runs belonging to each script.
    my %scripts;
    $scripts{$_->[1]}++ for find_runs ($string);
    return $self->identify ($string, %scripts);
}
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Guess the language of $string and return only the name of the first
# entry in the list which "guess" returns.

sub simple_guess
{
    my $self = shift;
    my ($string) = @_;
    my $candidates = $self->guess ($string);
    return $candidates->[0]{name};
}
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Read every file in the model directory whose name ends in ".train".
# Each file gives one model, named after the file with its extension
# removed.  A model is a hash which maps a three-character sequence to
# the text which follows it (after whitespace) on the same line of the
# file; lines which are not of that form are ignored.  The models are
# stored in $self->{models}, keyed by name.  Dies if the directory or a
# model file cannot be opened.

sub load_models
{
    my ($self) = @_;
    my $dir = $self->{modeldir};
    opendir my $dh, $dir or die "Unable to open dir:$!";
    my %models;
    while (my $file = readdir $dh) {
        next unless $file =~ /\.train$/;
        my ($name) = $file =~ m|(.*)\.|;
        my $path = catfile ($dir, $file);
        open my $in, "<:encoding(utf8)", $path or die "Failed to open file: $!";
        my %model;
        while (my $line = <$in>) {
            chomp $line;
            if ($line =~ m|(.{3})\s+(.*)|) {
                $model{$1} = $2;
            }
        }
        $models{$name} = \%model;
    }
    $self->{models} = \%models;
}
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Divide $raw into runs of characters from the same Unicode block.
#
# A new run begins at each alphabetic character whose block differs
# from the block of the run in progress.  Non-alphabetic characters are
# added to the run in progress, and anything before the first
# alphabetic character is discarded.  The runs are then passed through
# "reconcile_latin".  The return value is a list of two-element array
# references, [ characters of the run, type of the run ].

sub find_runs
{
    my ($raw) = @_;
    my (@runs, @run_types);
    # The block of the run in progress.
    my $block = '';

    for my $char (split m//, $raw) {
        my $info = get_charinfo ($char);
        if ($char =~ /[[:alpha:]]/ && $info->{block} ne $block) {
            $block = $info->{block};
            push @run_types, $block;
        }
        # Nothing is kept until the first run has been started.
        if (@run_types) {
            push @{ $runs[$#run_types] }, $char;
        }
    }

    my ($merged_runs, $merged_types) = reconcile_latin (\@runs, \@run_types);
    my @result;
    for my $i (0 .. $#$merged_runs) {
        push @result, [ $merged_runs->[$i], $merged_types->[$i] ];
    }
    return @result;
}
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Results of "charinfo", keyed by character.

my %cache;

# Return what "charinfo" returns for the code point of $char.  Each
# result is remembered in %cache, so that "charinfo" is called only
# once for each different character.

sub get_charinfo
{
    my ($char) = @_;
    unless ($cache{$char}) {
        $cache{$char} = charinfo (ord ($char));
    }
    return $cache{$char};
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Give all the Latin runs of a text the same type, and join adjacent
# runs which end up with the same type.
#
# $runs is a reference to an array of runs (array references of
# characters) and $types a reference to the array of their types.  If
# any type is "Latin-1 Supplement", "Latin Extended-A" or "Latin
# Extended Additional", every type containing "Latin" is replaced by
# "Accented Latin", "Exotic Latin" or "Superfreak Latin" respectively,
# the last of these which applies taking priority.  If none of them
# occurs, the two arguments are returned as they are.  Returns (runs,
# types) as two array references.

sub reconcile_latin
{
    my ($runs, $types) = @_;

    # The later tests override the earlier ones.
    my $upgrade;
    $upgrade = 'Accented Latin' if has_supplemental_latin (@$types);
    $upgrade = 'Exotic Latin' if has_extended_latin (@$types);
    $upgrade = 'Superfreak Latin' if has_latin_extended_additional (@$types);
    return ($runs, $types) unless $upgrade;

    my (@new_runs, @new_types);
    my $last_type = '';
    for my $i (0 .. $#$runs) {
        my $type = $types->[$i];
        $type = $upgrade if $type =~ /Latin/;
        if ($type ne $last_type) {
            # A change of type starts a new output run.
            push @new_types, $type;
            $last_type = $type;
        }
        push @{ $new_runs[$#new_types] }, @{ $runs->[$i] };
    }
    return (\@new_runs, \@new_types);
}
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Return the number of elements of @_ which contain the block name
# "Latin Extended-A"; zero means the block is absent.

sub has_extended_latin
{
    my $hits = 0;
    for my $type (@_) {
        $hits++ if $type =~ /Latin Extended-A/;
    }
    return $hits;
}
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Return the number of elements of @_ which contain the block name
# "Latin-1 Supplement"; zero means the block is absent.

sub has_supplemental_latin
{
    my $hits = 0;
    for my $type (@_) {
        $hits++ if $type =~ /Latin-1 Supplement/;
    }
    return $hits;
}
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# Return the number of elements of @_ which contain the block name
# "Latin Extended Additional"; zero means the block is absent.

sub has_latin_extended_additional
{
    my $hits = 0;
    for my $type (@_) {
        $hits++ if $type =~ /Latin Extended Additional/;
    }
    return $hits;
}
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Choose the language, or the list of candidate languages, from the
# scripts found in the sample.
#
# $sample is the text, and %scripts is keyed by the script names which
# "guess" collected from the runs of the text.  The tests below are
# made in a fixed order, and the first which succeeds decides the
# result.  A script which stands for one language gives that language
# with a score of 1; a script shared by several languages is handed to
# "check" together with its list of candidates.  If no test succeeds,
# the single entry returned has a name which lists the unrecognised
# scripts.  The return value is a reference to an array of
# { name => ..., score => ... } hashes.

sub identify
{
    my ($self, $sample, %scripts) = @_;

    # True if any of the scripts named in the arguments was found.
    my $found = sub {
        for my $script (@_) {
            return 1 if exists $scripts{$script};
        }
        return 0;
    };
    # The result for a script which belongs to one language.
    my $only = sub {
        my ($name) = @_;
        return [{ name => $name, score => 1 }];
    };

    # Check for Korean

    return $only->('korean')
        if $found->('Hangul Syllables', 'Hangul Jamo',
                    'Hangul Compatibility Jamo', 'Hangul');

    return $only->('greek')
        if $found->('Greek and Coptic');

    return $only->('japanese')
        if $found->('Katakana', 'Hiragana', 'Katakana Phonetic Extensions');

    # NOTE(review): Unicode appears to spell the last block "Kangxi
    # Radicals", in which case 'KangXi Radicals' cannot match - confirm.
    return $only->('chinese')
        if $found->('CJK Unified Ideographs', 'Bopomofo',
                    'Bopomofo Extended', 'KangXi Radicals');

    return $self->check ($sample, @CYRILLIC)
        if $found->('Cyrillic');

    return $self->check ($sample, @ARABIC)
        if $found->('Arabic', 'Arabic Presentation Forms-A',
                    'Arabic Presentation Forms-B');

    return $self->check ($sample, @DEVANAGARI)
        if $found->('Devanagari');

    # Try languages with unique scripts

    for my $script (@SINGLETONS) {
        return $only->(lc ($script)) if $found->($script);
    }

    return $only->('vietnamese')
        if $found->('Superfreak Latin');

    return $self->check ($sample, @EXOTIC_LATIN)
        if $found->('Exotic Latin');

    return $self->check ($sample, @ACCENTED_LATIN)
        if $found->('Accented Latin');

    return $self->check ($sample, @ALL_LATIN)
        if $found->('Basic Latin');

    return [{ name => "unknown script: '". (join ", ", keys %scripts)."'",
              score => 1}];
}
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# Rank the candidate languages in @langs by their distance from
# $sample.
#
# A trigram model is made from the sample and compared, using
# "__distance", with the model of each candidate (the candidate's name
# is lower-cased to find its model; candidates without a model are
# skipped, and a candidate listed more than once is scored once).  Each
# distance is divided by the number of trigrams in the sample, and the
# results are then divided by their sum, so that the scores add up to
# one.  If the sum is zero - which happens when the sample is too short
# to contain any trigrams - the scores are left at zero rather than
# dividing by zero.
#
# Returns a reference to an array of { name => ..., score => ... }
# hashes, ordered from the smallest score (nearest) to the largest;
# equal scores are ordered by name.

sub check
{
    my ($self, $sample, @langs) = @_;
    my $mod = __make_model ($sample);
    # An empty model must not cause a division by zero below.
    my $num_tri = scalar (keys %$mod) || 1;

    my %scores;
    for my $key (@langs) {
        my $name = lc ($key);
        next if exists $scores{$name};
        next unless exists $self->{models}{$name};
        $scores{$name} = __distance ($mod, $self->{models}{$name});
    }

    # The name breaks ties, so that the order does not depend on the
    # order of the hash keys.
    my @sorted = sort { $scores{$a} <=> $scores{$b} || $a cmp $b }
                 keys %scores;

    my @out;
    my $total = 0.0;
    for my $name (@sorted) {
        my $norm = $scores{$name} / $num_tri;
        push @out, { name => $name, score => $norm };
        $total += $norm;
    }
    if ($total != 0) {
        for my $entry (@out) {
            $entry->{score} /= $total;
        }
    }
    return \@out;
}
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Return the distance between the models $m1 and $m2, which are
# references to hashes keyed by trigram.  Each trigram of $m1 adds the
# absolute difference between its values in the two models, or $MAX if
# $m2 does not contain it.  Trigrams found only in $m2 are ignored.

sub __distance
{
    my ($m1, $m2) = @_;
    my $total = 0;
    for my $trigram (keys %$m1) {
        if (exists $m2->{$trigram}) {
            $total += abs ($m2->{$trigram} - $m1->{$trigram});
        }
        else {
            $total += $MAX;
        }
    }
    return $total;
}
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# Make a trigram model of $content.
#
# The text is put into Unicode normal form C, and every character
# except letters and apostrophes is replaced by a space.  Each sequence
# of three consecutive characters is then lower-cased and counted.
# Trigrams containing two adjacent whitespace characters are dropped,
# and the remainder are ordered from the most to the least frequent,
# with equally frequent trigrams in string order.  Only the first 300
# are kept.  Returns a reference to a hash which maps each trigram to
# its position in that order, counting from zero.

sub __make_model
{
    my ($content) = @_;
    $content = NFC ($content);
    $content =~ s/[^[:alpha:]']/ /g;

    my %count;
    # The range is empty if there are fewer than three characters.
    for my $start (0 .. length ($content) - 3) {
        $count{lc (substr ($content, $start, 3))}++;
    }

    my @ranked = sort { $count{$b} <=> $count{$a} || $a cmp $b }
                 grep { ! /\s\s/ } keys %count;
    if (@ranked > 300) {
        $#ranked = 299;
    }

    my %res;
    @res{@ranked} = (0 .. $#ranked);
    return \%res;
}
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
1; |