File Coverage

blib/lib/Lingua/Guess.pm

Criterion	Covered	Total	%
statement	189	196	96.4
branch	50	62	80.6
condition	16	35	45.7
subroutine	21	21	100.0
pod	3	12	25.0
total	279	326	85.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Guess;
2
3	1			1		21651	use strict;
	1					4
	1					52
4	1			1		10	use warnings;
	1					2
	1					58
5							require 5.008;
6	1			1		9	use Carp;
	1					8
	1					122
7	1			1		630	use File::Spec::Functions 'catfile';
	1					1271
	1					89
8	1			1		621	use Unicode::Normalize qw/NFC/;
	1					1795
	1					70
9	1			1		743	use Unicode::UCD 'charinfo';
	1					42100
	1					1670
10
11							our $VERSION = '0.02';
12
13							# Maximum distance, used by __distance.
14
15							our $MAX = 300;
16
17							our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon
18							indonesian hawaiian welsh latin swahili/;
19
20							our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene
21							Turkish Hungarian Azeri Lithuanian Estonian/;
22
23							our @ACCENTED_LATIN = (qw/Albanian Spanish French German Dutch Italian
24							Danish Icelandic Norwegian Swedish Finnish
25							Latvian Portuguese /, @EXOTIC_LATIN);
26
27							our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN);
28
29							our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek
30							Mongolian Serbian Macedonian Bulgarian Kyrgyz/;
31
32							our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/;
33
34							our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi
35							Nepali Sanskrit/;
36
37							our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati
38							Oriya Tamil Telugu Kannada Malayalam Sinhala
39							Thai Lao Tibetan Burmese Georgian Mongolian/;
40
41							sub new
42							{
43	1			1	1	14	my ($class, %params) = @_;
44	1	50				4	if (! $params{modeldir}) {
45	1					3	my $md = __FILE__;
46	1					6	$md =~ s!\.pm$!/train!;
47	1					4	$params{modeldir} = $md;
48							}
49	1	50				24	if (! -d $params{modeldir}) {
50	0					0	croak "Model directory '$params{modeldir}' does not exist";
51							}
52	1					6	my $self = bless { %params }, $class;
53	1					3	return $self;
54							}
55
56
57							sub guess
58							{
59	28			28	1	233	my ($self, $string) = @_;
60	28	100				196	unless (defined $self->{models}) {
61	1					4	$self->load_models ();
62							}
63	28					328	my @runs = find_runs($string);
64	28					113	my @langs;
65							my %scripts;
66	28					97	for my $run (@runs) {
67	54					1313	$scripts{$run->[1]}++;
68							}
69	28					304	return $self->identify ($string, %scripts);
70							}
71
72							sub simple_guess
73							{
74	28			28	1	134948	my ($self, $string) = @_;
75	28					177	my $got = $self->guess ($string);
76	28					805	return $got->[0]{name};
77							}
78
79							sub load_models
80							{
81	1			1	0	2	my ($self) = @_;
82	1	50				43	opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
83	1					3	my %models;
84	1					20	while (my $f = readdir $dh) {
85	57	100				350	unless ($f =~ /\.train$/) {
86	2					10	next;
87							}
88	55					296	my ($name) = $f =~ m\|(.*)\.\|;
89	55					477	my $path = catfile ($self->{modeldir}, $f);
90	1	50		1		7	open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!";
	1					2
	1					7
	55					3001
91	55					11554	my %model;
92	55					953	while (my $line = <$fh>) {
93	16500					30466	chomp $line;
94	16500					48009	my ($k, $v) = $line =~ m\|(.{3})\s+(.*)\|;
95	16500	50				36869	unless (defined $k) {
96	0					0	next;
97							}
98	16500					58782	$model{$k} = $v;
99							}
100	55					1260	$models{$name} = \%model;
101							}
102	1					22	$self->{models} = \%models;
103							}
104
105							sub find_runs
106							{
107	28			28	0	150	my ($raw) = @_;
108	28					1300	my @chars = split m//, $raw;
109	28					147	my $prev = '';
110	28					519	my @c;
111							my @runs;
112	28					0	my @run_types;
113	28					86	my $current_run = -1;
114
115	28					150	for my $c (@chars) {
116	2852					16672	my $is_alph = $c =~ /[[:alpha:]]/o;
117	2852					10232	my $inf = get_charinfo ($c);
118	2852	100	100			26909	if ($is_alph and ! ($inf->{block} eq $prev)) {
119	227					783	$prev = $inf->{block};
120	227					788	@c = ();
121	227					538	$current_run++;
122	227					1000	$run_types[$current_run] = $prev;
123							}
124	2852					8909	push @c, $c;
125	2852	100				11691	if ($current_run > -1) {
126	2848					22948	push @{ $runs[$current_run] }, $c;
	2848					26827
127							}
128							}
129
130	28					181	my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types);
131	28					154	my $counter = 0;
132	28					142	my @result;
133	28					259	for my $r (@$newruns) {
134	54					251	push @result, [ $r, $newtypes->[$counter]];
135	54					144	$counter++;
136							}
137	28					1370	return @result;
138							}
139
140							# Cached lookups from charinfo
141
142							my %cache;
143
144							# Look up characters using charinfo, but with a cache to save repeated
145							# lookups.
146
147							sub get_charinfo
148							{
149	2852			2852	0	8327	my ($char) = @_;
150	2852					8920	my $known = $cache{$char};
151	2852	100				23139	if ($known) {
152	2657					9123	return $known;
153							}
154	195					1660	my $inf = charinfo (ord ($char));
155	195					739626	$cache{$char} = $inf;
156	195					1006	return $inf;
157							}
158
159							sub reconcile_latin
160							{
161	28			28	0	253	my ($runs, $types) = @_;
162	28					219	my @types = @$types;
163	28					384	my (@new_runs, @new_types);
164	28					710	my $last_type = '';
165
166	28					82	my $upgrade;
167	28	100				157	if (has_supplemental_latin (@$types)) {
168	10					39	$upgrade = 'Accented Latin';
169							}
170	28	100				210	if (has_extended_latin (@$types)) {
171	7					23	$upgrade = 'Exotic Latin' ;
172							}
173	28	50				146	if (has_latin_extended_additional (@$types)) {
174	0					0	$upgrade = 'Superfreak Latin';
175							}
176	28	100				139	unless ($upgrade) {
177	16					192	return ($runs, $types);
178							}
179	12					42	my $run_count = -1;
180	12					52	for my $r (@$runs) {
181	211					505	my $type = shift @types;
182	211	100				825	if ($type =~ /Latin/) {
183	198					427	$type = $upgrade;
184							}
185	211	100				659	unless ($type eq $last_type) {
186	38					86	$run_count++;
187							}
188	211					627	push @{$new_runs[$run_count]}, @$r;
	211					2246
189	211					549	$new_types[$run_count] = $type;
190	211					556	$last_type = $type;
191							}
192	12					75	return (\@new_runs, \@new_types);
193							}
194
195
196							sub has_extended_latin
197							{
198	28			28	0	151	my (@types) = @_;
199	28					100	return scalar grep { /Latin Extended-A/ } @types;
	227					1029
200							}
201
202							sub has_supplemental_latin
203							{
204	28			28	0	154	my (@types) = @_;
205	28					123	return scalar grep { /Latin-1 Supplement/ } @types;
	227					877
206							}
207
208							sub has_latin_extended_additional
209							{
210	28			28	0	151	my (@types) = @_;
211	28					91	return scalar grep { /Latin Extended Additional/ } @types;
	227					1064
212							}
213
214
215							sub identify
216							{
217	56			56	0	45567	my ($self, $sample, %scripts) = @_;
218
219							# Check for Korean
220
221	56	50	33			816	if (exists $scripts{'Hangul Syllables'} \|\|
			33
			33
222							exists $scripts{'Hangul Jamo'} \|\|
223							exists $scripts{'Hangul Compatibility Jamo'} \|\|
224							exists $scripts{'Hangul'}) {
225	0					0	return [{ name =>'korean', score => 1 }];
226							}
227
228	56	100				234	if (exists $scripts{'Greek and Coptic'}) {
229	1					40	return [{ name =>'greek', score => 1 }];
230							}
231
232	55	50	33			575	if (exists $scripts{'Katakana'} \|\|
			33
233							exists $scripts{'Hiragana'} \|\|
234							exists $scripts{'Katakana Phonetic Extensions'}) {
235	0					0	return [{ name =>'japanese', score => 1 }];
236							}
237
238	55	50	66			604	if (exists $scripts{'CJK Unified Ideographs'} \|\|
			33
			33
239							exists $scripts{'Bopomofo'} \|\|
240							exists $scripts{'Bopomofo Extended'} \|\|
241							exists $scripts{'KangXi Radicals'}) {
242	1					19	return [{ name => 'chinese', score => 1 }];
243							}
244
245	54	100				525	if (exists $scripts{'Cyrillic'}) {
246	7					53	return $self->check ($sample, @CYRILLIC);
247							}
248
249	47	50	66			1012	if (exists $scripts{'Arabic'} \|\|
			33
250							exists $scripts{'Arabic Presentation Forms-A'} \|\|
251							exists $scripts{'Arabic Presentation Forms-B'}) {
252	1					8	return $self->check ($sample, @ARABIC);
253							}
254
255	46	50				164	if (exists $scripts{'Devanagari'}) {
256	0					0	return $self->check ($sample, @DEVANAGARI);
257							}
258
259							# Try languages with unique scripts
260
261	46					221	for my $s (@SINGLETONS) {
262	811	100				3871	if (exists $scripts{$s}) {
263	1					19	return [{ name => lc ($s), score => 1 }];
264							}
265							}
266
267	45	50				443	if (exists $scripts{'Superfreak Latin'}) {
268	0					0	return [{ name => 'vietnamese', score => 1 }];
269							}
270
271	45	100				150	if (exists $scripts{'Exotic Latin'}) {
272	7					53	return $self->check ($sample, @EXOTIC_LATIN);
273							}
274
275	38	100				135	if (exists $scripts{'Accented Latin'}) {
276	5					40	return $self->check ($sample, @ACCENTED_LATIN);
277							}
278
279	33	100				117	if (exists $scripts{'Basic Latin'}) {
280	5					44	return $self->check ($sample, @ALL_LATIN);
281							}
282
283	28					344	return [{ name => "unknown script: '". (join ", ", keys %scripts)."'",
284							score => 1}];
285							}
286
287							sub check
288							{
289	25			25	0	358	my ($self, $sample, @langs) = @_;
290	25					145	my $mod = __make_model ($sample);
291	25					183	my $num_tri = scalar keys %$mod;
292	25					93	my %scores;
293	25					98	for my $key (@langs) {
294	504					1741	my $l = lc ($key);
295	504	100				2752	unless (exists $self->{models}{$l}) {
296	10					43	next;
297							}
298	494					3273	my $score = __distance ($mod, $self->{models}{$l});
299	494					2678	$scores{$l} = $score;
300							}
301	25					438	my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores;
	1435					4102
302	25					103	my @out;
303	25		50			125	$num_tri \|\|=1;
304	25					162	for my $s (@sorted) {
305	439					1264	my $norm = $scores{$s}/$num_tri;
306	439					2332	push @out, { name => $s , score => $norm };
307							}
308	25					82	my $total = 0.0;
309	25					98	for (@out) {
310							$total += $_->{score}
311	439					948	}
312	25					100	for (@out) {
313	439					9354	$_->{score} /= $total;
314							}
315	25					2241	return \@out;
316							}
317
318							sub __distance
319							{
320	494			494		1527	my ($m1, $m2) = @_;
321	494					1204	my $dist = 0;
322	494					8221	for my $k (keys %$m1) {
323	47837	100				246463	$dist += (exists $m2->{$k} ? abs($m2->{$k} - $m1->{$k}) : $MAX);
324							}
325	494					5428	return $dist;
326							}
327
328							sub __make_model
329							{
330	25			25		92	my ($content) = @_;
331	25					83	my %trigrams;
332	25					1025	$content = NFC ($content); # normal form C
333							# Substitute all non-word characters with spaces
334	25					926	$content =~ s/[^[:alpha:]']/ /g;
335	25					247	for (my $i = 0; $i < length ($content) - 2; $i++) {
336	2664					29173	my $tri = lc (substr ($content, $i, 3));
337	2664					25618	$trigrams{$tri}++;
338							}
339
340							my @sorted = sort { $trigrams{$b} == $trigrams{$a} ?
341							$a cmp $b :
342	13125	100				49428	$trigrams{$b} <=> $trigrams{$a} }
343	25					1860	grep { !/\s\s/o } keys %trigrams;
	2445					9494
344	25					742	my @trimmed = splice (@sorted, 0, 300);
345	25					208	my $counter = 0;
346	25					87	my %res;
347	25					183	for my $t (@trimmed) {
348	2397					7519	$res{$t} = $counter++;
349							}
350	25					695	return \%res;
351							}
352
353							1;