File Coverage

blib/lib/Lingua/Identify/Blacklists.pm

Criterion	Covered	Total	%
statement	197	303	65.0
branch	70	144	48.6
condition	21	42	50.0
subroutine	24	30	80.0
pod	8	19	42.1
total	320	538	59.4

line	stmt	bran	cond	sub	pod	time	code
1							#--perl--
2
3							package Lingua::Identify::Blacklists;
4
5	4			4		128194	use 5.008;
	4					15
	4					380
6	4			4		21	use strict;
	4					8
	4					139
7
8	4			4		3848	use File::ShareDir qw/dist_dir/;
	4					31569
	4					619
9	4			4		40	use File::Basename qw/dirname/;
	4					8
	4					290
10	4			4		4362	use File::GetLineMaxLength;
	4					1626
	4					328
11
12	4			4		3798	use Lingua::Identify qw(:language_identification);;
	4					346792
	4					925
13	4			4		4219	use Lingua::Identify::CLD;
	4					28607
	4					193
14
15	4			4		47	use Exporter 'import';
	4					7
	4					8559
16							our @EXPORT_OK = qw( identify identify_file identify_stdin
17							train train_blacklist run_experiment
18							available_languages available_blacklists );
19							our %EXPORT_TAGS = ( all => \@EXPORT_OK );
20
21							our $VERSION = '0.04';
22
23							=encoding UTF-8
24
25							=head1 NAME
26
27							Lingua::Identify::Blacklists - Language identification for related languages based on blacklists
28
29							=head1 VERSION
30
31							Version 0.03
32
33							=head1 SYNOPSIS
34
35							use Lingua::Identfy::Blacklists qw/:all/;
36
37							# detect language for a given text
38							# (discriminate between Bosanian, Croatian and Serbian)
39							my $lang = identify( ".... text to be classified ...",
40							langs => ['bs','hr','sr']);
41
42							# check if the assumed language ('hr') is confused with another one
43							my $lang = identify( ".... text to be classified ...", assumed => 'hr' );
44
45							# use a general-purpose identfier and check confusable langauges if necessary
46							my $lang = identify( ".... text to be classified ...");
47
48							# delect language in the given file (Unicode UTF-8 is assumed)
49							my $lang = identify_file( $filename, langs => [...] );
50							my $lang = identify_file( $filename, assumed => '..' );
51							my $lang = identify_file( $filename );
52
53							# delect language for every line separately from the given file
54							# (return a list of lang-IDs)
55							my @langs = identify_file( $filename, every_line => 1, langs = [...] );
56							my @langs = identify_file( $filename, every_line => 1, assumed = '..' );
57							my @langs = identify_file( $filename, every_line => 1 );
58
59
60							# learn classifiers (blacklists) for all pairs of languages
61							# given some training data
62							train( { cs => $file_with_cs_text,
63							sk => $file_with_sk_text,
64							pl => $file_with_pl_text } );
65
66							# learn a blacklist from a given pair of texts (prints to STDOUT)
67							train_blacklist( $filename1, $filename2 );
68
69							# ... the same but write to outfile
70							train_blacklist( $filename1, $filename2, outfile => $outfilename );
71
72							# train and evaluate the classification using given training/test data
73							my @traindata = ($trainfile1, $trainfile2, $trainfile3);
74							my @evaldata = ($testfile1, $testfile2, $testfile3);
75							run_experiment(\@traindata, \@evaldata, $lang1 $lang2, $lang3);
76
77							# train with different parameters (optional)
78							my %para = (
79							min_high => 5, # minimal token frequency in one langusgae
80							max_low => 2, # maximal token frequency in the other language
81							min_diff => 0.7 ); # score difference threshold
82
83							train( { cs => $file_with_cs_text, sk => $file_with_sk_text }, %para );
84
85							=head1 Description
86
87							This module adds a blacklist classifier to a general purpose language identification tool. Related languages can easily be confused with each other and standard language detection tools do not work very well for distinguishing them. With this module one can train so-called blacklists of words for language pairs containing words that should not (or very rarely) occur in one language while being quite common in the other. These blacklists are then used to discriminate between those "confusable" related languages.
88
89							Since version 0.03 it also integrates a standard language identifier (Lingua::Identify::CLD) and can now be used for general language identification. It calls the blacklist classifier only for those languages that can be confused and for which appropriate blacklists are trained.
90
91
92							=head1 Settings
93
94							Module-internal variables that can be modified:
95
96							$BLACKLISTDIR # directory with all blacklists (default: module-share-dir)
97							$LOWERCASE # lowercase all data, yes/no (1/0), default: 1
98							$TOKENIZE # tokenize all data, yes/no (1/0), default: 1
99							$ALPHA_ONLY # don't use tokens with non-alphabetic characters, default: 1
100							$MAX_LINE_LENGTH # max line length when reading from files (default=2**16)
101							$CLD_TEXT_SIZE # text size in characters used for language ident. with CLD
102							$VERBOSE # verbose output (default=0)
103
104							Tokenization is very simple and replaces all non-alphabetic characters with a white-space character.
105
106							=cut
107
108
109							our $BLACKLISTDIR;
110							eval{ $BLACKLISTDIR = &dist_dir('Lingua-Identify-Blacklists') . '/blacklists' };
111
112							our $LOWERCASE = 1;
113							our $TOKENIZE = 1;
114							our $ALPHA_ONLY = 1;
115							our $MAX_LINE_LENGTH = 2**16; # limit the length of one line to be read
116							our $CLD_TEXT_SIZE = 2**16; # text size used for detecting lang with CLD
117							our $VERBOSE = 0;
118
119							my %blacklists = (); # hash of blacklists (langpair => {blacklist}, ...)
120							my %confusable = (); # hash of confusable languages (lang => [other_langs])
121
122							## the compact language identifier from Google Chrome
123							my $CLD = new Lingua::Identify::CLD;
124
125
126							# load all blacklists in the gneral BLACKLISTDIR
127							&load_blacklists( $BLACKLISTDIR );
128
129
130
131
132							=head1 Exported Functions
133
134							=head2 C<$langID = identify( $text [,%options] )>
135
136							Analyses a given text and returns a language ID as the result of the classification. C<%options> can be used to change the behaviour of the classifier. Possible options are
137
138							assumed => $assumed_lang
139							langs => \@list_of_possible_langs
140							use_margin => $score
141
142							If C are specified, it runs the classifier with blacklists for those languages (in a cascaded way, i.e. best1 = lang1 vs lang2, best2 = best1 vs lang3, ...). If C is specified, it runs all versus all and returns the language that wins the most (with margin=$score).
143
144							If the C language is given, it runs the blacklist classifier for all languages that can be confused with $assumed_lang (if blacklist models exist for them).
145
146							If neither C not C are specified, it first runs a general-purpose language identification (using Lingua::Identify::CLD and Lingua::Identify) and then checks with the blacklist classifier whether the detected language can be confused with another one. For example, CLD frequently classifies Serbian and Bosnian texts as Croatian but the blacklist classifier will detect that (and hopefully correct the decision).
147
148							=cut
149
150							sub identify{
151	605			605	1	3349	my $text = shift;
152	605					1703	my %options = @_;
153
154	605					1207	my %dic = ();
155	605					1052	my $total = 0;
156
157							# run the blacklist classifier if 'langs' are specified
158	605	50				2004	if (exists $options{langs}){
159	0					0	&process_string( $text, \%dic, $total, $options{text_size} );
160	0					0	return &classify( \%dic, %options );
161							}
162
163							# otherwise: check if there is an 'assumed' language
164							# if not: classify with CLD
165	605	50				2568	$options{assumed} = &identify_language( $text )
166							unless (exists $options{assumed});
167
168							# if there is an 'assumed' language:
169							# check if it can be confused with others (i.e. blacklists exist)
170	605	100				2494	if (exists $confusable{$options{assumed}}){
171	603					1773	$options{langs} = $confusable{$options{assumed}};
172							# finally: process the text and classify
173	603					1997	&process_string( $text, \%dic, $total );
174	603					3437	return &classify( \%dic, %options );
175							}
176	2					10	return $options{assumed};
177							}
178
179
180
181							=head2 C<$langID = identify_file( $filename [,%options] )>
182
183							Does the same as C but reads text from a file. It also takes the same options as the 'identify' function but allows two extra options:
184
185							text_size => $size, # number of tokens to be used for classification
186							every_line => 1
187
188							Using the C option, the classifier checks every input line seperately and returns a list of language ID's.
189
190							@langIDs = identify_file( $filename, every_line => 1, %options )
191
192							=cut
193
194
195
196							sub identify_file{
197	6			6	1	3480	my $file = shift;
198	6					36	my %options = @_;
199
200	6					12	my %dic = ();
201	6					14	my $total = 0;
202	6					18	my @predictions = ();
203
204	6	50				39	my $fh = defined $file ? open_file($file) : *STDIN;
205	6					70	my $reader = File::GetLineMaxLength->new($fh);
206
207							# mode 1: classify every line separately
208	6	100				206	if ($options{every_line}){
209	3					10	my @predictions = ();
210	3					17	while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
211	600					121627	chomp $line;
212	600					2223	push( @predictions, &identify( $line, %options ) );
213							}
214	3					636	return @predictions;
215							}
216
217							# mode 2: classify all text together (optional: size limit)
218	3					9	my $text = '';
219	3					14	while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
220
221							# save text if no languages are given (for blacklists)
222	600	50	33			121868	unless (exists $options{langs} \|\| exists $options{assumed}){
223	600	100				13479	if ( length($text) < $CLD_TEXT_SIZE ){
224	72					865	$text .= $line;
225							}
226							}
227
228							# prepare the data for blacklist classification
229							# (TODO: is this cheaper than keeping the text in memory and
230							# processing it later when needed?)
231	600					1275	chomp $line;
232	600					1491	&process_string($line,\%dic,$total);
233	600	50				4887	if ($options{text_size}){ # use only a certain number of words
234	0	0				0	if ($total > $options{text_size}){
235	0	0				0	print STDERR "use $total tokens for classification\n"
236							if ($VERBOSE);
237	0					0	last;
238							}
239							}
240							}
241
242							# no languages selected?
243	3	50				88	unless (exists $options{langs}){
244							# no assumed language set
245	3	50				9	unless (exists $options{assumed}){
246							# try to identify with the text we have saved above
247	3	50				23	$options{assumed} = &identify_language( $text )
248							unless (exists $options{assumed});
249							}
250	3	50				18	if (exists $confusable{$options{assumed}}){
251	3					11	$options{langs} = $confusable{$options{assumed}};
252							}
253							}
254
255							# finally: classify with blacklists
256	3	50				16	if (exists $options{langs}){
257	3					48	return &classify( \%dic, %options );
258							}
259
260							# no blacklists in this case ...
261	0					0	return $options{assumed};
262							}
263
264
265
266							=head2 C<$langID = identify_stdin( [,%options] )>
267
268							The same as C but reads from STDIN
269
270							=cut
271
272
273							sub identify_stdin{
274	0			0	1	0	return identify_file( undef, @_ );
275							}
276
277
278
279
280							=head2 C
281
282							Trains classifiers by learning blacklisted words for pairwise language discrimination. Returns nothing. Blacklists are stored in C. You may have to run the process as administrator if you don't have write permissions.
283
284							C<%traindata> is a hash of training data files associated with their corresponding language IDs:
285
286							'hr' => $croatian_text_file,
287							'sr' => $serbian_text_file,
288							...
289
290							C<%options> is a hash of optional parameters that change the behaviour of the learning algorithm. Possible parameters are:
291
292							min_high => $freq1, # minimal token frequency in one langusgae
293							max_low => $freq2, # maximal token frequency in the other language
294							min_diff => $score, # score difference threshold
295							text_size => $size, # maximum number of tokens to be used per text
296
297
298							=cut
299
300
301							sub train{
302	1			1	1	25	my $traindata = shift;
303	1					3	my %options = @_;
304
305	1					2	my @langs = keys %{$traindata};
	1					7
306
307	1					6	for my $s (0..$#langs){
308	3					19	for my $t ($s+1..$#langs){
309	3					267	print "traing blacklist for $langs[$s]-$langs[$t] ... ";
310	3					39	&train_blacklist( $$traindata{$langs[$s]},$$traindata{$langs[$t]},
311							outfile => "$BLACKLISTDIR/$langs[$s]-$langs[$t].txt",
312							%options );
313	3					986	print "saved in '$BLACKLISTDIR/$langs[$s]-$langs[$t].txt'\n";
314							}
315							}
316							}
317
318
319							=head2 C
320
321							This function learns a blacklist of words to discriminate between the language given in $file1 and the language given in $file2. It takes the same arguments (%options) as the C function above with one additional parameter:
322
323							outfile => $output_file
324
325							Using this parameter, the blacklist will be written to the specified file. Otherwise it will be printed to STDOUT.
326
327							The function returns nothing otherwise.
328
329							=cut
330
331
332
333							sub train_blacklist{
334	3			3	1	15	my ($file1,$file2,%options) = @_;
335
336	3	50				16	my $min_high = exists $options{min_high} ? $options{min_high} : 10;
337	3	50				85	my $max_low = exists $options{min_low} ? $options{max_low} : 3;
338	3	50				10	my $min_diff = exists $options{min_diff} ? $options{min_diff} : 0.8;
339
340	3					9	my %dic1=();
341	3					6	my %dic2=();
342
343	3					24	my $total1 = &read_file($file1,\%dic1,$options{text_size});
344	3					29	my $total2 = &read_file($file2,\%dic2,$options{text_size});
345
346	3	50				24	if ($options{outfile}){
347	3	50				386	mkdir dirname($options{outfile}) unless (-d dirname($options{outfile}));
348	3		50			483	open O,">$options{outfile}" \|\| die "cannot write to $options{outfile}\n";
349	3					63	binmode(O,":encoding(UTF-8)");
350							}
351
352	3					24432	foreach my $w (keys %dic1){
353	57240	100	66			418085	next if ((!exists $dic1{$w} \|\| $dic1{$w}<$min_high) &&
			100
			66
354							(!exists $dic2{$w} \|\| $dic2{$w}<$min_high));
355	3691	100	66			26389	next if ((exists $dic1{$w} && $dic1{$w}>$max_low) &&
			100
			66
356							(exists $dic2{$w} && $dic2{$w}>$max_low));
357
358	926	50				2080	my $c1 = exists $dic1{$w} ? $dic1{$w} : 0;
359	926	100				2115	my $c2 = exists $dic2{$w} ? $dic2{$w} : 0;
360
361	926					1366	my $s1 = $c1 * $total2;
362	926					945	my $s2 = $c2 * $total1;
363	926					1446	my $diff = ($s1 - $s2) / ($s1 + $s2);
364
365	926	100				2434	if (abs($diff) > $min_diff){
366	468	50				810	if ($options{outfile}){
367	468					4864	print O "$diff\t$w\t$c1\t$c2\n";
368							}
369							else{
370	0					0	print "$diff\t$w\t$c1\t$c2\n";
371							}
372							}
373							}
374							# don't forget words that do NOT appear in dic1!!!
375	3					52112	foreach my $w (keys %dic2){
376	69888	100				169996	next if (exists $dic1{$w});
377	48705	100				155449	next if ($dic2{$w}<10);
378	429	50				920	my $c1 = exists $dic1{$w} ? $dic1{$w} : 0;
379	429	50				939	my $c2 = exists $dic2{$w} ? $dic2{$w} : 0;
380	429	50				858	if ($options{outfile}){
381	429					3045	print O "-1\t$w\t$c1\t$c2\n";
382							}
383							else{
384	0					0	print "-1\t$w\t$c1\t$c2\n";
385							}
386							}
387	3	50				58813	close O if ($options{outfile});
388							}
389
390							=head2 C<@langs = available_languages()>
391
392							Returns a list of languages covered by the blacklists in the BLACKLISTDIR.
393
394							=cut
395
396							sub available_languages{
397	0	0		0	1	0	unless (keys %blacklists){
398	0					0	&load_blacklists( $BLACKLISTDIR );
399							}
400	0					0	my %langs = ();
401	0					0	foreach (keys %blacklists){
402	0					0	my ($lang1,$lang2) = split(/\-/);
403	0					0	$langs{$lang1}=1;
404	0					0	$langs{$lang2}=1;
405							}
406	0					0	return keys %langs;
407							}
408
409
410							=head2 C<%lists = available_blacklists()>
411
412							Resturns a hash of available language pairs (for which blacklists exist in the system).
413
414							%lists = ( srclang1 => { trglang1a => blacklist1a, trglang1b => blacklist1b },
415							srclang2 => { trglang2a => blacklist2a, ... }
416							.... )
417
418							=cut
419
420
421							sub available_blacklists{
422	4	50		4	1	33	unless (keys %blacklists){
423	0					0	&load_blacklists( $BLACKLISTDIR );
424							}
425	4					12	my %pairs = ();
426	4					24	foreach (keys %blacklists){
427	24					62	my ($lang1,$lang2) = split(/\-/);
428	24					87	$pairs{$lang1}{$lang2} = $_;
429	24	50	66			134	$pairs{$lang2}{$lang1} = $_
430							unless (defined $pairs{$lang2} && defined $pairs{$lang2}{$lang1});
431							}
432	4					46	return %pairs;
433							}
434
435
436
437
438							=head2 C
439
440							This function allows to run experiments, i.e. training and evaluating classifiers for the given languages (C<@langs>). The arrays of training data and test data need to be of the same size as C<@langs>. The function prints the overall accurcy and a confusion table given the data sets and the classification. C<%options> can be used to set classifier-specific parameters.
441
442							=cut
443
444
445							sub run_experiment{
446
447	4			4		5556	use Benchmark;
	4					40594
	4					40
448
449	0			0	1	0	my $trainfiles = shift;
450	0					0	my $evalfiles = shift;
451	0	0				0	my $options = ref($_[0]) eq 'HASH' ? shift : {};
452
453	0					0	my @traindata =
454	0	0				0	ref($trainfiles) eq 'ARRAY' ? @{$trainfiles} : split(/\s+/,$trainfiles);
455	0					0	my @evaldata =
456	0	0				0	ref($evalfiles) eq 'ARRAY' ? @{$evalfiles} : split(/\s+/,$evalfiles);
457	0					0	my @langs = @_;
458
459	0	0				0	die "no languages given!\n" unless (@langs);
460	0	0	0			0	die "no training nor evaluation data given!\n"
461							unless ($#traindata == $#evaldata \|\| $#traindata == $#langs);
462
463	0					0	my %trainset = ();
464	0					0	for (0..$#langs){ $trainset{$langs[$_]} = $traindata[$_]; }
	0					0
465
466							# train blacklists
467
468	0	0				0	if ($#traindata == $#langs){
469	0		0			0	$BLACKLISTDIR = $$options{blacklist_dir} \|\| "blacklist-experiment";
470	0					0	my $t1 = new Benchmark;
471	0					0	&train( \%trainset, %{$options} );
	0					0
472	0					0	print STDERR "training took: ".
473							timestr(timediff(new Benchmark, $t1)).".\n";
474							}
475
476	0					0	&initialize();
477
478							# classify test data
479
480	0	0				0	if ($#evaldata == $#langs){
481	0					0	print STDERR "classify ....\n";
482
483	0					0	my $correct=0;
484	0					0	my $count=0;
485	0					0	my %guesses=();
486
487	0					0	my %correct_lang=();
488	0					0	my %count_lang=();
489
490	0					0	my $t1 = new Benchmark;
491	0					0	foreach my $i (0..$#langs){
492	0		0			0	open IN,"<:encoding(UTF-8)",$evaldata[$i] \|\| die "...";
493	0					0	while (){
494	0					0	chomp;
495	0					0	my %dic = ();
496	0					0	&process_string($_,\%dic);
497	0					0	my $guess = &classify(\%dic,@langs);
498	0					0	$count++;
499	0					0	$count_lang{$langs[$i]}++;
500	0	0				0	if ($guess eq $langs[$i]){
501	0					0	$correct++;
502	0					0	$correct_lang{$langs[$i]}++;
503							}
504	0					0	$guesses{$langs[$i]}{$guess}++;
505							}
506	0					0	close IN;
507							}
508	0					0	print STDERR "classification took: ".
509							timestr(timediff(new Benchmark, $t1)).".\n";
510
511	0					0	printf "accuracy: %6.4f\n ",$correct/$count;
512	0					0	foreach my $c (@langs){
513	0					0	print " $c";
514							}
515	0					0	print "\n";
516	0					0	foreach my $c (@langs){
517	0					0	print "$c ";
518	0					0	foreach my $g (@langs){
519	0					0	printf "%4d",$guesses{$c}{$g};
520							}
521	0					0	printf " %6.4f",$correct_lang{$c}/$count_lang{$c};
522	0					0	print "\n";
523							}
524							}
525	0					0	system("wc -l $Lingua::Identify::Blacklists::BLACKLISTDIR/*.txt");
526							}
527
528
529							=head2 Module-internal functions
530
531							The following functions are not exported and are mainly used for internal purposes (but may be used from the outside if needed).
532
533							initialize() # reset the repository of blacklists
534							identify_language($text) # return lang-ID for $text (using CLD)
535							classify(\%dic,%options) # run the classifier
536							classify_cascaded(\%dic,@langs) # run a cascade of binary classifications
537
538							# run all versus all and return the one that wins most binary decisions
539							# (a score margin is used to adjust the reliability of the decisions)
540
541							classify_with_margin(\%dic,$margin,@langs)
542
543							load_blacklists($dir) # load all blacklists available in $dir
544							load_blacklist(\%list,$dir, # load a lang-pair specific blacklist
545							$lang1,$lang2)
546							read_file($file,\%dic,$max) # read a file and count token frequencies
547							process_string($string) # process a given string (lowercasing ...)
548
549							=cut
550
551
552	0			0	0	0	sub initialize{ %blacklists = (); %confusable = (); }
	0					0
553
554							sub identify_language{
555	608			608	0	3136	my ($lang, $id, $conf) = $CLD->identify( $_[0] );
556
557							# strangely enough CLD is not really reliable for English
558							# (all kinds of garbish input is recognized as English)
559							# --> check with Lingua::Identify
560	608	100				290648	if ($id eq 'en'){
561	2	100				12	$id = $id = langof( $_[0] ) ? $id : 'unknown';
562							}
563	608					9051	return $id;
564							}
565
566							sub classify{
567	606			606	0	1078	my $dic = shift;
568	606					2488	my %options = @_;
569	606	50				1848	$options{langs} = '' unless ($options{langs});
570
571	606					1975	my @langs = ref($options{langs}) eq 'ARRAY' ?
572	606	50				2204	@{$options{langs}} : split( /\s+/, $options{langs} ) ;
573
574	606	50				1501	@langs = available_languages() unless (@langs);
575
576	606	50				1540	return &classify_with_margin( $dic, $options{use_margin}, @langs )
577							if ($options{use_margin});
578	606					1531	return &classify_cascaded( $dic, @langs );
579							}
580
581							sub classify_cascaded{
582	606			606	0	852	my $dic = shift;
583	606					1444	my @langs = @_;
584
585	606					920	my $lang1 = shift(@langs);
586	606					1159	foreach my $lang2 (@langs){
587
588							# load blacklists on demand
589	1212	100				4382	unless (exists $blacklists{"$lang1-$lang2"}){
590	4					22	$blacklists{"$lang1-$lang2"}={};
591	4					24	&load_blacklist($blacklists{"$lang1-$lang2"},
592							$BLACKLISTDIR,$lang1,$lang2);
593							}
594	1212					2418	my $list = $blacklists{"$lang1-$lang2"};
595
596	1212					1630	my $score = 0;
597	1212					1318	foreach my $w (keys %{$dic}){
	1212					124792
598	448258	100				1066843	if (exists $$list{$w}){
599	32881					88165	$score += $$dic{$w} * $$list{$w};
600	32881	50				73836	print STDERR "$$dic{$w} x $w found ($$list{$w})\n" if ($VERBOSE);
601							}
602							}
603	1212	100				47538	if ($score < 0){
604	389					710	$lang1 = $lang2;
605							}
606	1212	50				3976	print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
607							}
608	606					42116	return $lang1;
609							}
610
611
612							# OTHER WAY OF CLASSIFYING
613							# test all against all ...
614
615							sub classify_with_margin{
616	0			0	0	0	my $dic = shift;
617	0					0	my $margin = shift;
618	0					0	my @langs = @_;
619
620	0					0	my %selected = ();
621	0					0	while (@langs){
622	0					0	my $lang1 = shift(@langs);
623	0					0	foreach my $lang2 (@langs){
624
625							# load blacklists on demand
626	0	0				0	unless (exists $blacklists{"$lang1-$lang2"}){
627	0					0	$blacklists{"$lang1-$lang2"}={};
628	0					0	&load_blacklist($blacklists{"$lang1-$lang2"},
629							$BLACKLISTDIR,$lang1,$lang2);
630							}
631	0					0	my $list = $blacklists{"$lang1-$lang2"};
632
633	0					0	my $score = 0;
634	0					0	foreach my $w (keys %{$dic}){
	0					0
635	0	0				0	if (exists $$list{$w}){
636	0					0	$score += $$dic{$w} * $$list{$w};
637	0	0				0	print STDERR "$$dic{$w} x $w found ($$list{$w})\n"
638							if ($VERBOSE);
639							}
640							}
641	0	0				0	next if (abs($score) < $margin);
642	0	0				0	if ($score < 0){
643							# $selected{$lang2}-=$score;
644	0					0	$selected{$lang2}++;
645	0	0				0	print STDERR "select $lang2 ($score)\n" if ($VERBOSE);
646							}
647							else{
648							# $selected{$lang1}+=$score;
649	0					0	$selected{$lang1}++;
650	0	0				0	print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
651							}
652							}
653							}
654	0					0	my ($best) = sort { $selected{$b} <=> $selected{$a} } keys %selected;
	0					0
655	0					0	return $best;
656							}
657
658
659							# load_all_blacklists = alias for load_blacklists
660
661	0			0	0	0	sub load_all_blacklists{ return load_blacklists(@_); }
662
663							sub load_blacklists{
664	4		33	4	0	27	my $dir = shift \|\| $BLACKLISTDIR;
665
666	4	50				219	opendir(my $dh, $dir) \|\| die "cannot read directory '$dir'\n";
667	4					108	while(readdir $dh) {
668	32	100				458	if (/^(.)-(.).txt$/){
669	24					262	$blacklists{"$1-$2"}={};
670	24					177	&load_blacklist($blacklists{"$1-$2"}, $dir, $1, $2);
671							}
672							}
673	4					196	closedir $dh;
674
675							# update list of confusable languages
676	4					26	my %lists = &available_blacklists();
677	4					22	foreach my $lang (keys %lists){
678	36					39	@{$confusable{$lang}} = keys %{$lists{$lang}};
	36					86
	36					81
679	36					47	unshift( @{$confusable{$lang}}, $lang );
	36					204
680							}
681							}
682
683
684							sub load_blacklist{
685	28			28	0	214	my ($list,$dir,$lang1,$lang2) = @_;
686
687	28					57	my $inverse = 0;
688	28	100				1974	if (! -e "$dir/$lang1-$lang2.txt"){
689	4					15	($lang1,$lang2) = ($lang2,$lang1);
690	4					7	$inverse = 1;
691							}
692
693	4		50	4		46	open F,"<:encoding(UTF-8)","$dir/$lang1-$lang2.txt" \|\| die "...";
	4					9
	4					40
	28					1944
694	28					105837	while (){
695	348340					688923	chomp;
696	348340					1019433	my ($score,$word) = split(/\t/);
697	348340	100				1886411	$$list{$word} = $inverse ? 0-$score : $score;
698							}
699	28					1732	close F;
700							}
701
702							sub open_file{
703	12			12	0	23	my $file = shift;
704							# allow gzipped input
705	12					22	my $fh;
706	12	50				61	if ($file=~/\.gz$/){
707	0		0			0	open $fh,"gzip -cd < $file \|" \|\| die "cannot open file '$file'";
708	0					0	binmode($fh,":encoding(UTF-8)");
709							}
710							else{
711	12		50			1227	open $fh,"<:encoding(UTF-8)",$file \|\| die "cannot open file '$file'";
712							}
713	12					1262	return $fh;
714							}
715
716							sub read_file{
717	6			6	0	22	my ($file,$dic,$max)=@_;
718
719							# use File::GetLineMaxLength to avoid filling the memory
720							# when reading from files without new lines
721	6					22	my $fh = open_file( $file );
722	6					68	my $reader = File::GetLineMaxLength->new($fh);
723
724	6					166	my $total = 0;
725	6					76	while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
726	1200					242853	chomp $line;
727	1200					2835	&process_string($line,$dic,$total);
728	1200	50				9707	if ($max){
729	0	0				0	if ($total > $max){
730	0					0	print STDERR "read $total tokens from $file\n";
731	0					0	last;
732							}
733							}
734							}
735	6					517	close $fh;
736	6					80	return $total;
737							}
738
739
740							# process_string($string,\%dic,\$wordcount[,$maxwords])
741
742							sub process_string{
743	2403	50		2403	0	183010	$_[0]=lc($_[0]) if ($LOWERCASE);
744	4	50		4		10645	$_[0]=~s/((\A\|\s)\P{IsAlpha}+\|\P{IsAlpha}+(\s\|\Z))/ /gs if ($TOKENIZE);
	4					11
	4					92
	2403					2831403
745
746	2403	50				817104	my @words = $ALPHA_ONLY ?
747							grep(/^\p{IsAlpha}/,split(/\s+/,$_[0])) :
748							split(/\s+/,$_[0]);
749
750							# use only $maxwords words
751	2403	50				62516	splice(@words,$_[3]) if ($_[3]);
752
753	2403					5741	foreach my $w (@words){${$_[1]}{$w}++;$_[2]++;}
	1016058					945553
	1016058					2204555
	1016058					1345165
754							}
755
756							1;
757
758							__END__