File Coverage

blib/lib/Lingua/Identifier.pm
Criterion Covered Total %
statement 67 72 93.0
branch 6 8 75.0
condition n/a
subroutine 16 16 100.0
pod 4 4 100.0
total 93 100 93.0


line stmt bran cond sub pod time code
1             package Lingua::Identifier;
2             $Lingua::Identifier::VERSION = '0.01';
3 2     2   49595 use 5.014004;
  2         9  
  2         112  
4 2     2   14 use strict;
  2         4  
  2         101  
5 2     2   14 use warnings FATAL => 'all';
  2         8  
  2         120  
6              
7 2     2   2229 use File::ShareDir 'dist_dir';
  2         15198  
  2         271  
8 2     2   2291 use File::Spec::Functions;
  2         1870  
  2         374  
9              
10 2     2   1964 use Math::Matrix::MaybeGSL 0.006;
  2         110382  
  2         160  
11              
12 2     2   1312 use Lingua::Identifier::ForwardProp;
  2         4  
  2         54  
13 2     2   1604 use Lingua::Identifier::Feature::Trigrams;
  2         7  
  2         59  
14 2     2   1425 use Lingua::Identifier::Feature::Alphabet;
  2         9  
  2         2783  
15              
16             =encoding UTF-8
17              
18             =head1 NAME
19              
20             Lingua::Identifier - A NN based approach for language identification
21              
22             =cut
23              
24             our $sharedir = dist_dir('Lingua-Identifier');
25              
26             our $features = do( catfile($sharedir, "features.dmp"));
27             die __PACKAGE__ . "- could not load 'features.dmp'" unless defined $features;
28              
29             our $classes = do( catfile($sharedir, "classes.dmp"));
30             die __PACKAGE__ . "- could not load 'classes.dmp'" unless defined $classes;
31              
32             our $thetas;
33             _load_thetas($sharedir);
34              
35              
36             =head1 SYNOPSIS
37              
38             use Lingua::Identifier;
39              
40             my $identifier = Lingua::Identifier->new();
41              
42             # identify language on a file
43             my $lang = $identifier->identify_file("text.txt");
44              
45             # identify language on a string
46             my $lang = $identifier->identify($string);
47              
48             =head1 DESCRIPTION
49              
50             This documentation is not ready yet. These releases are just for
51             CPANtesters testing.
52              
53             =head2 C
54              
55             Constructs a new Language Identification object.
56              
57             my $identifier = Lingua::Identifier->new();
58              
59             =cut
60              
61             sub new {
62              
63 1     1 1 1566 return bless { languages => $classes }, __PACKAGE__;
64             }
65              
66             =head2 C
67              
68             Returns the list of codes for the active languages.
69              
70             =cut
71              
72             sub languages {
73 1     1 1 8 my $self = shift;
74 1         2 return @{$self->{languages}};
  1         25  
75             }
76              
77             =head2 C
78              
79             This method receives a filename and tries to identify its langauge.
80              
81             In scalar context returns the language id. In list context returns an
82             associative array, with language codes and respective scores.
83              
84             my $lang = $identifier->identify_file("sometext.txt");
85              
86             =cut
87              
88             sub identify_file {
89 1     1 1 13 my ($self, $filename) = @_;
90              
91 1         5 my $string = _load_file($filename);
92 1         7 $self->identify($string);
93             }
94              
95             =head2 C
96              
97             This method receives a string and tries to identify its langauge.
98              
99             In scalar context returns the language id. In list context returns an
100             associative array, with language codes and respective scores.
101              
102             my $lang = $identifier->identify($string);
103              
104             =cut
105              
106             sub identify {
107 1     1 1 3 my ($self, $string) = @_;
108              
109 1         6 my $ngrams = _compute_features($string);
110              
111 1         66 my $data = Matrix->new(scalar(@$features), 1);
112              
113 1         1420 my $i = 1;
114 1         6 for my $feature (@$features) {
115 1127 100       2077 if (exists($ngrams->{$feature})) {
116 163         413 $data->assign($i, 1, $ngrams->{$feature});
117             }
118 1127         3600 $i++;
119             }
120              
121 1         10 my $ans = Lingua::Identifier::ForwardProp::forward_prop($data, $thetas);
122              
123 1         9 my ($max, $pos) = $ans->max();
124              
125 1 50       710 if (wantarray) {
126 0         0 my $prob_classes = {};
127 0         0 my $i = 1;
128 0         0 for (@$classes) {
129 0         0 $prob_classes->{$_} = $ans->element($i++, 1);
130             }
131 0         0 return (%$prob_classes);
132             } else {
133 1         899 return $classes->[$pos-1];
134             }
135             }
136              
137             sub _load_file { ## XXXX - later might be useful to accept encoding
138 1     1   2 my $file = shift;
139 1         4 my $str = "";
140 1 50       60 open my $fh, "<:utf8", $file or die "Can not open file $file for reading: $!";
141 1         24 while (<$fh>) {
142 43         105 $str .= $_;
143             }
144 1         12 close $fh;
145              
146 1         8 return $str;
147             }
148              
149             sub _load_thetas {
150 2     2   6 my $path = shift;
151              
152 2         4 my $dir;
153              
154 2         141 opendir $dir, $path;
155 2         106 my @ts = readdir $dir;
156              
157 2         8 for my $tfile (@ts) {
158 14 100       77214014 if ($tfile =~ /theta-(\d+)\.dat/) {
159 4         40 my $file = catfile($path, $tfile);
160 4         503 print STDERR "Loading '$file'\n";
161 4         30 $thetas->[$1 - 1] = Matrix->read($file);
162             }
163             }
164              
165 2         147 closedir $dir;
166             }
167              
168             sub _compute_features {
169 1     1   2 my $str = shift;
170              
171 1         7 my $alphabets = Lingua::Identifier::Feature::Alphabet::features($str);
172 1         8 my $trigrams = Lingua::Identifier::Feature::Trigrams::features($str);
173              
174 1         605 return { %$trigrams, %$alphabets };
175             }
176              
177             =head1 AUTHOR
178              
179             Alberto Simões, C<< >>
180              
181             =head1 ACKNOWLEDGMENTS
182              
183             =over 4
184              
185             =item * Simon D. Byers
186              
187             =back
188              
189             =head1 BUGS
190              
191             Please report any bugs or feature requests to C
192             at rt.cpan.org>, or through the web interface at
193             L.
194             I will be notified, and then you'll automatically be notified of
195             progress on your bug as I make changes.
196              
197             =head1 SUPPORT
198              
199             You look for information at:
200              
201             =over 4
202              
203             =item * RT: CPAN's request tracker (report bugs here)
204              
205             L
206              
207             =item * AnnoCPAN: Annotated CPAN documentation
208              
209             L
210              
211             =item * CPAN Ratings
212              
213             L
214              
215             =item * Search CPAN
216              
217             L
218              
219             =back
220              
221             =head1 LICENSE AND COPYRIGHT
222              
223             Copyright 2014 Alberto Simões.
224              
225             This program is free software; you can redistribute it and/or modify it
226             under the terms of the the Artistic License (2.0). You may obtain a
227             copy of the full license at:
228              
229             L
230              
231             Any use, modification, and distribution of the Standard or Modified
232             Versions is governed by this Artistic License. By using, modifying or
233             distributing the Package, you accept this license. Do not use, modify,
234             or distribute the Package, if you do not accept this license.
235              
236             If your Modified Version has been derived from a Modified Version made
237             by someone other than you, you are nevertheless required to ensure that
238             your Modified Version complies with the requirements of this license.
239              
240             This license does not grant you the right to use any trademark, service
241             mark, tradename, or logo of the Copyright Holder.
242              
243             This license includes the non-exclusive, worldwide, free-of-charge
244             patent license to make, have made, use, offer to sell, sell, import and
245             otherwise transfer the Package with respect to any patent claims
246             licensable by the Copyright Holder that are necessarily infringed by the
247             Package. If you institute patent litigation (including a cross-claim or
248             counterclaim) against any party alleging that the Package constitutes
249             direct or contributory patent infringement, then this Artistic License
250             to you shall terminate on the date that such litigation is filed.
251              
252             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
253             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
254             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
255             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
256             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
257             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
258             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
259             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
260              
261             =cut
262              
263             1; # End of Lingua::Identifier