line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Encode::Detect::Upload; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION=0.04; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=encoding utf8 |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Encode::Detect::Upload - Attempt to guess user's locale encoding from IP, |
10
|
|
|
|
|
|
|
HTTP_ACCEPT_LANGUAGE and HTTP_USER_AGENT |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Encode::Detect::Upload; |
15
|
|
|
|
|
|
|
my $detector = new Encode::Detect::Upload; |
16
|
|
|
|
|
|
|
# Feelin lucky! |
17
|
|
|
|
|
|
|
my $charset = $detector->detect(); |
18
|
|
|
|
|
|
|
# More sensible |
19
|
|
|
|
|
|
|
my ( $charset_list, $meta ) = $detector->detect(); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 DESCRIPTION |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
Dealing with input from globally disperse users can be a real pain. Although when |
24
|
|
|
|
|
|
|
setting web forms to utf-8 browsers will often do the right thing, in some |
25
|
|
|
|
|
|
|
instances, such as text file uploads, you are stuck will trying to figure out |
26
|
|
|
|
|
|
|
the files charset encoding. L uses Mozilla's universal |
27
|
|
|
|
|
|
|
charset detector, which works great most of the time. But when it doesn't your |
28
|
|
|
|
|
|
|
stuck with asking the user, a user that all to often these days has a very low |
29
|
|
|
|
|
|
|
technical ability, and likely doesn't know what a charset it. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
In my experience with dealing with such user uploads, the charset of the file |
32
|
|
|
|
|
|
|
usually relates to the users OS, location and language settings. Although it's |
33
|
|
|
|
|
|
|
true that the file could have any encoding, the file could have been created on |
34
|
|
|
|
|
|
|
a different machine, with a different locale to the one that is doing the upload. |
35
|
|
|
|
|
|
|
But the use of this modules techniques along with that of |
36
|
|
|
|
|
|
|
L more cases can be handled correctly. Methods for |
37
|
|
|
|
|
|
|
helping the user chose encoding are also provided. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=cut |
40
|
|
|
|
|
|
|
|
41
|
3
|
|
|
3
|
|
112241
|
use utf8; |
|
3
|
|
|
|
|
32
|
|
|
3
|
|
|
|
|
18
|
|
42
|
3
|
|
|
3
|
|
104
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
163
|
|
43
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
11
|
|
|
3
|
|
|
|
|
140
|
|
44
|
3
|
|
|
3
|
|
14
|
use Carp; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
265
|
|
45
|
|
|
|
|
|
|
|
46
|
3
|
|
|
3
|
|
2956
|
use Encode::Detect::Upload::Data; |
|
3
|
|
|
|
|
19
|
|
|
3
|
|
|
|
|
367
|
|
47
|
3
|
|
|
3
|
|
3772
|
use Encode; |
|
3
|
|
|
|
|
46079
|
|
|
3
|
|
|
|
|
25517
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my $country_lang = \%Encode::Detect::Upload::Data::country_lang; |
50
|
|
|
|
|
|
|
my $lang_charset = \%Encode::Detect::Upload::Data::lang_charset; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Try to load some other modules |
53
|
|
|
|
|
|
|
my $has_ipcountry = eval { require IP::Country; }; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
my $has_geoip = !$has_ipcountry && eval { require Geo::IP; }; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
my $has_detect = eval { require Encode::Detect::Detector; }; |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 Methods |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=over 4 |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item new() |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item new(\%params) |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=item new(%params) |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Returns a new detection object. Parameters may be passed either as |
71
|
|
|
|
|
|
|
key/value pairs or as a hash references. The following parameters are |
72
|
|
|
|
|
|
|
recognised: |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
die_on_missing Whether missing method parameters cause fatal errors (default: true) |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub new { |
79
|
2
|
|
|
2
|
1
|
1803
|
my $class = shift; |
80
|
2
|
|
|
|
|
10
|
my %config = ( |
81
|
|
|
|
|
|
|
die_on_missing => 1, |
82
|
|
|
|
|
|
|
); |
83
|
2
|
|
|
|
|
6
|
my $param; |
84
|
2
|
50
|
33
|
|
|
25
|
if ( @_ == 1 && ref $_[0] eq 'HASH' ) { |
|
|
50
|
|
|
|
|
|
85
|
0
|
|
|
|
|
0
|
$param = $_[0]; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
elsif ( @_ % 2 == 0 ) { |
88
|
2
|
|
|
|
|
7
|
$param = { @_ }; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
else { |
91
|
0
|
|
|
|
|
0
|
croak( "Invalid parameters, must be either single hashref or key=>value pairs" ); |
92
|
|
|
|
|
|
|
} |
93
|
2
|
50
|
|
|
|
16
|
if ( $param ) { |
94
|
2
|
|
|
|
|
12
|
%config = ( |
95
|
|
|
|
|
|
|
%config, |
96
|
|
|
|
|
|
|
%$param, |
97
|
|
|
|
|
|
|
); |
98
|
|
|
|
|
|
|
} |
99
|
2
|
|
|
|
|
10
|
my $self = bless \%config, $class; |
100
|
2
|
|
|
|
|
9
|
return $self; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item get_os() |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item get_os($user_agent_string) |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Extracts the operating system name from the supplied User-Agent header value, |
109
|
|
|
|
|
|
|
or C<$ENV{HTTP_USER_AGENT}> if not supplied. Dies if no user agent string |
110
|
|
|
|
|
|
|
is available. |
111
|
|
|
|
|
|
|
Returns either C, C, C or undefined if no match was |
112
|
|
|
|
|
|
|
made. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub get_os { |
117
|
10
|
|
|
10
|
1
|
1243
|
my $self = shift; |
118
|
10
|
|
|
|
|
19
|
my $agent = shift; |
119
|
10
|
|
100
|
|
|
51
|
$agent ||= $ENV{HTTP_USER_AGENT}; |
120
|
10
|
100
|
66
|
|
|
68
|
croak( 'No USER_AGENT string passed, and $ENV{HTTP_USER_AGENT} is empty' ) |
121
|
|
|
|
|
|
|
if !$agent && $self->{die_on_missing}; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Basic regexps for matching |
124
|
9
|
100
|
|
|
|
68
|
return 'Windows' if $agent =~ /Windows/; |
125
|
3
|
100
|
|
|
|
18
|
return 'Macintosh' if $agent =~ /\b(?:Macintosh|Mac)\b/; |
126
|
2
|
100
|
|
|
|
14
|
return 'Linux' if $agent =~ /Linux/; |
127
|
1
|
|
|
|
|
5
|
return undef; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=item get_country() |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item get_country($ip_address) |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item get_country($ip_address,$geo_ip_data_filename) |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Looks up the user's country from the supplied IP address, or C<$ENV{REMOTE_ADDR}> |
138
|
|
|
|
|
|
|
by default. Dies if neither of L or L is installed. |
139
|
|
|
|
|
|
|
Returns the ISO 2 character country code. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=cut |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub get_country { |
144
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
145
|
0
|
0
|
0
|
|
|
0
|
croak( 'Could not load IP::Country or Geo::IP' ) unless $has_ipcountry || $has_geoip; |
146
|
0
|
|
|
|
|
0
|
my $ip = shift; |
147
|
0
|
|
0
|
|
|
0
|
$ip ||= $ENV{REMOTE_ADDR}; |
148
|
0
|
0
|
|
|
|
0
|
croak( 'No IP passed, and $ENV{REMOTE_ADDR} is empty' ) unless $ip; |
149
|
0
|
0
|
0
|
|
|
0
|
$ip=~/\A(?:0|[1-9]\d*)(?:\.(?:0|[1-9]\d*)){3}\z/ && !grep $_>255,split /\./,$ip |
150
|
|
|
|
|
|
|
or croak( "$ip is not a valid IP" ); |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Use the available IP -> Country DB |
153
|
0
|
0
|
|
|
|
0
|
if ( $has_ipcountry ) { |
154
|
0
|
|
|
|
|
0
|
my $reg = IP::Country->new(); # TODO Cache? |
155
|
0
|
|
|
|
|
0
|
my $country = $reg->inet_atocc($ip); |
156
|
0
|
0
|
|
|
|
0
|
$country = undef if $country eq '**'; |
157
|
0
|
0
|
|
|
|
0
|
return defined $country ? lc $country : $country; |
158
|
|
|
|
|
|
|
} |
159
|
0
|
0
|
|
|
|
0
|
if ( $has_geoip ) { |
160
|
0
|
|
|
|
|
0
|
my $gi; |
161
|
0
|
|
|
|
|
0
|
my $data_file = shift; |
162
|
0
|
0
|
|
|
|
0
|
if ( $data_file ) { |
163
|
0
|
0
|
|
|
|
0
|
die( "Geo::IP data file $data_file does not exist" ) unless -e $data_file; |
164
|
0
|
|
|
|
|
0
|
$gi = Geo::IP->new( $data_file, 0 ); # 0 = GEOIP_STANDARD |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
else { |
167
|
0
|
|
|
|
|
0
|
$gi = Geo::IP->new( 0 ); |
168
|
|
|
|
|
|
|
} |
169
|
0
|
|
|
|
|
0
|
return lc $gi->country_code_by_addr( $ip ); |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=item get_country_lang($iso_2code) |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Returns the language tag(s) associated with the supplied country code. |
177
|
|
|
|
|
|
|
In scalar context returns the primary language tag; in list context |
178
|
|
|
|
|
|
|
returns all associated language tags. Dies if the supplied country |
179
|
|
|
|
|
|
|
code is undefined. Returns undef if no matching country is found. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Language tags are defined in section 3.10 or RFC 2616, and can be 2 |
182
|
|
|
|
|
|
|
or 3 letters, optionally followed by a series of subtags, separated |
183
|
|
|
|
|
|
|
by dashes. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub get_country_lang { |
188
|
3
|
|
|
3
|
1
|
3610
|
my $self = shift; |
189
|
3
|
|
|
|
|
5
|
my $country_code = shift; |
190
|
3
|
100
|
|
|
|
27
|
croak( 'No country passed' ) unless defined $country_code; |
191
|
2
|
50
|
|
|
|
12
|
my $country = $country_lang->{lc $country_code} |
192
|
|
|
|
|
|
|
or return; |
193
|
2
|
100
|
|
|
|
7
|
if ( wantarray ) { |
194
|
1
|
|
|
|
|
3
|
return @{ $country->{languages} }; |
|
1
|
|
|
|
|
7
|
|
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
else { |
197
|
1
|
|
|
|
|
10
|
return $country->{languages}->[0]; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=item get_country_name($iso_2code) |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Returns the name of the country specified by the suppied 2 letter code. |
205
|
|
|
|
|
|
|
Dies if no country is specified. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub get_country_name { |
210
|
2
|
|
|
2
|
1
|
2877
|
my $self = shift; |
211
|
2
|
|
|
|
|
5
|
my $country_code = shift; |
212
|
2
|
100
|
|
|
|
20
|
croak( 'No country passed' ) unless defined $country_code; |
213
|
1
|
50
|
|
|
|
7
|
my $country = $country_lang->{lc $country_code} |
214
|
|
|
|
|
|
|
or return undef; |
215
|
1
|
|
|
|
|
7
|
return $country->{name}; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=item get_accept_lang() |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item get_accept_lang($accept_lang_string) |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Returns the accepted language tag(s) described by the supplied Accept-Language |
224
|
|
|
|
|
|
|
header value, or from C<$ENV{HTTP_ACCEPT_LANGUAGE}> if not supplied. Dies if no |
225
|
|
|
|
|
|
|
header value is available. |
226
|
|
|
|
|
|
|
In scalar context, returns the first language tag listed. In list context, |
227
|
|
|
|
|
|
|
returns all tags, in the order they are listed in the header value. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=cut |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
sub get_accept_lang { |
232
|
6
|
|
|
6
|
1
|
3580
|
my $self = shift; |
233
|
6
|
|
|
|
|
13
|
my $accept = shift; |
234
|
6
|
|
100
|
|
|
30
|
$accept ||= $ENV{HTTP_ACCEPT_LANGUAGE}; |
235
|
6
|
100
|
66
|
|
|
40
|
croak( 'No ACCEPT_LANGUAGE string passed, and $ENV{HTTP_ACCEPT_LANGUAGE} is empty' ) |
236
|
|
|
|
|
|
|
if !$accept && $self->{die_on_missing}; |
237
|
|
|
|
|
|
|
# We are going to ignore q and assume the order is accurate... might not be the best policy |
238
|
5
|
|
|
|
|
10
|
my @langs; |
239
|
|
|
|
|
|
|
my %seen; |
240
|
5
|
|
|
|
|
36
|
foreach my $language ( split(/\s*,\s*/, $accept) ) { |
241
|
7
|
|
|
|
|
30
|
my ( $lang, $q ) = split(/\s*;\s*/, $language); |
242
|
7
|
|
|
|
|
17
|
$lang = lc $lang; |
243
|
7
|
100
|
|
|
|
20
|
if ( wantarray ) { |
244
|
6
|
50
|
|
|
|
21
|
next if $seen{$lang}; # filter out any duplicates |
245
|
6
|
|
|
|
|
13
|
push( @langs, $lang ); |
246
|
6
|
|
|
|
|
24
|
$seen{$lang}++; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
else { |
249
|
1
|
|
|
|
|
8
|
return $lang; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} |
252
|
4
|
|
|
|
|
22
|
return @langs; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=item get_lang_name($language_code) |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
Returns the name of the language specified by the supplied 2 or 3 letter |
259
|
|
|
|
|
|
|
ISO-639 language code. Dies if no language code is supplied. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub get_lang_name { |
264
|
6
|
|
|
6
|
1
|
2369
|
my $self = shift; |
265
|
6
|
|
|
|
|
12
|
my $lang_code = shift; |
266
|
6
|
100
|
|
|
|
29
|
croak( 'No language passed' ) unless defined $lang_code; |
267
|
5
|
|
|
|
|
11
|
$lang_code = lc $lang_code; |
268
|
5
|
100
|
|
|
|
28
|
my $language = $lang_charset->{$lang_code} |
269
|
|
|
|
|
|
|
or return undef; |
270
|
4
|
|
|
|
|
42
|
return $language->{name}; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=item get_lang_list($language_tag) |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Returns the list of language tags which could be used for matching the supplied |
277
|
|
|
|
|
|
|
language tag. This will always include the supplied language tag. If the supplied |
278
|
|
|
|
|
|
|
tag includes a C or C subtag, or is a primary tag for which C |
279
|
|
|
|
|
|
|
or C subtags are available, all such subtags will be returned. If the |
280
|
|
|
|
|
|
|
supplied tag contains any subtags, the primary tag will also be returned. |
281
|
|
|
|
|
|
|
Dies is no language tag is supplied. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub get_lang_list { |
286
|
3
|
|
|
3
|
1
|
10172
|
my $self = shift; |
287
|
3
|
|
|
|
|
6
|
my $lang = shift; |
288
|
3
|
100
|
|
|
|
40
|
croak( 'No language passed' ) unless defined $lang; |
289
|
2
|
|
|
|
|
5
|
$lang = lc $lang; |
290
|
2
|
|
|
|
|
6
|
my @lang_list = ($lang); |
291
|
2
|
|
|
|
|
9
|
my %lang_seen = ( $lang => 1 ); |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Check for Cyrillic/Latin versions |
294
|
2
|
|
|
|
|
8
|
$lang =~ s/-(?:cyrl|latn)\z//; |
295
|
2
|
|
|
|
|
11
|
for my $sublang ("$lang-latn","$lang-cyrl") { |
296
|
4
|
100
|
|
|
|
22
|
$lang_charset->{$sublang} or next; |
297
|
2
|
50
|
|
|
|
13
|
$lang_seen{$sublang}++ |
298
|
|
|
|
|
|
|
or push @lang_list,$sublang; |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Check for general language |
302
|
2
|
100
|
|
|
|
17
|
if ( $lang =~ s/-.+\z// ) { |
303
|
1
|
50
|
|
|
|
7
|
if ( $lang_charset->{$lang} ) { |
304
|
1
|
50
|
|
|
|
6
|
$lang_seen{$lang}++ |
305
|
|
|
|
|
|
|
or push @lang_list, $lang; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
# Check for cyrl/latn versions |
308
|
1
|
|
|
|
|
6
|
for my $sublang ( "$lang-latn", "$lang-cyrl" ) { |
309
|
2
|
50
|
|
|
|
10
|
$lang_charset->{$sublang} or next; |
310
|
0
|
0
|
|
|
|
0
|
$lang_seen{$sublang}++ |
311
|
|
|
|
|
|
|
or push @lang_list, $sublang; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
2
|
|
|
|
|
14
|
return @lang_list; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=item get_lang_charset($language_tag) |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item get_lang_charset($language_tag, $os_name) |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
Returns the charset(s) used by the supplied language. If an operating system |
324
|
|
|
|
|
|
|
name is supplied, treats its character sets preferentially. Dies if no |
325
|
|
|
|
|
|
|
language tag is supplied. In scalar context, returns the best matching |
326
|
|
|
|
|
|
|
charset. In list context, returns a list of all suitable charsets. |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=cut |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub get_lang_charset { |
331
|
8
|
|
|
8
|
1
|
4841
|
my $self = shift; |
332
|
8
|
|
|
|
|
16
|
my $lang = shift; |
333
|
8
|
|
|
|
|
12
|
my $os = shift; |
334
|
8
|
100
|
|
|
|
42
|
croak( 'No language tag passed to get_lang_charset()' ) unless $lang; |
335
|
7
|
|
|
|
|
14
|
$lang = lc $lang; |
336
|
7
|
50
|
|
|
|
24
|
my $group = $lang_charset->{$lang} or return; |
337
|
7
|
|
|
|
|
20
|
my @oses = qw(windows macintosh linux); |
338
|
7
|
100
|
|
|
|
22
|
if ( $os ) { |
339
|
1
|
|
|
|
|
4
|
$os = lc $os; |
340
|
1
|
50
|
|
|
|
10
|
croak( "OS $os not recognised" ) unless $os =~ /\A(?:windows|linux|macintosh)\z/; |
341
|
1
|
|
|
|
|
6
|
@oses = ($os, grep $_ ne $os, @oses); |
342
|
|
|
|
|
|
|
} |
343
|
7
|
|
|
|
|
29
|
my @charsets = @$group{@oses}; |
344
|
7
|
100
|
|
|
|
20
|
if ( wantarray ) { |
345
|
6
|
|
|
|
|
28
|
return @charsets; |
346
|
|
|
|
|
|
|
} else { |
347
|
1
|
|
|
|
|
9
|
return $charsets[0]; |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=item get_words($sample_string) |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item get_words($sample_string, $max_words) |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Returns a list of unique words from the supplied sample string which contain |
357
|
|
|
|
|
|
|
non-ASCII characters. Returns no more than the specified maximum number |
358
|
|
|
|
|
|
|
or words, which defaults to 10. Dies if no sample text is supplied. |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=cut |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
sub get_words { |
363
|
7
|
|
|
7
|
1
|
3791
|
my $self = shift; |
364
|
7
|
|
|
|
|
13
|
my $text = shift; |
365
|
7
|
|
|
|
|
15
|
my $max = shift; |
366
|
7
|
|
100
|
|
|
26
|
$max ||= 10; |
367
|
7
|
100
|
|
|
|
35
|
croak( 'No sample text passed' ) unless $text; |
368
|
6
|
|
|
|
|
89
|
my ( @words, %words ); |
369
|
6
|
|
|
|
|
55
|
while ( $text =~ /([\w\x80-\xff]*[\x80-\xff][\w\x80-\xff]*)/g ) { |
370
|
7
|
50
|
|
|
|
38
|
next if $words{$1}++; |
371
|
7
|
|
|
|
|
21
|
push @words, $1; |
372
|
7
|
50
|
|
|
|
38
|
last if @words >= $max; |
373
|
|
|
|
|
|
|
}#while |
374
|
6
|
|
|
|
|
31
|
return @words; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=item detect(%params) |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=item detect(\%params) |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
Determines the encoding of the supplied text. In scalar context, returns the most |
383
|
|
|
|
|
|
|
likely charset code. In list context returns an arrayref of charset codes, ordered |
384
|
|
|
|
|
|
|
from most to least likely, and a hashref of metadata. Dies if any required |
385
|
|
|
|
|
|
|
parameters are not supplied. The following parameters are accepted: |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
text Text to determine the encoding of (required) |
388
|
|
|
|
|
|
|
words Maximum number of words to examine (default=10) |
389
|
|
|
|
|
|
|
ip User's IP address (default=$ENV{REMOTE_ADDR}) |
390
|
|
|
|
|
|
|
accept_lang Accept-Language header value (required, default=$ENV{HTTP_ACCEPT_LANGUAGE}) |
391
|
|
|
|
|
|
|
inc_linux Include Linux charsets? (default=0) |
392
|
|
|
|
|
|
|
ranking TODO document |
393
|
|
|
|
|
|
|
os OS name (Windows, Macintosh or Linux) |
394
|
|
|
|
|
|
|
user_agent User-Agent header value (required if os not supplied, |
395
|
|
|
|
|
|
|
default=$ENV{HTTP_USER_AGENT}) |
396
|
|
|
|
|
|
|
lang Language tag or arrayref thereof |
397
|
|
|
|
|
|
|
country Country code or arrayref thereof (required if lang not supplied) |
398
|
|
|
|
|
|
|
country_extra TODO document |
399
|
|
|
|
|
|
|
lang_extra TODO document |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
Requires a sample text string. Can optionally be passed the number of words to |
403
|
|
|
|
|
|
|
try to match (default 10), the users IP, the users OS, the user_agent string, |
404
|
|
|
|
|
|
|
the language code(S), the accept_language string, whether linux charsets should |
405
|
|
|
|
|
|
|
be included, and for advanced use you can adjust the way languages and charsets |
406
|
|
|
|
|
|
|
are ranked. Returns either a single charset (in scalar context) or a list of |
407
|
|
|
|
|
|
|
charsets ordered by most likely with associated meta data. If |
408
|
|
|
|
|
|
|
L is available it's guess is used to improve accuracy. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
For discussion of ranking heuristics and how to adjust them, see the section below. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# I'm feeling lucky |
413
|
|
|
|
|
|
|
my $charset = $detector->detect(); |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
# I'm feeling realistic |
416
|
|
|
|
|
|
|
my ( $charset_list, $charset_meta ) = $detector->detect( text => '...' ); |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
# Data structure example |
419
|
|
|
|
|
|
|
$charset_list = [ 'x-mac-cyrillic', 'x-mac-ce', 'windows-1251', 'x-mac-ukrainian'... ]; |
420
|
|
|
|
|
|
|
$charset_meta = { |
421
|
|
|
|
|
|
|
charsets => { |
422
|
|
|
|
|
|
|
'x-mac-cyrillic' => { |
423
|
|
|
|
|
|
|
pos => 1, # Ranking position |
424
|
|
|
|
|
|
|
words => [ 'Здравствуй', ... ], # Sample word list |
425
|
|
|
|
|
|
|
lang => [ 'ru', ... ], # Language tags that led to this charset |
426
|
|
|
|
|
|
|
}, |
427
|
|
|
|
|
|
|
'x-mac-ce' => { |
428
|
|
|
|
|
|
|
pos => 2, |
429
|
|
|
|
|
|
|
words => [ 'ášūŗ‚ŮÚ‚ůť', ... ], |
430
|
|
|
|
|
|
|
lang => [ 'sr', ... ], |
431
|
|
|
|
|
|
|
}, |
432
|
|
|
|
|
|
|
'windows-1251' => { |
433
|
|
|
|
|
|
|
pos => 3, |
434
|
|
|
|
|
|
|
words => [ '‡дравствуй', ... ], |
435
|
|
|
|
|
|
|
lang => [ 'ru', ... ], |
436
|
|
|
|
|
|
|
mozilla => 1, # In this example mozilla guessed wrong |
437
|
|
|
|
|
|
|
}, |
438
|
|
|
|
|
|
|
... |
439
|
|
|
|
|
|
|
}, |
440
|
|
|
|
|
|
|
lang => { |
441
|
|
|
|
|
|
|
ru => { |
442
|
|
|
|
|
|
|
name => 'Russian', # Language name |
443
|
|
|
|
|
|
|
both => 1, # Matched from both country and accept_lang |
444
|
|
|
|
|
|
|
country => 1, # Matched from country (IP) |
445
|
|
|
|
|
|
|
accept => 1, # Matched from accept_lang |
446
|
|
|
|
|
|
|
pos => 1, # Ranking position |
447
|
|
|
|
|
|
|
}, |
448
|
|
|
|
|
|
|
... |
449
|
|
|
|
|
|
|
}, |
450
|
|
|
|
|
|
|
country => { |
451
|
|
|
|
|
|
|
name => 'Russia', |
452
|
|
|
|
|
|
|
tag => 'ru', |
453
|
|
|
|
|
|
|
}, |
454
|
|
|
|
|
|
|
error => [ 'utf-8', ... ], # Text wouldn't parse as utf-8 |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=cut |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub detect { |
460
|
5
|
|
|
5
|
1
|
4401
|
my $self = shift; |
461
|
5
|
|
|
|
|
11
|
my $param; |
462
|
5
|
50
|
33
|
|
|
41
|
if ( @_ == 1 && ref $_[0] eq 'HASH' ) { |
|
|
50
|
|
|
|
|
|
463
|
0
|
|
|
|
|
0
|
$param = $_[0]; |
464
|
|
|
|
|
|
|
} |
465
|
|
|
|
|
|
|
elsif ( @_ % 2 == 0 ) { |
466
|
5
|
|
|
|
|
30
|
$param = { @_ }; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
else { |
469
|
0
|
|
|
|
|
0
|
croak( "Invalid parameters, must be either single hashref or key=>value pairs" ); |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
# TODO(LH) Maybe some param name validation |
472
|
5
|
|
|
|
|
61
|
my %conf = ( |
473
|
|
|
|
|
|
|
words => 10, |
474
|
|
|
|
|
|
|
ip => $ENV{REMOTE_ADDR}, |
475
|
|
|
|
|
|
|
user_agent => $ENV{HTTP_USER_AGENT}, |
476
|
|
|
|
|
|
|
accept_lang => $ENV{HTTP_ACCEPT_LANGUAGE}, |
477
|
|
|
|
|
|
|
inc_linux => 0, |
478
|
|
|
|
|
|
|
%$param, |
479
|
|
|
|
|
|
|
); |
480
|
|
|
|
|
|
|
|
481
|
5
|
|
|
|
|
69
|
my %rank = ( |
482
|
|
|
|
|
|
|
lang => { |
483
|
|
|
|
|
|
|
start => 'AC', |
484
|
|
|
|
|
|
|
repeat => 'AC', |
485
|
|
|
|
|
|
|
}, |
486
|
|
|
|
|
|
|
lang_both => 1, |
487
|
|
|
|
|
|
|
char => { |
488
|
|
|
|
|
|
|
windows => { |
489
|
|
|
|
|
|
|
start => 'WW', |
490
|
|
|
|
|
|
|
repeat => 'WML', |
491
|
|
|
|
|
|
|
}, |
492
|
|
|
|
|
|
|
macintosh => { |
493
|
|
|
|
|
|
|
start => 'M', |
494
|
|
|
|
|
|
|
repeat => 'MWL', |
495
|
|
|
|
|
|
|
}, |
496
|
|
|
|
|
|
|
linux => { |
497
|
|
|
|
|
|
|
start => 'LWM', |
498
|
|
|
|
|
|
|
repeat => 'LWM', |
499
|
|
|
|
|
|
|
}, |
500
|
|
|
|
|
|
|
}, |
501
|
|
|
|
|
|
|
mozilla_move => 1, |
502
|
|
|
|
|
|
|
mozilla_insert => 3, |
503
|
|
|
|
|
|
|
); |
504
|
5
|
100
|
|
|
|
23
|
if ( $conf{ranking} ) { |
505
|
2
|
|
|
|
|
14
|
%rank = ( |
506
|
|
|
|
|
|
|
%rank, |
507
|
2
|
|
|
|
|
8
|
%{ $conf{ranking} } |
508
|
|
|
|
|
|
|
); |
509
|
|
|
|
|
|
|
# Validate ranking sequences |
510
|
2
|
50
|
|
|
|
12
|
croak( 'Missing language ranking sequence' ) unless $rank{lang}->{repeat}; |
511
|
2
|
|
|
|
|
5
|
foreach my $os ( qw( windows macintosh linux ) ) { |
512
|
6
|
50
|
|
|
|
25
|
croak( 'Missing ' . ucfirst $os . ' charset ranking sequence' ) unless $rank{char}->{$os}->{repeat}; |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
# Get the OS |
517
|
5
|
50
|
|
|
|
21
|
unless ( $conf{os} ) { |
518
|
5
|
|
|
|
|
44
|
$conf{os} = $self->get_os( $conf{user_agent} ); |
519
|
|
|
|
|
|
|
} |
520
|
|
|
|
|
|
|
# Default to windows if we still don't have an OS |
521
|
5
|
50
|
|
|
|
14
|
$conf{os} = 'windows' unless $conf{os}; |
522
|
5
|
|
|
|
|
15
|
$conf{os} = lc $conf{os}; |
523
|
|
|
|
|
|
|
# OS of linux implies inc_linux |
524
|
5
|
50
|
|
|
|
16
|
$conf{inc_linux} = 1 if $conf{os} eq 'linux'; |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# Get the list of language tags |
527
|
5
|
|
|
|
|
21
|
my($country_meta, $lang_meta) = $self->_detect_get_langs(\%conf,\%rank); |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
# Get the related charsets and meta data |
530
|
5
|
|
|
|
|
23
|
my @words = $self->get_words( $conf{text}, $conf{words} ); |
531
|
5
|
|
|
|
|
22
|
my %char_hash = ( |
532
|
|
|
|
|
|
|
W => [], |
533
|
|
|
|
|
|
|
M => [], |
534
|
|
|
|
|
|
|
L => [], |
535
|
|
|
|
|
|
|
); |
536
|
5
|
|
|
|
|
8
|
my %char_meta; |
537
|
|
|
|
|
|
|
my %char_error; |
538
|
5
|
|
|
|
|
29
|
$self->_detect_check_langs(\%char_meta,\%char_hash,\%char_error,\%conf,\@words); |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# Does this parse as UTF-8? |
541
|
5
|
|
|
|
|
352
|
my $is_utf8 = 1; |
542
|
5
|
|
|
|
|
9
|
eval { decode( 'UTF-8', $conf{text}, Encode::FB_CROAK ) }; |
|
5
|
|
|
|
|
27
|
|
543
|
5
|
100
|
|
|
|
637
|
$is_utf8 = 0 if $@; |
544
|
|
|
|
|
|
|
# Make sure we have UTF-8 charset info |
545
|
5
|
100
|
|
|
|
13
|
if ( $is_utf8 ) { |
546
|
|
|
|
|
|
|
# UTF-8 could be any language, so doesn't tend to be picked up above |
547
|
0
|
|
|
|
|
0
|
$char_meta{'utf-8'} = { |
548
|
|
|
|
|
|
|
pos => 1, |
549
|
|
|
|
|
|
|
lang => [], |
550
|
1
|
|
|
|
|
7
|
words => [ map { decode( 'UTF-8', $_ ) } @words ], |
551
|
|
|
|
|
|
|
}; |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
else { |
554
|
4
|
|
|
|
|
11
|
$char_error{'utf-8'}++; |
555
|
|
|
|
|
|
|
} |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# Rank position |
558
|
5
|
|
33
|
|
|
29
|
$rank{char}->{ $conf{os} }->{start} ||= $rank{char}->{ $conf{os} }->{repeat}; |
559
|
5
|
|
|
|
|
30
|
my @sequence = split( //, $rank{char}->{ $conf{os} }->{start} ); |
560
|
5
|
100
|
|
|
|
18
|
my $pos = $is_utf8 ? 2 : 1; |
561
|
5
|
|
|
|
|
18
|
while ( my $type = shift @sequence ) { |
562
|
25
|
100
|
100
|
|
|
28
|
last unless @{ $char_hash{W} } || @{ $char_hash{M} } || @{ $char_hash{L} }; |
|
25
|
|
66
|
|
|
95
|
|
|
20
|
|
|
|
|
85
|
|
|
5
|
|
|
|
|
24
|
|
563
|
20
|
100
|
|
|
|
68
|
push( @sequence, split( //, $rank{char}->{ $conf{os} }->{repeat} ) ) unless @sequence; |
564
|
20
|
|
|
|
|
25
|
while ( my $charset = shift @{ $char_hash{$type} } ) { |
|
20
|
|
|
|
|
86
|
|
565
|
10
|
50
|
|
|
|
27
|
if ( $char_meta{$charset}->{pos} ) { |
566
|
0
|
|
|
|
|
0
|
next; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
else { |
569
|
10
|
|
|
|
|
26
|
$char_meta{$charset}->{pos} = $pos; |
570
|
10
|
|
|
|
|
13
|
$pos++; |
571
|
10
|
|
|
|
|
39
|
last; |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
}#while |
574
|
|
|
|
|
|
|
}#while |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
# Can we see what Mozilla detection thinks? |
577
|
5
|
|
|
|
|
9
|
my $mozilla; |
578
|
5
|
50
|
|
|
|
14
|
if ( $has_detect ) { |
579
|
0
|
|
|
|
|
0
|
$mozilla = Encode::Detect::Detector::detect( $conf{text} ); |
580
|
0
|
0
|
|
|
|
0
|
$mozilla = lc $mozilla if defined $mozilla; |
581
|
0
|
0
|
|
|
|
0
|
if ( $mozilla ) { |
582
|
|
|
|
|
|
|
# Check charset can decode |
583
|
0
|
|
|
|
|
0
|
my $charset_encode = _try_charset( $mozilla, $conf{text} ); |
584
|
0
|
0
|
|
|
|
0
|
if ( $charset_encode ) { |
585
|
|
|
|
|
|
|
# Check we have the Mozilla charset in our list |
586
|
0
|
0
|
|
|
|
0
|
if ( $char_meta{$mozilla} ) { |
587
|
0
|
|
|
|
|
0
|
$char_meta{$mozilla}->{mozilla} = 1; |
588
|
|
|
|
|
|
|
# Should Mozilla affect position? |
589
|
0
|
0
|
0
|
|
|
0
|
if ( $rank{mozilla_move} && $char_meta{$mozilla}->{pos} != 1 ) { |
590
|
0
|
|
|
|
|
0
|
my $pos_new = $char_meta{$mozilla}->{pos} - $rank{mozilla_move}; |
591
|
0
|
0
|
|
|
|
0
|
$pos_new = 1 if $pos_new < 1; |
592
|
|
|
|
|
|
|
# Move other charsets |
593
|
0
|
0
|
|
|
|
0
|
map { $_->{pos}++ } grep { |
|
0
|
|
|
|
|
0
|
|
594
|
0
|
|
|
|
|
0
|
$_->{pos} >= $pos_new && |
595
|
|
|
|
|
|
|
$_->{pos} < $char_meta{$mozilla}->{pos} |
596
|
|
|
|
|
|
|
} values %char_meta; |
597
|
0
|
|
|
|
|
0
|
$char_meta{$mozilla}->{pos} = $pos_new; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
}#if |
600
|
|
|
|
|
|
|
else { |
601
|
|
|
|
|
|
|
# Insert Mozilla if it's not in list? |
602
|
0
|
0
|
|
|
|
0
|
if ( $rank{mozilla_insert} ) { |
603
|
|
|
|
|
|
|
# Push everything else down |
604
|
0
|
|
|
|
|
0
|
map { $_->{pos}++ } grep { |
|
0
|
|
|
|
|
0
|
|
605
|
0
|
|
|
|
|
0
|
$_->{pos} >= $rank{mozilla_insert} |
606
|
|
|
|
|
|
|
} values %char_meta; |
607
|
0
|
|
|
|
|
0
|
$char_meta{$mozilla} = { |
608
|
|
|
|
|
|
|
lang => [], |
609
|
0
|
|
|
|
|
0
|
words => [ map { decode( $charset_encode, $_ ) } @words ], |
610
|
|
|
|
|
|
|
pos => $rank{mozilla_insert}, |
611
|
|
|
|
|
|
|
mozilla => 1, |
612
|
|
|
|
|
|
|
}; |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
}#else |
615
|
|
|
|
|
|
|
}#if |
616
|
|
|
|
|
|
|
else { |
617
|
0
|
|
|
|
|
0
|
$char_error{ $mozilla }++; |
618
|
|
|
|
|
|
|
} |
619
|
|
|
|
|
|
|
}#if |
620
|
|
|
|
|
|
|
}#if |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# Prep return |
623
|
5
|
|
|
|
|
39
|
my @charsets = sort { $char_meta{$a}->{pos} <=> $char_meta{$b}->{pos} } keys %char_meta; |
|
7
|
|
|
|
|
33
|
|
624
|
5
|
100
|
|
|
|
16
|
if ( wantarray ) { |
625
|
1
|
|
|
|
|
10
|
my %meta = ( |
626
|
|
|
|
|
|
|
charsets => \%char_meta, |
627
|
|
|
|
|
|
|
lang => $lang_meta, |
628
|
|
|
|
|
|
|
country => $country_meta, |
629
|
|
|
|
|
|
|
error => [ keys %char_error ], |
630
|
|
|
|
|
|
|
); |
631
|
1
|
|
|
|
|
16
|
return ( \@charsets, \%meta); |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
else { |
634
|
4
|
|
|
|
|
82
|
return $charsets[0]; |
635
|
|
|
|
|
|
|
} |
636
|
|
|
|
|
|
|
}#sub |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
sub _detect_get_langs{ |
639
|
5
|
|
|
5
|
|
10
|
my $self = shift; |
640
|
5
|
|
|
|
|
7
|
my $conf = shift; |
641
|
5
|
|
|
|
|
10
|
my $rank = shift; |
642
|
|
|
|
|
|
|
|
643
|
5
|
|
|
|
|
7
|
my %country_meta; |
644
|
|
|
|
|
|
|
my %lang_meta; |
645
|
5
|
100
|
|
|
|
55
|
if ( $conf->{lang} ) { |
646
|
2
|
50
|
|
|
|
10
|
$conf->{lang} = [ $conf->{lang} ] unless ref $conf->{lang}; |
647
|
2
|
|
|
|
|
5
|
$conf->{lang} = [ map { lc $_ } @{ $conf->{lang} } ]; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
5
|
|
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
else { |
650
|
|
|
|
|
|
|
## Get language list from conf with meta data |
651
|
|
|
|
|
|
|
# Start with country list |
652
|
3
|
|
|
|
|
6
|
my @country_list; |
653
|
3
|
50
|
|
|
|
23
|
if ( $conf->{country} ) { |
654
|
0
|
0
|
|
|
|
0
|
@country_list = ref $conf->{country} ? @{ $conf->{country} } : ( $conf->{country} ); |
|
0
|
|
|
|
|
0
|
|
655
|
0
|
|
|
|
|
0
|
@country_list = map { lc $_ } @country_list; |
|
0
|
|
|
|
|
0
|
|
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
else { |
658
|
|
|
|
|
|
|
# See if we have IP's to lookup countries for, we may have several |
659
|
3
|
50
|
33
|
|
|
36
|
if ( $conf->{ip} && ($has_ipcountry || $has_geoip) ) { |
|
|
|
33
|
|
|
|
|
660
|
0
|
0
|
|
|
|
0
|
$conf->{ip} = [ $conf->{ip} ] unless ref $conf->{ip}; |
661
|
0
|
|
|
|
|
0
|
foreach my $ip ( @{ $conf->{ip} } ) { |
|
0
|
|
|
|
|
0
|
|
662
|
0
|
|
|
|
|
0
|
my $country = $self->get_country( $ip ); |
663
|
0
|
0
|
|
|
|
0
|
next unless $country; |
664
|
0
|
0
|
|
|
|
0
|
if ( $country_meta{$country} ) { |
665
|
0
|
|
|
|
|
0
|
push( @{ $country_meta{$country}->{ip} }, $ip ); |
|
0
|
|
|
|
|
0
|
|
666
|
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
|
else { |
668
|
0
|
|
|
|
|
0
|
push( @country_list, $country ); |
669
|
0
|
|
|
|
|
0
|
$country_meta{$country} = { |
670
|
|
|
|
|
|
|
ip => [$ip], |
671
|
|
|
|
|
|
|
}; |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
}#foreach |
674
|
|
|
|
|
|
|
}#if |
675
|
|
|
|
|
|
|
# Are there extra countries to add to the start or end of the list |
676
|
3
|
50
|
|
|
|
13
|
if ( ref $conf->{country_extra} ) { |
677
|
0
|
|
|
|
|
0
|
foreach my $position ( qw( end start ) ) { |
678
|
0
|
0
|
|
|
|
0
|
if ( $conf->{country_extra}->{$position} ) { |
679
|
0
|
0
|
|
|
|
0
|
$conf->{country_extra}->{$position} = [ $conf->{country_extra}->{$position} ] unless ref $conf->{country_extra}->{$position}; |
680
|
0
|
|
|
|
|
0
|
$conf->{country_extra}->{$position} = [ map { lc $_ } @{ $conf->{country_extra}->{$position} } ]; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
681
|
0
|
|
|
|
|
0
|
foreach my $country ( @{ $conf->{country_extra}->{$position} } ) { |
|
0
|
|
|
|
|
0
|
|
682
|
|
|
|
|
|
|
# Check if it's already in the list, in which case remove |
683
|
0
|
0
|
|
|
|
0
|
if ( $country_meta{$country} ) { |
684
|
0
|
|
0
|
|
|
0
|
$country_meta{$country}->{extra} ||= []; |
685
|
0
|
|
|
|
|
0
|
push( @{ $country_meta{$country}->{extra} }, $position ); |
|
0
|
|
|
|
|
0
|
|
686
|
|
|
|
|
|
|
# If adding to the end, leave in current position, only move to start |
687
|
0
|
0
|
|
|
|
0
|
@country_list = grep { $_ ne $country } @country_list if $position eq 'start'; |
|
0
|
|
|
|
|
0
|
|
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
else { |
690
|
0
|
|
|
|
|
0
|
$country_meta{$country} = { |
691
|
|
|
|
|
|
|
extra => [ $position ], |
692
|
|
|
|
|
|
|
}; |
693
|
|
|
|
|
|
|
} |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
}#if |
696
|
|
|
|
|
|
|
}#foreach |
697
|
|
|
|
|
|
|
# Add to front/back of list |
698
|
0
|
0
|
|
|
|
0
|
unshift( @country_list, @{ $conf->{country_extra}->{start} } ) if ref $conf->{country_extra}->{start}; |
|
0
|
|
|
|
|
0
|
|
699
|
0
|
0
|
|
|
|
0
|
push( @country_list, @{ $conf->{country_extra}->{end} } ) if ref $conf->{country_extra}->{end}; |
|
0
|
|
|
|
|
0
|
|
700
|
|
|
|
|
|
|
}#if |
701
|
|
|
|
|
|
|
}#else |
702
|
|
|
|
|
|
|
# Get lang tags from countries |
703
|
3
|
|
|
|
|
6
|
my @lang_country; |
704
|
|
|
|
|
|
|
my %country_seen; |
705
|
3
|
|
|
|
|
9
|
foreach my $country ( @country_list ) { |
706
|
0
|
|
|
|
|
0
|
$country_meta{$country}->{name} = $self->get_country_name( $country ); |
707
|
0
|
|
|
|
|
0
|
my @lang_list = $self->get_country_lang( $country ); |
708
|
0
|
|
|
|
|
0
|
foreach my $lang ( @lang_list ) { |
709
|
0
|
0
|
|
|
|
0
|
next if $country_seen{$lang}; |
710
|
0
|
|
|
|
|
0
|
push( @lang_country, $lang ); |
711
|
0
|
|
|
|
|
0
|
$country_seen{$lang}++; |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
}#foreach |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# Now lang list from accept_langs |
716
|
3
|
|
|
|
|
7
|
my @lang_accept; |
717
|
|
|
|
|
|
|
my %accept_seen; |
718
|
3
|
50
|
|
|
|
10
|
if ( $conf->{accept_lang} ) { |
719
|
3
|
50
|
|
|
|
18
|
$conf->{accept_lang} = [ $conf->{accept_lang} ] unless ref $conf->{accept_lang}; |
720
|
3
|
|
|
|
|
6
|
foreach my $accept_lang ( @{ $conf->{accept_lang} } ) { |
|
3
|
|
|
|
|
11
|
|
721
|
3
|
|
|
|
|
14
|
my @lang_list = $self->get_accept_lang( $accept_lang ); |
722
|
3
|
|
|
|
|
11
|
foreach my $lang ( @lang_list ) { |
723
|
3
|
50
|
|
|
|
12
|
next if $accept_seen{$lang}; |
724
|
3
|
|
|
|
|
8
|
push( @lang_accept, $lang ); |
725
|
3
|
|
|
|
|
14
|
$accept_seen{$lang}++; |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
} |
729
|
|
|
|
|
|
|
# Are there extra lang tags to add to the start or end of the list |
730
|
3
|
|
|
|
|
6
|
my %extra_seen; |
731
|
|
|
|
|
|
|
my %extra_list; |
732
|
3
|
50
|
|
|
|
11
|
if ( ref $conf->{lang_extra} ) { |
733
|
0
|
|
|
|
|
0
|
foreach my $position ( qw( end start ) ) { |
734
|
0
|
|
|
|
|
0
|
$extra_list{$position} = []; |
735
|
0
|
0
|
|
|
|
0
|
if ( $conf->{lang_extra}->{$position} ) { |
736
|
0
|
0
|
|
|
|
0
|
$conf->{lang_extra}->{$position} = [ $conf->{lang_extra}->{$position} ] unless ref $conf->{lang_extra}->{$position}; |
737
|
0
|
|
|
|
|
0
|
$conf->{lang_extra}->{$position} = [ map { lc $_ } @{ $conf->{lang_extra}->{$position} } ]; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
738
|
0
|
|
|
|
|
0
|
$extra_list{$position} = $conf->{lang_extra}->{$position}; |
739
|
0
|
|
|
|
|
0
|
$extra_seen{$position} = { map { $_ => 1 } @{ $conf->{lang_extra}->{$position} } }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
}#foreach |
742
|
|
|
|
|
|
|
}#if |
743
|
|
|
|
|
|
|
## Rank languages based on order and appearance in both lists |
744
|
|
|
|
|
|
|
# Which lists they appear in |
745
|
3
|
|
|
|
|
143
|
foreach my $lang ( @lang_accept, @lang_country, @{ $extra_list{start} }, @{ $extra_list{end} } ) { |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
9
|
|
746
|
3
|
50
|
|
|
|
10
|
next if $lang_meta{$lang}; |
747
|
3
|
50
|
33
|
|
|
120
|
$lang_meta{$lang} = { |
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
748
|
|
|
|
|
|
|
both => $accept_seen{$lang} && $country_seen{$lang} ? 1 : 0, |
749
|
|
|
|
|
|
|
accept => $accept_seen{$lang} || 0, |
750
|
|
|
|
|
|
|
country => $country_seen{$lang} || 0, |
751
|
|
|
|
|
|
|
start => $extra_seen{start}->{$lang} || 0, |
752
|
|
|
|
|
|
|
end => $extra_seen{end}->{$lang} || 0, |
753
|
|
|
|
|
|
|
name => $self->get_lang_name($lang), |
754
|
|
|
|
|
|
|
}; |
755
|
|
|
|
|
|
|
}#foreach |
756
|
|
|
|
|
|
|
## Rank position |
757
|
|
|
|
|
|
|
# Extra start will go first |
758
|
3
|
|
|
|
|
8
|
my $pos = 1; |
759
|
3
|
|
|
|
|
5
|
foreach my $lang ( @{ $extra_list{start} } ) { |
|
3
|
|
|
|
|
12
|
|
760
|
0
|
0
|
|
|
|
0
|
next if $lang_meta{$lang}->{pos}; |
761
|
0
|
|
|
|
|
0
|
$lang_meta{$lang}->{pos} = $pos; |
762
|
0
|
|
|
|
|
0
|
$pos++; |
763
|
|
|
|
|
|
|
}#foreach |
764
|
|
|
|
|
|
|
# Then sequence |
765
|
3
|
|
|
|
|
14
|
my %lang_hash = ( |
766
|
|
|
|
|
|
|
A => \@lang_accept, |
767
|
|
|
|
|
|
|
C => \@lang_country, |
768
|
|
|
|
|
|
|
); |
769
|
3
|
|
33
|
|
|
14
|
$rank->{lang}->{start} ||= $rank->{lang}->{repeat}; |
770
|
3
|
|
|
|
|
16
|
my @sequence = split( //, $rank->{lang}->{start} ); |
771
|
3
|
|
|
|
|
13
|
while ( my $type = shift @sequence ) { |
772
|
6
|
100
|
66
|
|
|
7
|
last unless @{ $lang_hash{A} } || @{ $lang_hash{C} }; |
|
6
|
|
|
|
|
34
|
|
|
3
|
|
|
|
|
15
|
|
773
|
3
|
50
|
|
|
|
10
|
push( @sequence, split( //, $rank->{lang}->{repeat} ) ) unless @sequence; |
774
|
3
|
|
|
|
|
6
|
while ( my $lang = shift @{ $lang_hash{$type} } ) { |
|
3
|
|
|
|
|
14
|
|
775
|
3
|
50
|
|
|
|
11
|
if ( $lang_meta{$lang}->{pos} ) { |
776
|
0
|
|
|
|
|
0
|
next; |
777
|
|
|
|
|
|
|
} |
778
|
|
|
|
|
|
|
else { |
779
|
3
|
|
|
|
|
15
|
$lang_meta{$lang}->{pos} = $pos; |
780
|
3
|
|
|
|
|
4
|
$pos++; |
781
|
3
|
|
|
|
|
15
|
last; |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
}#while |
784
|
|
|
|
|
|
|
}#while |
785
|
|
|
|
|
|
|
# Extra end added to the end |
786
|
3
|
|
|
|
|
5
|
foreach my $lang ( @{ $extra_list{end} } ) { |
|
3
|
|
|
|
|
20
|
|
787
|
0
|
0
|
|
|
|
0
|
next if $lang_meta{$lang}->{pos}; |
788
|
0
|
|
|
|
|
0
|
$lang_meta{$lang}->{pos} = $pos; |
789
|
0
|
|
|
|
|
0
|
$pos++; |
790
|
|
|
|
|
|
|
}#foreach |
791
|
|
|
|
|
|
|
# Prefer languages that appear in both? |
792
|
3
|
50
|
|
|
|
16
|
if ( $rank->{lang_both} ) { |
793
|
0
|
0
|
0
|
|
|
0
|
$conf->{lang} = [ sort { |
794
|
3
|
|
|
|
|
27
|
$lang_meta{$b}->{start} <=> $lang_meta{$a}->{start} || |
795
|
|
|
|
|
|
|
$lang_meta{$b}->{both} <=> $lang_meta{$a}->{both} || |
796
|
|
|
|
|
|
|
$lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos} |
797
|
|
|
|
|
|
|
} keys %lang_meta ]; |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
else { |
800
|
0
|
|
|
|
|
0
|
$conf->{lang} = [ sort { |
801
|
0
|
|
|
|
|
0
|
$lang_meta{$a}->{pos} <=> $lang_meta{$b}->{pos} |
802
|
|
|
|
|
|
|
} keys %lang_meta ]; |
803
|
|
|
|
|
|
|
} |
804
|
|
|
|
|
|
|
}#else |
805
|
|
|
|
|
|
|
|
806
|
5
|
|
|
|
|
23
|
return (\%country_meta,\%lang_meta); |
807
|
|
|
|
|
|
|
} |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
sub _detect_check_langs{ |
810
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
811
|
5
|
|
|
|
|
7
|
my $char_meta = shift; |
812
|
5
|
|
|
|
|
8
|
my $char_hash = shift; |
813
|
5
|
|
|
|
|
8
|
my $char_error = shift; |
814
|
5
|
|
|
|
|
10
|
my $conf = shift; |
815
|
5
|
|
|
|
|
7
|
my $words = shift; |
816
|
5
|
50
|
|
|
|
19
|
my @os_list = ('W','M',$conf->{inc_linux} ? 'L' : ()); |
817
|
5
|
|
|
|
|
9
|
foreach my $lang ( @{ $conf->{lang} } ) { |
|
5
|
|
|
|
|
12
|
|
818
|
5
|
|
|
|
|
16
|
my @charsets = $self->get_lang_charset( $lang ); |
819
|
5
|
50
|
|
|
|
12
|
next unless @charsets; |
820
|
5
|
|
|
|
|
19
|
for ( my $i=0; $i <= $#os_list; $i++ ) { |
821
|
10
|
|
|
|
|
185
|
my $charset = $charsets[$i]; |
822
|
10
|
|
|
|
|
15
|
my $os = $os_list[$i]; |
823
|
10
|
50
|
|
|
|
31
|
next if $char_error->{ $charset }; |
824
|
10
|
50
|
|
|
|
23
|
if ( $char_meta->{ $charset } ) { |
825
|
0
|
|
|
|
|
0
|
push( @{ $char_meta->{$charset}->{lang} }, $lang ); |
|
0
|
|
|
|
|
0
|
|
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
else { |
828
|
|
|
|
|
|
|
# Test charset parses |
829
|
10
|
|
|
|
|
31
|
my $charset_encode = _try_charset( $charset, $conf->{text} ); |
830
|
10
|
50
|
|
|
|
67
|
if ( $charset_encode ) { |
831
|
10
|
|
|
|
|
15
|
push( @{ $char_hash->{ $os } }, $charset ); |
|
10
|
|
|
|
|
33
|
|
832
|
10
|
|
|
|
|
199
|
$char_meta->{ $charset } = { |
833
|
|
|
|
|
|
|
lang => [ $lang ], |
834
|
10
|
|
|
|
|
110
|
words => [ map { decode( $charset_encode, $_ ) } @$words ], |
835
|
|
|
|
|
|
|
}; |
836
|
|
|
|
|
|
|
} |
837
|
|
|
|
|
|
|
else { |
838
|
0
|
|
|
|
|
0
|
$char_error->{ $charset }++; |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
}#else |
841
|
|
|
|
|
|
|
}#for |
842
|
|
|
|
|
|
|
}#foreach |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
sub _try_charset { |
846
|
10
|
|
|
10
|
|
23
|
my ( $charset, $text ) = @_; |
847
|
|
|
|
|
|
|
# Older versions of Encode::Alias don't map x-mac-* encodings properly |
848
|
10
|
|
|
|
|
82
|
$charset =~ s/^(?:x[_-])?mac[_-](.*)$/mac$1/; |
849
|
10
|
|
|
|
|
19
|
$charset =~ s/^macce$/maccentraleurroman/; |
850
|
10
|
|
|
|
|
16
|
eval { decode( $charset, $text, Encode::FB_CROAK ) }; |
|
10
|
|
|
|
|
60
|
|
851
|
10
|
50
|
|
|
|
7201
|
return $@ ? 0 : $charset; |
852
|
|
|
|
|
|
|
}#sub |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
=back |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=head1 RANKING SYSTEM |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
Unfortunately the heuristics employed by this method aren't straight forward. |
860
|
|
|
|
|
|
|
Several key scenarios are taken into consideration, namely: |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
The upload charset is: |
863
|
|
|
|
|
|
|
for the language that matches the browsers language settings and OS. |
864
|
|
|
|
|
|
|
for the language that matches the uploaders countries official language and OS. |
865
|
|
|
|
|
|
|
for the language that matches the browsers language settings, but a different OS. |
866
|
|
|
|
|
|
|
for the language that matches the uploaders countries official language, but a different OS. |
867
|
|
|
|
|
|
|
unrelated, hopefully detected by Mozilla's universal charset detector. |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
Although the browsers language setting is preferred, it's not unusually for it |
870
|
|
|
|
|
|
|
to be incorrect. For example a surprising number of UK users have en-US rather |
871
|
|
|
|
|
|
|
than en-GB. In such instances the language from the IP would be more accurate. |
872
|
|
|
|
|
|
|
For this reason if the Mozilla detected charset matches an IP dervied charset it |
873
|
|
|
|
|
|
|
is brought to the front. |
874
|
|
|
|
|
|
|
However, an Englishman uploading a file whilst abroad would not give an accurate |
875
|
|
|
|
|
|
|
language from IP. Likewise, some countries like South Africa have several |
876
|
|
|
|
|
|
|
recognised languages. |
877
|
|
|
|
|
|
|
Some countries have inhabitants that use either Latin or Cyrillic alphabets for |
878
|
|
|
|
|
|
|
the same language. In these instances, the Mozilla detector is used to determine |
879
|
|
|
|
|
|
|
which is more likely, but both options will be returned. |
880
|
|
|
|
|
|
|
The use of Macintosh computers has been on the rise, as has the appearance of |
881
|
|
|
|
|
|
|
their charsets. In fact that's what led me to write this module, as the Mozilla |
882
|
|
|
|
|
|
|
detector doesn't cover every encoding and was missing Mac-Roman. Generally |
883
|
|
|
|
|
|
|
Windows users are less likely to upload files with Macintosh encoding, Although |
884
|
|
|
|
|
|
|
the same cannot be said the other way around. For this reason, when the OS is |
885
|
|
|
|
|
|
|
Macintosh it's matching charsets will come first, followed by the likely |
886
|
|
|
|
|
|
|
Windows, alternating between the two. |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
We assume linux systems are mostly UTF-8 these days, that their pre-UTF-8 ISO |
889
|
|
|
|
|
|
|
charsets were roughly the same as the Windows equivalents, and that Linux users |
890
|
|
|
|
|
|
|
are generally more computer savvy. For these reasons Linux charsets are not |
891
|
|
|
|
|
|
|
included in results by default. |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
Rather than ranking charsets through some kind of weighting based on appearance, |
894
|
|
|
|
|
|
|
we apply configurable patterns. Weight would always favour common charsets, |
895
|
|
|
|
|
|
|
hopefully the ranking patterns work better. |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
This is the first version of this module. I'm open to suggestions with regards |
898
|
|
|
|
|
|
|
improved heuristics, and possibly configurable heuristics. |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
You can override the default ranking by passing the appropriate data structure |
901
|
|
|
|
|
|
|
to detect(). You need to at least provide the repeat string for lang and all the |
902
|
|
|
|
|
|
|
OSs. |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
IP country lookup and accept_language parsing is used initially to generate a |
905
|
|
|
|
|
|
|
list of matching languages. The order in which these are then ranked is based |
906
|
|
|
|
|
|
|
on their appearance (accept_lang), or popularity (country), and the sequence |
907
|
|
|
|
|
|
|
given. A represents accept_lang and C represents country, so a sequence starting |
908
|
|
|
|
|
|
|
with AC and repeating with AC would generate ACACACACAC... until there are no |
909
|
|
|
|
|
|
|
matching languages left. The lang_both option pushes charsets that come from |
910
|
|
|
|
|
|
|
both accept_lang and country. |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
Next charsets are matched from the languages by OS. Depending on what OS has |
913
|
|
|
|
|
|
|
been passed, or detected from user_agent. The char sequences contain W for |
914
|
|
|
|
|
|
|
Windows, M for Macintosh or L for Linux. The Linux charsets are filtered out |
915
|
|
|
|
|
|
|
unless the OS is Linux or the inc_linux config option is enabled. So a Windows |
916
|
|
|
|
|
|
|
OS with sequence starting WW and repeating WML would generate WWWMWMWMWM... |
917
|
|
|
|
|
|
|
matching the first 3 likely windows charsets, then the most likely Macintosh, |
918
|
|
|
|
|
|
|
etc. Charsets are tested to see if they can decode the text, invalid ones are |
919
|
|
|
|
|
|
|
filtered out. |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
The string is tested to see whether it looks like UTF-8. If it does that's |
922
|
|
|
|
|
|
|
pushed to the front on the list. If the Mozilla charset detector is available |
923
|
|
|
|
|
|
|
it's used to see what charset it returns. The option mozilla_move sets how the |
924
|
|
|
|
|
|
|
many places to move the matching charset forward in the list. The |
925
|
|
|
|
|
|
|
mozilla_insert options defines in what position to insert the Mozilla match if |
926
|
|
|
|
|
|
|
it's not already in the list. |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
my %ranking = ( |
929
|
|
|
|
|
|
|
lang => { |
930
|
|
|
|
|
|
|
start => 'AC', |
931
|
|
|
|
|
|
|
repeat => 'AC', |
932
|
|
|
|
|
|
|
}, |
933
|
|
|
|
|
|
|
# Rank languages that appear in both country and accept_lang first |
934
|
|
|
|
|
|
|
lang_both => 1, |
935
|
|
|
|
|
|
|
char => { |
936
|
|
|
|
|
|
|
windows => { |
937
|
|
|
|
|
|
|
start => 'WW', |
938
|
|
|
|
|
|
|
repeat => 'WML', |
939
|
|
|
|
|
|
|
}, |
940
|
|
|
|
|
|
|
macintosh => { |
941
|
|
|
|
|
|
|
start => 'M', |
942
|
|
|
|
|
|
|
repeat => 'MWL', |
943
|
|
|
|
|
|
|
}, |
944
|
|
|
|
|
|
|
linux => { |
945
|
|
|
|
|
|
|
start => 'LWM', |
946
|
|
|
|
|
|
|
repeat => 'LWM', |
947
|
|
|
|
|
|
|
}, |
948
|
|
|
|
|
|
|
}, |
949
|
|
|
|
|
|
|
# Mozilla detected charset options |
950
|
|
|
|
|
|
|
mozilla_move => 1, # Number of positions to move the forward |
951
|
|
|
|
|
|
|
mozilla_insert => 3, # At what position to insert if it's not in list |
952
|
|
|
|
|
|
|
); |
953
|
|
|
|
|
|
|
my $charset = $detector->detect( ranking => \%ranking ); |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=head1 LICENSE |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
This is released under the Artistic |
958
|
|
|
|
|
|
|
License. See L. |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
=head1 AUTHORS |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
Lyle Hopkins - L |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
Peter Haworth - L |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
Development kindly sponsored by - L |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
=head1 REFERENCES |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
I had a hard time finding good data sources, all the information I needed was |
971
|
|
|
|
|
|
|
pretty spread out. These are the main sites I used, but there was lots of |
972
|
|
|
|
|
|
|
googling to fill in the gaps. |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
L |
975
|
|
|
|
|
|
|
L |
976
|
|
|
|
|
|
|
L |
977
|
|
|
|
|
|
|
L |
978
|
|
|
|
|
|
|
L |
979
|
|
|
|
|
|
|
L |
980
|
|
|
|
|
|
|
L |
981
|
|
|
|
|
|
|
L |
982
|
|
|
|
|
|
|
L |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
=head1 SEE ALSO |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
L, L, L, L |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
=head1 TODO |
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
Make default between Latin and Cyrillic based on popularity in language |
991
|
|
|
|
|
|
|
Write some tests |
992
|
|
|
|
|
|
|
Rank regions differently? |
993
|
|
|
|
|
|
|
Generalize environment examination, defaults only at detect() or confgurable thru the detector object itself |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
=cut |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
1; |