line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Scraper::Wikipedia::ISO3166; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require v5.10.1; |
4
|
1
|
|
|
1
|
|
13591
|
use strict; |
|
1
|
|
|
|
|
12
|
|
|
1
|
|
|
|
|
25
|
|
5
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
760
|
use File::ShareDir; |
|
1
|
|
|
|
|
4671
|
|
|
1
|
|
|
|
|
41
|
|
8
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
15
|
|
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
486
|
use Moo; |
|
1
|
|
|
|
|
9511
|
|
|
1
|
|
|
|
|
3
|
|
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
1547
|
use Types::Standard qw/Int Str/; |
|
1
|
|
|
|
|
47097
|
|
|
1
|
|
|
|
|
7
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Name of the config file. BUILD() rewrites this value by prefixing it
# with the share directory, so after construction it holds a full path.
has config_file =>
(
    is       => 'rw',
    isa      => Str,
    required => 0,
    default  => sub { '.htwww.scraper.wikipedia.iso3166.conf' },
);
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Path stem (no extension) for the downloaded Wikipedia ISO 3166-2 pages.
# Not touched by BUILD(), log() or run() in this package.
has data_file =>
(
    is       => 'rw',
    isa      => Str,
    required => 0,
    default  => sub { 'data/en.wikipedia.org.wiki.ISO_3166-2' },
);
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Share directory name; defaults to the empty string.
# NOTE(review): BUILD() computes the share directory but does not store it
# here -- presumably other code sets or reads this attribute; confirm.
has share_dir =>
(
    is       => 'rw',
    isa      => Str,
    required => 0,
    default  => sub { '' },
);
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Name of the SQLite database file. BUILD() rewrites this value by prefixing
# it with the share directory, so after construction it holds a full path.
has sqlite_file =>
(
    is       => 'rw',
    isa      => Str,
    required => 0,
    default  => sub { 'www.scraper.wikipedia.iso3166.sqlite' },
);
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Verbosity flag. log() prints only when this is true; the default of 0
# means nothing is printed.
has verbose =>
(
    is       => 'rw',
    isa      => Int,
    required => 0,
    default  => sub { 0 },
);
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
our $VERSION = '1.03'; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# ----------------------------------------------- |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Moo post-construction hook.
#
# Turns the config_file and sqlite_file attributes into paths by prefixing
# each with a base directory:
#   o 'share', when $ENV{AUTHOR_TESTING} is true.
#   o Otherwise whatever File::ShareDir::dist_dir() returns for this distro.
#
# The prefix is applied unconditionally, to defaults and to values passed
# to new() alike.
sub BUILD
{
    my $self = shift;

    # Distro name = package name with '::' replaced by '-'.
    # The -1 limit keeps empty trailing fields, so this matches s/::/-/g exactly.
    my $dist_name = join('-', split(/::/, __PACKAGE__, -1) );

    # dist_dir() is only called when we are not author-testing.
    my $base_dir = 'share';
    ($base_dir)  = File::ShareDir::dist_dir($dist_name) unless $ENV{AUTHOR_TESTING};

    my $config_path = File::Spec -> catfile($base_dir, $self -> config_file);
    $self -> config_file($config_path);

    my $sqlite_path = File::Spec -> catfile($base_dir, $self -> sqlite_file);
    $self -> sqlite_file($sqlite_path);

} # End of BUILD.
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# ----------------------------------------------- |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Print a message to the currently selected output handle, but only when
# $self -> verbose is true.
#
# Parameters:
#   $level - Log level label. Defaults to 'debug' when undef, '' or 0.
#   $s     - The message. Defaults to '' when undef.
#
# Output format: "$level: $s. \n".
sub log
{
    my($self, $level, $s) = @_;
    $level ||= 'debug';

    # Use defined-or, not ||=, so that a defined-but-false message such as
    # 0 or '0' is printed rather than silently replaced by ''.
    $s //= '';

    print "$level: $s. \n" if ($self -> verbose);

} # End of log.
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# ----------------------------------------------- |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Entry point stub: does no work and always reports success.
#
# Returns 0 for success and 1 for failure; this implementation only ever
# returns 0.
sub run
{
    my $self = shift;

    return 0;

} # End of run.
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# ----------------------------------------------- |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
1; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=pod |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head1 NAME |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
WWW::Scraper::Wikipedia::ISO3166 - Gently scrape Wikipedia for ISO3166-2 data |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=encoding utf-8 |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 Synopsis |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Wikipedia I<has been scraped>. You do not need to run the scripts which download pages from there. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Just use the SQLite database shipped with this module, as discussed next. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 Methods which return hashrefs |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
use WWW::Scraper::Wikipedia::ISO3166::Database; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my($database) = WWW::Scraper::Wikipedia::ISO3166::Database -> new; |
116
|
|
|
|
|
|
|
my($countries) = $database -> read_countries_table; |
117
|
|
|
|
|
|
|
my($subcountries) = $database -> read_subcountries_table; |
118
|
|
|
|
|
|
|
... |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Each key in %$countries and %$subcountries points to a hashref of all columns for the given key. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
So, $$countries{13} points to this hashref: |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
{ |
125
|
|
|
|
|
|
|
id => 13, |
126
|
|
|
|
|
|
|
code2 => 'AU', |
127
|
|
|
|
|
|
|
code3 => '', |
128
|
|
|
|
|
|
|
fc_name => 'australia', |
129
|
|
|
|
|
|
|
has_subcountries => 'Yes', |
130
|
|
|
|
|
|
|
name => 'Australia', |
131
|
|
|
|
|
|
|
timestamp => '2012-05-08 04:04:43', |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
One element of %$subcountries is $$subcountries{4276}: |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
{ |
137
|
|
|
|
|
|
|
id => 4276, |
138
|
|
|
|
|
|
|
country_id => 13, |
139
|
|
|
|
|
|
|
code => 'AU-VIC', |
140
|
|
|
|
|
|
|
fc_name => 'victoria', |
141
|
|
|
|
|
|
|
name => 'Victoria', |
142
|
|
|
|
|
|
|
sequence => 5, |
143
|
|
|
|
|
|
|
timestamp => '2012-05-08 04:05:27', |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head3 Warnings |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# 1: These hashrefs use the table's primary key as the hashref's key. In the case of the I<countries> |
149
|
|
|
|
|
|
|
table, the primary key is the country's id, and is used as subcountries.country_id. But, in the case of |
150
|
|
|
|
|
|
|
the I<subcountries> table, the id does not have any meaning apart from being a db primary key. |
151
|
|
|
|
|
|
|
See L for details. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# 2: Do not assume subcountry names are unique within a country. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
L. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 Scripts which output to a file |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
All scripts respond to the -h option. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Some examples: |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
shell>perl scripts/export.as.csv.pl -c countries.csv -s subcountries.csv |
164
|
|
|
|
|
|
|
shell>perl scripts/export.as.html.pl -w iso.3166-2.html |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
This file is on-line at: L. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
shell>perl scripts/report.statistics.pl |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Output statistics: |
171
|
|
|
|
|
|
|
countries_in_db => 249. |
172
|
|
|
|
|
|
|
has_subcounties => 199. |
173
|
|
|
|
|
|
|
subcountries_in_db => 4593. |
174
|
|
|
|
|
|
|
subcountry_files_downloaded => 249. |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 Description |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
C<WWW::Scraper::Wikipedia::ISO3166> is a pure Perl module. |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
It is used to download various ISO3166-related pages from Wikipedia, and to then import data |
181
|
|
|
|
|
|
|
(scraped from those pages) into an SQLite database. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
The pages have already been downloaded, so that phase only needs to be run when pages are updated. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Likewise, the data has been imported. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
This means you would normally only ever use the database in read-only mode. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Its components are: |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=over 4 |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item o scripts/get.country.page.pl |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
1: Downloads the ISO3166-1_alpha-3 page from Wikipedia. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Input: L. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.3.html. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
2: Downloads the ISO3166-2 page from Wikipedia. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Input: L. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.html. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=item o scripts/populate.countries.pl |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Imports country data into an SQLite database. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
inputs: data/en.wikipedia.org.wiki.ISO_3166-2.html, data/en.wikipedia.org.wiki.ISO_3166-2.3.html. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Output: share/www.scraper.wikipedia.iso3166.sqlite. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=item o scripts/get.subcountry.page.pl and scripts/get.subcountry.pages.pl |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Downloads each country's corresponding subcountries page. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Source: http://en.wikipedia.org/wiki/ISO_3166:$code2.html. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
Output: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item o scripts/populate.subcountry.pl and scripts/populate.subcountries.pl |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Imports subcountry data into the database. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Source: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Output: share/www.scraper.wikipedia.iso3166.sqlite. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Note: When the distro is installed, this SQLite file is installed too. |
232
|
|
|
|
|
|
|
See L for details. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=item o scripts/export.as.csv.pl -c c.csv -s s.csv |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Exports the country and subcountry data as CSV. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Input: share/www.scraper.wikipedia.iso3166.sqlite. |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
Output: data/countries.csv and data/subcountries.csv. |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=item o scripts/export.as.html -w c.html |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
Exports the country and subcountry data as HTML. |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Input: share/www.scraper.wikipedia.iso3166.sqlite. |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Output: data/iso.3166-2.html. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
On-line: L. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item o scripts/get.statoids.pl |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
Downloads some pages from L in case one day we need to convert from FIPS to ISO 3166-2. |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
See data/List_of_FIPS_region_codes_*.html. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=item o scripts/populate.fips.codes.pl |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
This reads the files output by scripts/get.statoids.pl and produces 2 reports, data/wikipedia.fips.codes.txt |
261
|
|
|
|
|
|
|
and data/wikipedia.fips.mismatch.log. These are discussed in L |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=item o scripts/test.nfc.pl |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
See L's NFC() for sorting?> for a discussion of this script. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=back |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head1 Constructor and initialization |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
new(...) returns an object of type C<WWW::Scraper::Wikipedia::ISO3166>. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This is the class's constructor. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Usage: C<< WWW::Scraper::Wikipedia::ISO3166 -> new() >>. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
This method takes a hash of options. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
Available options (these are also methods): |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=over 4 |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=item o config_file => $file_name |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
The name of the file containing config info, such as I and I. |
288
|
|
|
|
|
|
|
These are used by L. |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>. |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Default: .htwww.scraper.wikipedia.iso3166.conf. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=item o sqlite_file => $file_name |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
The name of the SQLite database of country and subcountry data. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Default: www.scraper.wikipedia.iso3166.sqlite. |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=item o verbose => $integer |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Print more or less information. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
Default: 0 (print nothing). |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=back |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=head1 Distributions |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
This module is available as a Unix-style distro (*.tgz). |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Install WWW::Scraper::Wikipedia::ISO3166 as you would for any C<Perl> module: |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Run: |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
cpanm WWW::Scraper::Wikipedia::ISO3166 |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
or run: |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sudo cpan WWW::Scraper::Wikipedia::ISO3166 |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
or unpack the distro, and then run: |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
perl Makefile.PL |
327
|
|
|
|
|
|
|
make (or dmake) |
328
|
|
|
|
|
|
|
make test |
329
|
|
|
|
|
|
|
make install |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
See L for details. |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
See L for |
334
|
|
|
|
|
|
|
help on unpacking and installing. |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head1 Methods |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=head2 config_file($file_name) |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
Get or set the name of the config file. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>. |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Also, I<config_file> is an option to L</new()>. |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head2 log($level => $s) |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
Print $s at log level $level, if ($self -> verbose); |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Since $self -> verbose defaults to 0, nothing is printed by default. |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=head2 new() |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
See L. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=head2 sqlite_file($file_name) |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Get or set the name of the database file. |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir>. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
Also, I<sqlite_file> is an option to L</new()>. |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 verbose($integer) |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Get or set the verbosity level. |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Also, I<verbose> is an option to L</new()>. |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
=head1 FAQ |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=head2 Design faults in ISO3166 |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
Where ISO3166 uses Country Name, I would have used Long Name and Short Name. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
Then we'd have: |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Long Name: Bolivia, Plurinational State of |
379
|
|
|
|
|
|
|
Short Name: Bolivia |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
This distro uses the value directly from Wikipedia, which is what I have called 'Long Name', for |
382
|
|
|
|
|
|
|
all country and subcountry names. |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 Where is the database? |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
It is shipped in share/www.scraper.wikipedia.iso3166.sqlite. |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
It is installed into the distro's shared dir, as returned by L<File::ShareDir/dist_dir>. |
389
|
|
|
|
|
|
|
On my machine that's: |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
/home/ron/perl5/perlbrew/perls/perl-5.14.2/lib/site_perl/5.14.2/auto/share/dist/WWW-Scraper-Wikipedia-ISO3166/www.scraper.wikipedia.iso3166.sqlite. |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head2 What is the database schema? |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
A single SQLite file holds 2 tables, I<countries> and I<subcountries>: |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
countries subcountries |
398
|
|
|
|
|
|
|
--------- ------------ |
399
|
|
|
|
|
|
|
id id |
400
|
|
|
|
|
|
|
code2 country_id |
401
|
|
|
|
|
|
|
code3 code |
402
|
|
|
|
|
|
|
fc_name fc_name |
403
|
|
|
|
|
|
|
has_subcountries name |
404
|
|
|
|
|
|
|
name sequence |
405
|
|
|
|
|
|
|
timestamp timestamp |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
I<code3> has a couple of special cases. 2 countries have no value for code3: |
408
|
|
|
|
|
|
|
Libyan Arab Jamahiriya and Sint Maarten. |
409
|
|
|
|
|
|
|
3-letter codes which almost match: LBY => Libya and MAF => Saint Martin (French part). |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
I<subcountries.country_id> points to I<countries.id>. |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
I<fc_name> is output from calling fc(decode('utf8', $name) ). |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
For decode(), see L<Encode>. |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
For fc(), see L. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
$name is from a Wikipedia page. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
I<has_subcountries> is 'Yes' or 'No'. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
I<name> is output from calling decode('utf8', $name). |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
I<sequence> is a number (1 .. N) indicating the order in which subcountry names appear in the list |
426
|
|
|
|
|
|
|
on that subcountry's Wikipedia page. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
See the source code of L for details of the SQL |
429
|
|
|
|
|
|
|
used to create the tables. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=head2 What do I do if I find a mistake in the data? |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
What data? What mistake? How do you know it's wrong? |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Also, you must decide what exactly you were expecting the data to be. |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
If the problem is the ISO data, report it to them. |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
If the problem is the Wikipedia data, get agreement from everyone concerned and update Wikipedia. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
If the problem is the output from my code, try to identify the bug in the code and report it via the |
442
|
|
|
|
|
|
|
usual mechanism. See L. |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
If the problem is with your computer's display of the data, consider (in alphabetical order): |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=over 4 |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item o CSV |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Does the file display correctly in 'Emacs'? On the screen using 'less'? |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
scripts/export.as.csv.pl uses: use open ':utf8'; |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Is that not working? |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=item o DBD::SQLite |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
Did you set the sqlite_unicode attribute? Use something like: |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
my($dsn) = 'dbi:SQLite:dbname=www.scraper.wikipedia.iso3166.sqlite'; # Sample only. |
461
|
|
|
|
|
|
|
my($attributes) = {AutoCommit => 1, RaiseError => 1, sqlite_unicode => 1}; |
462
|
|
|
|
|
|
|
my($dbh) = DBI -> connect($dsn, '', '', $attributes); |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
The SQLite file ships in the share/ directory of the distro, and must be found by File::ShareDir |
465
|
|
|
|
|
|
|
at run time. |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
Did you set the foreign_keys pragma (if needed)? Use: |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
$dbh -> do('PRAGMA foreign_keys = ON'); |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=item o HTML |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
The template htdocs/assets/templates/www/scraper/wikipedia/iso3166/iso3166.report.tx which ships with |
474
|
|
|
|
|
|
|
this distro contains this line: |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
Is that not working? |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=item o Locale |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Here's my setup: |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
shell>locale |
485
|
|
|
|
|
|
|
LANG=en_AU.utf8 |
486
|
|
|
|
|
|
|
LANGUAGE= |
487
|
|
|
|
|
|
|
LC_CTYPE="en_AU.utf8" |
488
|
|
|
|
|
|
|
LC_NUMERIC="en_AU.utf8" |
489
|
|
|
|
|
|
|
LC_TIME="en_AU.utf8" |
490
|
|
|
|
|
|
|
LC_COLLATE="en_AU.utf8" |
491
|
|
|
|
|
|
|
LC_MONETARY="en_AU.utf8" |
492
|
|
|
|
|
|
|
LC_MESSAGES="en_AU.utf8" |
493
|
|
|
|
|
|
|
LC_PAPER="en_AU.utf8" |
494
|
|
|
|
|
|
|
LC_NAME="en_AU.utf8" |
495
|
|
|
|
|
|
|
LC_ADDRESS="en_AU.utf8" |
496
|
|
|
|
|
|
|
LC_TELEPHONE="en_AU.utf8" |
497
|
|
|
|
|
|
|
LC_MEASUREMENT="en_AU.utf8" |
498
|
|
|
|
|
|
|
LC_IDENTIFICATION="en_AU.utf8" |
499
|
|
|
|
|
|
|
LC_ALL= |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item o OS |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Unicode is a moving target. Perhaps your OS's installed version of unicode files needs updating. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=item o SQLite |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
Both Oracle and SQLite.org ship a program called sqlite3. They are not compatible. |
508
|
|
|
|
|
|
|
Which one are you using? I use the one from the SQLite.org. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
AFAICT, sqlite3 does not have command line options, or options while running, to set unicode or pragmas. |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=back |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head2 Why did you use L<Unicode::Normalize>'s NFC() for sorting? |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
This question implies why not use NFD() instead. |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Run scripts/test.nfc.pl, and the output is: |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
code2 => AX |
521
|
|
|
|
|
|
|
code3 => ALA |
522
|
|
|
|
|
|
|
fc_name => åland islands |
523
|
|
|
|
|
|
|
has_subcountries => No |
524
|
|
|
|
|
|
|
id => 15 |
525
|
|
|
|
|
|
|
name => Åland Islands |
526
|
|
|
|
|
|
|
timestamp => 2012-05-13 23:37:20 |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
And this (Åland Islands) is what Wikipedia displays. So, NFC() it is. |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
See L, and specifically prescription # 1. |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
See also section 1.2 Normalization Forms in L. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
See also L. |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=head2 What is $ENV{AUTHOR_TESTING} used for? |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the |
539
|
|
|
|
|
|
|
database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts |
540
|
|
|
|
|
|
|
writing to that copy anyway. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
At run-time, L<File::ShareDir> is used to find the installed version of *.sqlite. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head2 What FIPS data is included? |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Firstly, scripts/get.fips.pages.pl downloads some Wikipedia data, into data/List_of_FIPS_region_codes_*.html. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
Secondly, the latter files are parsed by scripts/populate.fips.codes.pl and the 2 reports are in |
549
|
|
|
|
|
|
|
data/wikipedia.fips.codes.txt, and data/wikipedia.fips.mismatch.log. |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
This data is I<not> written into the SQLite database yet, but it's available in case it's included |
552
|
|
|
|
|
|
|
one day. |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head1 Wikipedia's Terms of Use |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
See L. |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Also, since I'm distributing copies of Wikipedia-sourced material, reformatted but not changed by editing, |
559
|
|
|
|
|
|
|
I hereby give notice that their material is released under CC-BY-SA. |
560
|
|
|
|
|
|
|
See L for that licence. |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=head1 References |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
In no particular order: |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
L |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
L |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
L |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
L |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
L |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
This is a complex set of XML files concerning currency, postal, etc, formats and other details for various countries |
577
|
|
|
|
|
|
|
and/or languages. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
For Debian etc users: /usr/share/xml/iso-codes/iso_3166_2.xml, as installed from the iso-codes package, with: |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sudo apt-get install iso-codes |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
L |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
L |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
L |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
Check the Monthly Archives at Perl.com, starting in April 2012, for a series of Unicode-specific articles by |
590
|
|
|
|
|
|
|
Tom Christiansen. |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
L |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
L |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=head1 Repository |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
L |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=head1 Support |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
Email the author, or log a bug on RT: |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
L. |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=head1 Author |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
C<WWW::Scraper::Wikipedia::ISO3166> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012. |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
Home page: L. |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head1 Copyright |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
Australian copyright (c) 2012 Ron Savage. |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
All Programs of mine are 'OSI Certified Open Source Software'; |
617
|
|
|
|
|
|
|
you can redistribute them and/or modify them under the terms of |
618
|
|
|
|
|
|
|
The Artistic License, a copy of which is available at: |
619
|
|
|
|
|
|
|
http://www.opensource.org/licenses/index.html |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=cut |