| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::Scraper::Wikipedia::ISO3166; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | require v5.10.1; | 
| 4 | 1 |  |  | 1 |  | 382 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 23 |  | 
| 5 | 1 |  |  | 1 |  | 2 | use warnings; | 
|  | 1 |  |  |  |  | 7 |  | 
|  | 1 |  |  |  |  | 20 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 1 |  |  | 1 |  | 418 | use File::ShareDir; | 
|  | 1 |  |  |  |  | 4374 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 8 | 1 |  |  | 1 |  | 4 | use File::Spec; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 13 |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 | 1 |  |  | 1 |  | 472 | use Moo; | 
|  | 1 |  |  |  |  | 9132 |  | 
|  | 1 |  |  |  |  | 3 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 | 1 |  |  | 1 |  | 1509 | use Types::Standard qw/Int Str/; | 
|  | 1 |  |  |  |  | 44475 |  | 
|  | 1 |  |  |  |  | 7 |  | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | has config_file => | 
| 15 |  |  |  |  |  |  | ( | 
| 16 |  |  |  |  |  |  | default  => sub{return '.htwww.scraper.wikipedia.iso3166.conf'}, | 
| 17 |  |  |  |  |  |  | is       => 'rw', | 
| 18 |  |  |  |  |  |  | isa      => Str, | 
| 19 |  |  |  |  |  |  | required => 0, | 
| 20 |  |  |  |  |  |  | ); | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | has data_file => | 
| 23 |  |  |  |  |  |  | ( | 
| 24 |  |  |  |  |  |  | default  => sub{return 'data/en.wikipedia.org.wiki.ISO_3166-2'}, | 
| 25 |  |  |  |  |  |  | is       => 'rw', | 
| 26 |  |  |  |  |  |  | isa      => Str, | 
| 27 |  |  |  |  |  |  | required => 0, | 
| 28 |  |  |  |  |  |  | ); | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | has share_dir => | 
| 31 |  |  |  |  |  |  | ( | 
| 32 |  |  |  |  |  |  | default  => sub{return ''}, | 
| 33 |  |  |  |  |  |  | is       => 'rw', | 
| 34 |  |  |  |  |  |  | isa      => Str, | 
| 35 |  |  |  |  |  |  | required => 0, | 
| 36 |  |  |  |  |  |  | ); | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | has sqlite_file => | 
| 39 |  |  |  |  |  |  | ( | 
| 40 |  |  |  |  |  |  | default  => sub{return 'www.scraper.wikipedia.iso3166.sqlite'}, | 
| 41 |  |  |  |  |  |  | is       => 'rw', | 
| 42 |  |  |  |  |  |  | isa      => Str, | 
| 43 |  |  |  |  |  |  | required => 0, | 
| 44 |  |  |  |  |  |  | ); | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | has verbose => | 
| 47 |  |  |  |  |  |  | ( | 
| 48 |  |  |  |  |  |  | default  => sub{return 0}, | 
| 49 |  |  |  |  |  |  | is       => 'rw', | 
| 50 |  |  |  |  |  |  | isa      => Int, | 
| 51 |  |  |  |  |  |  | required => 0, | 
| 52 |  |  |  |  |  |  | ); | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | our $VERSION = '1.04'; | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | # ----------------------------------------------- | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | sub BUILD | 
| 59 |  |  |  |  |  |  | { | 
| 60 | 0 |  |  | 0 | 0 |  | my($self, $arg)    = @_; | 
| 61 | 0 |  |  |  |  |  | (my $package       = __PACKAGE__) =~ s/::/-/g; | 
| 62 | 0 | 0 |  |  |  |  | my($dir_name)      = $ENV{AUTHOR_TESTING} ? 'share' : File::ShareDir::dist_dir($package); | 
| 63 |  |  |  |  |  |  |  | 
| 64 | 0 |  |  |  |  |  | $self -> config_file(File::Spec -> catfile($dir_name, $self -> config_file) ); | 
| 65 | 0 |  |  |  |  |  | $self -> sqlite_file(File::Spec -> catfile($dir_name, $self -> sqlite_file) ); | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | } # End of BUILD. | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | # ----------------------------------------------- | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | sub log | 
| 72 |  |  |  |  |  |  | { | 
| 73 | 0 |  |  | 0 | 1 |  | my($self, $level, $s) = @_; | 
| 74 | 0 |  | 0 |  |  |  | $level ||= 'debug'; | 
| 75 | 0 |  | 0 |  |  |  | $s     ||= ''; | 
| 76 |  |  |  |  |  |  |  | 
| 77 | 0 | 0 |  |  |  |  | print "$level: $s. \n" if ($self -> verbose); | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | }	# End of log. | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | # ----------------------------------------------- | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub run | 
| 84 |  |  |  |  |  |  | { | 
| 85 | 0 |  |  | 0 | 0 |  | my($self) = @_; | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | # Return 0 for success and 1 for failure. | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 0 |  |  |  |  |  | return 0; | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | } # End of run. | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | # ----------------------------------------------- | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | 1; | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | =pod | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | =head1 NAME | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | WWW::Scraper::Wikipedia::ISO3166 - Gently scrape Wikipedia for ISO3166-2 data | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | =encoding utf-8 | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | =head1 Synopsis | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | Wikipedia I<has been scraped>. You do not need to run the scripts which download pages from there. | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | Just use the SQLite database shipped with this module, as discussed next. | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | =head2 Methods which return hashrefs | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | use WWW::Scraper::Wikipedia::ISO3166::Database; | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | my($database)     = WWW::Scraper::Wikipedia::ISO3166::Database -> new; | 
| 116 |  |  |  |  |  |  | my($countries)    = $database -> read_countries_table; | 
| 117 |  |  |  |  |  |  | my($subcountries) = $database -> read_subcountries_table; | 
| 118 |  |  |  |  |  |  | ... | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  | Each key in %$countries and %$subcountries points to a hashref of all columns for the given key. | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | So, $$countries{13} points to this hashref: | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | { | 
| 125 |  |  |  |  |  |  | id                => 13, | 
| 126 |  |  |  |  |  |  | code2             => 'AU', | 
| 127 |  |  |  |  |  |  | code3             => '', | 
| 128 |  |  |  |  |  |  | fc_name           => 'australia', | 
| 129 |  |  |  |  |  |  | has_subcountries  => 'Yes', | 
| 130 |  |  |  |  |  |  | name              => 'Australia', | 
| 131 |  |  |  |  |  |  | timestamp         => '2012-05-08 04:04:43', | 
| 132 |  |  |  |  |  |  | } | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | One element of %$subcountries is $$subcountries{4276}: | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | { | 
| 137 |  |  |  |  |  |  | id         => 4276, | 
| 138 |  |  |  |  |  |  | country_id => 13, | 
| 139 |  |  |  |  |  |  | code       => 'AU-VIC', | 
| 140 |  |  |  |  |  |  | fc_name    => 'victoria', | 
| 141 |  |  |  |  |  |  | name       => 'Victoria', | 
| 142 |  |  |  |  |  |  | sequence   => 5, | 
| 143 |  |  |  |  |  |  | timestamp  => '2012-05-08 04:05:27', | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | =head3 Warnings | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | # 1: These hashrefs use the table's primary key as the hashref's key. In the case of the I<countries> | 
| 149 |  |  |  |  |  |  | table, the primary key is the country's id, and is used as subcountries.country_id. But, in the case of | 
| 150 |  |  |  |  |  |  | the I<subcountries> table, the id does not have any meaning apart from being a db primary key. | 
| 151 |  |  |  |  |  |  | See L</What is the database schema?> for details. | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | # 2: Do not assume subcountry names are unique within a country. | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | L. | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | =head2 Scripts which output to a file | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | All scripts respond to the -h option. | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | Some examples: | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | shell>perl scripts/export.as.csv.pl -c countries.csv -s subcountries.csv | 
| 164 |  |  |  |  |  |  | shell>perl scripts/export.as.html.pl -w iso.3166-2.html | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | This file is on-line at: L. | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | shell>perl scripts/report.statistics.pl | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | Output statistics: | 
| 171 |  |  |  |  |  |  | countries_in_db => 249. | 
| 172 |  |  |  |  |  |  | has_subcounties => 199. | 
| 173 |  |  |  |  |  |  | subcountries_in_db => 4593. | 
| 174 |  |  |  |  |  |  | subcountry_files_downloaded => 249. | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | =head1 Description | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | C<WWW::Scraper::Wikipedia::ISO3166> is a pure Perl module. | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | It is used to download various ISO3166-related pages from Wikipedia, and to then import data | 
| 181 |  |  |  |  |  |  | (scraped from those pages) into an SQLite database. | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | The pages have already been downloaded, so that phase only needs to be run when pages are updated. | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | Likewise, the data has been imported. | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | This means you would normally only ever use the database in read-only mode. | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | Its components are: | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | =over 4 | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | =item o scripts/get.country.page.pl | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | 1: Downloads the ISO3166-1_alpha-3 page from Wikipedia. | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | Input: L<http://en.wikipedia.org/wiki/ISO_3166-1_alpha-3>. | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | Output: data/en.wikipedia.org.wiki.ISO_3166-2.3.html. | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | 2: Downloads the ISO3166-2 page from Wikipedia. | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | Input: L<http://en.wikipedia.org/wiki/ISO_3166-2>. | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | Output: data/en.wikipedia.org.wiki.ISO_3166-2.html. | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | =item o scripts/populate.countries.pl | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | Imports country data into an SQLite database. | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | Inputs: data/en.wikipedia.org.wiki.ISO_3166-2.html, data/en.wikipedia.org.wiki.ISO_3166-2.3.html. | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | Output: share/www.scraper.wikipedia.iso3166.sqlite. | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | =item o scripts/get.subcountry.page.pl and scripts/get.subcountry.pages.pl | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | Downloads each country's corresponding subcountries page. | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | Source: http://en.wikipedia.org/wiki/ISO_3166:$code2.html. | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | Output: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | =item o scripts/populate.subcountry.pl and scripts/populate.subcountries.pl | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | Imports subcountry data into the database. | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | Source: data/en.wikipedia.org.wiki.ISO_3166-2.$code2.html. | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | Output: share/www.scraper.wikipedia.iso3166.sqlite. | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | Note: When the distro is installed, this SQLite file is installed too. | 
| 232 |  |  |  |  |  |  | See L</Where is the database?> for details. | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | =item o scripts/export.as.csv.pl -c c.csv -s s.csv | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | Exports the country and subcountry data as CSV. | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | Input: share/www.scraper.wikipedia.iso3166.sqlite. | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | Output: data/countries.csv and data/subcountries.csv. | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | =item o scripts/export.as.html.pl -w c.html | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | Exports the country and subcountry data as HTML. | 
| 245 |  |  |  |  |  |  |  | 
| 246 |  |  |  |  |  |  | Input: share/www.scraper.wikipedia.iso3166.sqlite. | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | Output: data/iso.3166-2.html. | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | On-line: L. | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | =item o scripts/get.statoids.pl | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | Downloads some pages from L in case one day we need to convert from FIPS to ISO 3166-2. | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | See data/List_of_FIPS_region_codes_*.html. | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | =item o scripts/populate.fips.codes.pl | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | This reads the files output by scripts/get.statoids.pl and produces 2 reports, data/wikipedia.fips.codes.txt | 
| 261 |  |  |  |  |  |  | and data/wikipedia.fips.mismatch.log. These are discussed in L</What FIPS data is included?>. | 
| 262 |  |  |  |  |  |  |  | 
| 263 |  |  |  |  |  |  | =item o scripts/test.nfc.pl | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | See L</FAQ> (the entry on why Unicode::Normalize's NFC() is used for sorting) for a discussion of this script. | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | =back | 
| 268 |  |  |  |  |  |  |  | 
| 269 |  |  |  |  |  |  | =head1 Constructor and initialization | 
| 270 |  |  |  |  |  |  |  | 
| 271 |  |  |  |  |  |  | new(...) returns an object of type C<WWW::Scraper::Wikipedia::ISO3166>. | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | This is the class's constructor. | 
| 274 |  |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | Usage: C<< WWW::Scraper::Wikipedia::ISO3166 -> new() >>. | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | This method takes a hash of options. | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | Call C<new()> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>. | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | Available options (these are also methods): | 
| 282 |  |  |  |  |  |  |  | 
| 283 |  |  |  |  |  |  | =over 4 | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | =item o config_file => $file_name | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | The name of the file containing config info, such as I and I. | 
| 288 |  |  |  |  |  |  | These are used by L. | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | Default: .htwww.scraper.wikipedia.iso3166.conf. | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | =item o sqlite_file => $file_name | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | The name of the SQLite database of country and subcountry data. | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | Default: www.scraper.wikipedia.iso3166.sqlite. | 
| 301 |  |  |  |  |  |  |  | 
| 302 |  |  |  |  |  |  | =item o verbose => $integer | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | Print more or less information. | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | Default: 0 (print nothing). | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | =back | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  | =head1 Distributions | 
| 311 |  |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  | This module is available as a Unix-style distro (*.tgz). | 
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | Install WWW::Scraper::Wikipedia::ISO3166 as you would for any C<Perl> module: | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | Run: | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | cpanm WWW::Scraper::Wikipedia::ISO3166 | 
| 319 |  |  |  |  |  |  |  | 
| 320 |  |  |  |  |  |  | or run: | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | sudo cpan WWW::Scraper::Wikipedia::ISO3166 | 
| 323 |  |  |  |  |  |  |  | 
| 324 |  |  |  |  |  |  | or unpack the distro, and then run: | 
| 325 |  |  |  |  |  |  |  | 
| 326 |  |  |  |  |  |  | perl Makefile.PL | 
| 327 |  |  |  |  |  |  | make (or dmake) | 
| 328 |  |  |  |  |  |  | make test | 
| 329 |  |  |  |  |  |  | make install | 
| 330 |  |  |  |  |  |  |  | 
| 331 |  |  |  |  |  |  | See L for details. | 
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | See L for | 
| 334 |  |  |  |  |  |  | help on unpacking and installing. | 
| 335 |  |  |  |  |  |  |  | 
| 336 |  |  |  |  |  |  | =head1 Methods | 
| 337 |  |  |  |  |  |  |  | 
| 338 |  |  |  |  |  |  | =head2 config_file($file_name) | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | Get or set the name of the config file. | 
| 341 |  |  |  |  |  |  |  | 
| 342 |  |  |  |  |  |  | The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. | 
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | Also, I<config_file> is an option to L</new()>. | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | =head2 log($level => $s) | 
| 347 |  |  |  |  |  |  |  | 
| 348 |  |  |  |  |  |  | Print $s at log level $level, if ($self -> verbose); | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | Since $self -> verbose defaults to 0, nothing is printed by default. | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  | =head2 new() | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | See L</Constructor and initialization>. | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | =head2 sqlite_file($file_name) | 
| 357 |  |  |  |  |  |  |  | 
| 358 |  |  |  |  |  |  | Get or set the name of the database file. | 
| 359 |  |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  | The code prefixes this name with the directory returned by L<File::ShareDir/dist_dir()>. | 
| 361 |  |  |  |  |  |  |  | 
| 362 |  |  |  |  |  |  | Also, I<sqlite_file> is an option to L</new()>. | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | =head2 verbose($integer) | 
| 365 |  |  |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | Get or set the verbosity level. | 
| 367 |  |  |  |  |  |  |  | 
| 368 |  |  |  |  |  |  | Also, I<verbose> is an option to L</new()>. | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | =head1 FAQ | 
| 371 |  |  |  |  |  |  |  | 
| 372 |  |  |  |  |  |  | =head2 Design faults in ISO3166 | 
| 373 |  |  |  |  |  |  |  | 
| 374 |  |  |  |  |  |  | Where ISO3166 uses Country Name, I would have used Long Name and Short Name. | 
| 375 |  |  |  |  |  |  |  | 
| 376 |  |  |  |  |  |  | Then we'd have: | 
| 377 |  |  |  |  |  |  |  | 
| 378 |  |  |  |  |  |  | Long Name:  Bolivia, Plurinational State of | 
| 379 |  |  |  |  |  |  | Short Name: Bolivia | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | This distro uses the value directly from Wikipedia, which is what I have called 'Long Name', for | 
| 382 |  |  |  |  |  |  | all country and subcountry names. | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  | =head2 Where is the database? | 
| 385 |  |  |  |  |  |  |  | 
| 386 |  |  |  |  |  |  | It is shipped in share/www.scraper.wikipedia.iso3166.sqlite. | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | It is installed into the distro's shared dir, as returned by L<File::ShareDir/dist_dir()>. | 
| 389 |  |  |  |  |  |  | On my machine that's: | 
| 390 |  |  |  |  |  |  |  | 
| 391 |  |  |  |  |  |  | /home/ron/perl5/perlbrew/perls/perl-5.14.2/lib/site_perl/5.14.2/auto/share/dist/WWW-Scraper-Wikipedia-ISO3166/www.scraper.wikipedia.iso3166.sqlite. | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  | =head2 What is the database schema? | 
| 394 |  |  |  |  |  |  |  | 
| 395 |  |  |  |  |  |  | A single SQLite file holds 2 tables, I<countries> and I<subcountries>: | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | countries           subcountries | 
| 398 |  |  |  |  |  |  | ---------           ------------ | 
| 399 |  |  |  |  |  |  | id                  id | 
| 400 |  |  |  |  |  |  | code2               country_id | 
| 401 |  |  |  |  |  |  | code3               code | 
| 402 |  |  |  |  |  |  | fc_name             fc_name | 
| 403 |  |  |  |  |  |  | has_subcountries    name | 
| 404 |  |  |  |  |  |  | name                sequence | 
| 405 |  |  |  |  |  |  | timestamp           timestamp | 
| 406 |  |  |  |  |  |  |  | 
| 407 |  |  |  |  |  |  | I<code3> has a couple of special cases. 2 countries have no value for code3: | 
| 408 |  |  |  |  |  |  | Libyan Arab Jamahiriya and Sint Maarten. | 
| 409 |  |  |  |  |  |  | 3-letter codes which almost match: LBY => Libya and MAF => Saint Martin (French part). | 
| 410 |  |  |  |  |  |  |  | 
| 411 |  |  |  |  |  |  | I<subcountries.country_id> points to I<countries.id>. | 
| 412 |  |  |  |  |  |  |  | 
| 413 |  |  |  |  |  |  | I<fc_name> is output from calling fc(decode('utf8', $name) ). | 
| 414 |  |  |  |  |  |  |  | 
| 415 |  |  |  |  |  |  | For decode(), see L<Encode>. | 
| 416 |  |  |  |  |  |  |  | 
| 417 |  |  |  |  |  |  | For fc(), see L<Unicode::CaseFold>. | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | $name is from a Wikipedia page. | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | I<has_subcountries> is 'Yes' or 'No'. | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | I<name> is output from calling decode('utf8', $name). | 
| 424 |  |  |  |  |  |  |  | 
| 425 |  |  |  |  |  |  | I<sequence> is a number (1 .. N) indicating the order in which subcountry names appear in the list | 
| 426 |  |  |  |  |  |  | on that subcountry's Wikipedia page. | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | See the source code of L for details of the SQL | 
| 429 |  |  |  |  |  |  | used to create the tables. | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | =head2 What do I do if I find a mistake in the data? | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  | What data? What mistake? How do you know it's wrong? | 
| 434 |  |  |  |  |  |  |  | 
| 435 |  |  |  |  |  |  | Also, you must decide what exactly you were expecting the data to be. | 
| 436 |  |  |  |  |  |  |  | 
| 437 |  |  |  |  |  |  | If the problem is the ISO data, report it to them. | 
| 438 |  |  |  |  |  |  |  | 
| 439 |  |  |  |  |  |  | If the problem is the Wikipedia data, get agreement from everyone concerned and update Wikipedia. | 
| 440 |  |  |  |  |  |  |  | 
| 441 |  |  |  |  |  |  | If the problem is the output from my code, try to identify the bug in the code and report it via the | 
| 442 |  |  |  |  |  |  | usual mechanism. See L</Support>. | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | If the problem is with your computer's display of the data, consider (in alphabetical order): | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | =over 4 | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | =item o CSV | 
| 449 |  |  |  |  |  |  |  | 
| 450 |  |  |  |  |  |  | Does the file display correctly in 'Emacs'? On the screen using 'less'? | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | scripts/export.as.csv.pl uses: use open ':utf8'; | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | Is that not working? | 
| 455 |  |  |  |  |  |  |  | 
| 456 |  |  |  |  |  |  | =item o DBD::SQLite | 
| 457 |  |  |  |  |  |  |  | 
| 458 |  |  |  |  |  |  | Did you set the sqlite_unicode attribute? Use something like: | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | my($dsn)        = 'dbi:SQLite:dbname=www.scraper.wikipedia.iso3166.sqlite'; # Sample only. | 
| 461 |  |  |  |  |  |  | my($attributes) = {AutoCommit => 1, RaiseError => 1, sqlite_unicode => 1}; | 
| 462 |  |  |  |  |  |  | my($dbh)        = DBI -> connect($dsn, '', '', $attributes); | 
| 463 |  |  |  |  |  |  |  | 
| 464 |  |  |  |  |  |  | The SQLite file ships in the share/ directory of the distro, and must be found by File::ShareDir | 
| 465 |  |  |  |  |  |  | at run time. | 
| 466 |  |  |  |  |  |  |  | 
| 467 |  |  |  |  |  |  | Did you set the foreign_keys pragma (if needed)? Use: | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | $dbh -> do('PRAGMA foreign_keys = ON'); | 
| 470 |  |  |  |  |  |  |  | 
| 471 |  |  |  |  |  |  | =item o HTML | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | The template htdocs/assets/templates/www/scraper/wikipedia/iso3166/iso3166.report.tx which ships with | 
| 474 |  |  |  |  |  |  | this distro contains this line: | 
| 475 |  |  |  |  |  |  |  | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  |  | 
| 478 |  |  |  |  |  |  | Is that not working? | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | =item o Locale | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  | Here's my setup: | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  | shell>locale | 
| 485 |  |  |  |  |  |  | LANG=en_AU.utf8 | 
| 486 |  |  |  |  |  |  | LANGUAGE= | 
| 487 |  |  |  |  |  |  | LC_CTYPE="en_AU.utf8" | 
| 488 |  |  |  |  |  |  | LC_NUMERIC="en_AU.utf8" | 
| 489 |  |  |  |  |  |  | LC_TIME="en_AU.utf8" | 
| 490 |  |  |  |  |  |  | LC_COLLATE="en_AU.utf8" | 
| 491 |  |  |  |  |  |  | LC_MONETARY="en_AU.utf8" | 
| 492 |  |  |  |  |  |  | LC_MESSAGES="en_AU.utf8" | 
| 493 |  |  |  |  |  |  | LC_PAPER="en_AU.utf8" | 
| 494 |  |  |  |  |  |  | LC_NAME="en_AU.utf8" | 
| 495 |  |  |  |  |  |  | LC_ADDRESS="en_AU.utf8" | 
| 496 |  |  |  |  |  |  | LC_TELEPHONE="en_AU.utf8" | 
| 497 |  |  |  |  |  |  | LC_MEASUREMENT="en_AU.utf8" | 
| 498 |  |  |  |  |  |  | LC_IDENTIFICATION="en_AU.utf8" | 
| 499 |  |  |  |  |  |  | LC_ALL= | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  | =item o OS | 
| 502 |  |  |  |  |  |  |  | 
| 503 |  |  |  |  |  |  | Unicode is a moving target. Perhaps your OS's installed version of unicode files needs updating. | 
| 504 |  |  |  |  |  |  |  | 
| 505 |  |  |  |  |  |  | =item o SQLite | 
| 506 |  |  |  |  |  |  |  | 
| 507 |  |  |  |  |  |  | Both Oracle and SQLite.org ship a program called sqlite3. They are not compatible. | 
| 508 |  |  |  |  |  |  | Which one are you using? I use the one from SQLite.org. | 
| 509 |  |  |  |  |  |  |  | 
| 510 |  |  |  |  |  |  | AFAICT, sqlite3 does not have command line options, or options while running, to set unicode or pragmas. | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | =back | 
| 513 |  |  |  |  |  |  |  | 
| 514 |  |  |  |  |  |  | =head2 Why did you use L<Unicode::Normalize>'s NFC() for sorting? | 
| 515 |  |  |  |  |  |  |  | 
| 516 |  |  |  |  |  |  | This question implies why not use NFD() instead. | 
| 517 |  |  |  |  |  |  |  | 
| 518 |  |  |  |  |  |  | Run scripts/test.nfc.pl, and the output is: | 
| 519 |  |  |  |  |  |  |  | 
| 520 |  |  |  |  |  |  | code2 => AX | 
| 521 |  |  |  |  |  |  | code3 => ALA | 
| 522 |  |  |  |  |  |  | fc_name => åland islands | 
| 523 |  |  |  |  |  |  | has_subcountries => No | 
| 524 |  |  |  |  |  |  | id => 15 | 
| 525 |  |  |  |  |  |  | name => Åland Islands | 
| 526 |  |  |  |  |  |  | timestamp => 2012-05-13 23:37:20 | 
| 527 |  |  |  |  |  |  |  | 
| 528 |  |  |  |  |  |  | And this (Åland Islands) is what Wikipedia displays. So, NFC() it is. | 
| 529 |  |  |  |  |  |  |  | 
| 530 |  |  |  |  |  |  | See L, and specifically prescription # 1. | 
| 531 |  |  |  |  |  |  |  | 
| 532 |  |  |  |  |  |  | See also section 1.2 Normalization Forms in L. | 
| 533 |  |  |  |  |  |  |  | 
| 534 |  |  |  |  |  |  | See also L. | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  | =head2 What is $ENV{AUTHOR_TESTING} used for? | 
| 537 |  |  |  |  |  |  |  | 
| 538 |  |  |  |  |  |  | When this env var is 1, scripts output to share/*.sqlite within the distro's dir. That's how I populate the | 
| 539 |  |  |  |  |  |  | database tables. After installation, the database is elsewhere, and read-only, so you don't want the scripts | 
| 540 |  |  |  |  |  |  | writing to that copy anyway. | 
| 541 |  |  |  |  |  |  |  | 
| 542 |  |  |  |  |  |  | At run-time, L<File::ShareDir> is used to find the installed version of *.sqlite. | 
| 543 |  |  |  |  |  |  |  | 
| 544 |  |  |  |  |  |  | =head2 What FIPS data is included? | 
| 545 |  |  |  |  |  |  |  | 
| 546 |  |  |  |  |  |  | Firstly, scripts/get.fips.pages.pl downloads some Wikipedia data, into data/List_of_FIPS_region_codes_*.html. | 
| 547 |  |  |  |  |  |  |  | 
| 548 |  |  |  |  |  |  | Secondly, the latter files are parsed by scripts/populate.fips.codes.pl and the 2 reports are in | 
| 549 |  |  |  |  |  |  | data/wikipedia.fips.codes.txt, and data/wikipedia.fips.mismatch.log. | 
| 550 |  |  |  |  |  |  |  | 
| 551 |  |  |  |  |  |  | This data is I<not> written into the SQLite database yet, but it's available in case it's included | 
| 552 |  |  |  |  |  |  | one day. | 
| 553 |  |  |  |  |  |  |  | 
| 554 |  |  |  |  |  |  | =head1 Wikipedia's Terms of Use | 
| 555 |  |  |  |  |  |  |  | 
| 556 |  |  |  |  |  |  | See L. | 
| 557 |  |  |  |  |  |  |  | 
| 558 |  |  |  |  |  |  | Also, since I'm distributing copies of Wikipedia-sourced material, reformatted but not changed by editing, | 
| 559 |  |  |  |  |  |  | I hereby give notice that their material is released under CC-BY-SA. | 
| 560 |  |  |  |  |  |  | See L for that licence. | 
| 561 |  |  |  |  |  |  |  | 
| 562 |  |  |  |  |  |  | =head1 References | 
| 563 |  |  |  |  |  |  |  | 
| 564 |  |  |  |  |  |  | In no particular order: | 
| 565 |  |  |  |  |  |  |  | 
| 566 |  |  |  |  |  |  | L | 
| 567 |  |  |  |  |  |  |  | 
| 568 |  |  |  |  |  |  | L | 
| 569 |  |  |  |  |  |  |  | 
| 570 |  |  |  |  |  |  | L | 
| 571 |  |  |  |  |  |  |  | 
| 572 |  |  |  |  |  |  | L | 
| 573 |  |  |  |  |  |  |  | 
| 574 |  |  |  |  |  |  | L | 
| 575 |  |  |  |  |  |  |  | 
| 576 |  |  |  |  |  |  | This is a complex set of XML files concerning currency, postal, etc, formats and other details for various countries | 
| 577 |  |  |  |  |  |  | and/or languages. | 
| 578 |  |  |  |  |  |  |  | 
| 579 |  |  |  |  |  |  | For Debian etc users: /usr/share/xml/iso-codes/iso_3166_2.xml, as installed from the iso-codes package, with: | 
| 580 |  |  |  |  |  |  |  | 
| 581 |  |  |  |  |  |  | sudo apt-get install iso-codes | 
| 582 |  |  |  |  |  |  |  | 
| 583 |  |  |  |  |  |  | L | 
| 584 |  |  |  |  |  |  |  | 
| 585 |  |  |  |  |  |  | L | 
| 586 |  |  |  |  |  |  |  | 
| 587 |  |  |  |  |  |  | L | 
| 588 |  |  |  |  |  |  |  | 
| 589 |  |  |  |  |  |  | Check the Monthly Archives at Perl.com, starting in April 2012, for a series of Unicode-specific articles by | 
| 590 |  |  |  |  |  |  | Tom Christiansen. | 
| 591 |  |  |  |  |  |  |  | 
| 592 |  |  |  |  |  |  | L | 
| 593 |  |  |  |  |  |  |  | 
| 594 |  |  |  |  |  |  | L | 
| 595 |  |  |  |  |  |  |  | 
| 596 |  |  |  |  |  |  | =head1 Repository | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | L<https://github.com/ronsavage/WWW-Scraper-Wikipedia-ISO3166> | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | =head1 Support | 
| 601 |  |  |  |  |  |  |  | 
| 602 |  |  |  |  |  |  | Email the author, or log a bug on RT: | 
| 603 |  |  |  |  |  |  |  | 
| 604 |  |  |  |  |  |  | L<https://rt.cpan.org/Public/Dist/Display.html?Name=WWW::Scraper::Wikipedia::ISO3166>. | 
| 605 |  |  |  |  |  |  |  | 
| 606 |  |  |  |  |  |  | =head1 Author | 
| 607 |  |  |  |  |  |  |  | 
| 608 |  |  |  |  |  |  | C<WWW::Scraper::Wikipedia::ISO3166> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012. | 
| 609 |  |  |  |  |  |  |  | 
| 610 |  |  |  |  |  |  | Home page: L<http://savage.net.au/index.html>. | 
| 611 |  |  |  |  |  |  |  | 
| 612 |  |  |  |  |  |  | =head1 Copyright | 
| 613 |  |  |  |  |  |  |  | 
| 614 |  |  |  |  |  |  | Australian copyright (c) 2012 Ron Savage. | 
| 615 |  |  |  |  |  |  |  | 
| 616 |  |  |  |  |  |  | All Programs of mine are 'OSI Certified Open Source Software'; | 
| 617 |  |  |  |  |  |  | you can redistribute them and/or modify them under the terms of | 
| 618 |  |  |  |  |  |  | The Artistic License, a copy of which is available at: | 
| 619 |  |  |  |  |  |  | http://www.opensource.org/licenses/index.html | 
| 620 |  |  |  |  |  |  |  | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | =cut |