| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Alvis::Convert; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | $Alvis::Convert::VERSION = '0.4'; | 
| 4 |  |  |  |  |  |  |  | 
| 5 |  |  |  |  |  |  | ######################################################################## | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | # A general "set of document files in some format" -> | 
| 8 |  |  |  |  |  |  | # "set of files in ALVIS format" converter. | 
| 9 |  |  |  |  |  |  | # | 
| 10 |  |  |  |  |  |  | #   -- Kimmo Valtonen | 
| 11 |  |  |  |  |  |  | # | 
| 12 |  |  |  |  |  |  | ######################################################################## | 
| 13 |  |  |  |  |  |  |  | 
| 14 | 1 |  |  | 1 |  | 24461 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 15 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 16 |  |  |  |  |  |  |  | 
| 17 | 1 |  |  | 1 |  | 6 | use Carp; | 
|  | 1 |  |  |  |  | 6 |  | 
|  | 1 |  |  |  |  | 92 |  | 
| 18 | 1 |  |  | 1 |  | 1090 | use Data::Dumper; | 
|  | 1 |  |  |  |  | 23561 |  | 
|  | 1 |  |  |  |  | 226 |  | 
| 19 | 1 |  |  | 1 |  | 3602 | use Encode; | 
|  | 1 |  |  |  |  | 21492 |  | 
|  | 1 |  |  |  |  | 106 |  | 
| 20 | 1 |  |  | 1 |  | 487 | use XML::LibXML; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | use Alvis::Canonical; | 
| 23 |  |  |  |  |  |  | use Alvis::Document; | 
| 24 |  |  |  |  |  |  | use Alvis::Document::Encoding; | 
| 25 |  |  |  |  |  |  | use Alvis::Document::Meta; | 
| 26 |  |  |  |  |  |  | use Alvis::Document::Links; | 
| 27 |  |  |  |  |  |  | use Alvis::Document::Type; | 
| 28 |  |  |  |  |  |  | use Alvis::AinoDump; | 
| 29 |  |  |  |  |  |  | use Alvis::Wikipedia::XMLDump; | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | ############################################################################ | 
| 33 |  |  |  |  |  |  | # | 
| 34 |  |  |  |  |  |  | #  Global variables | 
| 35 |  |  |  |  |  |  | # | 
| 36 |  |  |  |  |  |  | ############################################################################ | 
| 37 |  |  |  |  |  |  |  | 
# Identifiers for the kinds of input entries the converter handles.
# Declared with 'our', so they are package variables of Alvis::Convert
# (e.g. $Alvis::Convert::HTML).
our ($UNKNOWN_FILE_TYPE,
     $DIR,
     $META,
     $HTML,
     $NEWS_XML,
     $AINODUMP,
     $WIKIPEDIA_XML_DUMP) = (0 .. 6);

# Human-readable description of each entry type, keyed by its identifier.
my %RecognizedEntryTypeDescs = (
    $UNKNOWN_FILE_TYPE  => 'Guess the file type',
    $DIR                => 'Directory',
    $META               => 'Meta information',
    $HTML               => 'HTML',
    $NEWS_XML           => 'XML information about a news article',
    $AINODUMP           => 'ainodump',
    $WIKIPEDIA_XML_DUMP => 'Wikipedia XML dump',
);
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | ############################################################################ | 
| 51 |  |  |  |  |  |  | # | 
| 52 |  |  |  |  |  |  | #  Error message stuff | 
| 53 |  |  |  |  |  |  | # | 
| 54 |  |  |  |  |  |  | ############################################################################ | 
| 55 |  |  |  |  |  |  |  | 
# Error codes. The position in this list determines the numeric value
# (0 .. 40), so new codes must be appended and the range extended to match.
my ($ERR_OK,                 $ERR_CANONICAL,        $ERR_ASSEMBLER,
    $ERR_CANDOC_CONV,        $ERR_META,             $ERR_LINKS,
    $ERR_LINK_ADD,           $ERR_ASSEMBLE,         $ERR_NO_NEWS_XML_TEXT,
    $ERR_XML_PARSER,         $ERR_XML_PARSE,        $ERR_NO_URL,
    $ERR_ENCODING_WIZARD,    $ERR_UTF8_CONV,        $ERR_ENCODING_CONV,
    $ERR_TYPE_SUFFIX,        $ERR_READ_HTML,        $ERR_READ_NEWS_XML,
    $ERR_ALVIS_CONV,         $ERR_ALVIS_SUFFIX,     $ERR_NO_OUTPUT_ROOT_DIR,
    $ERR_WRITING_OUTPUT,     $ERR_DIR_CONV,         $ERR_NO_HTML_F,
    $ERR_META_F,             $ERR_HTML_F,           $ERR_NEWS_XML_F,
    $ERR_DOC_ALVIS_CONV,     $ERR_NEWS_XML_PARSE,   $ERR_MULTIPLE_SUFFIX_MEANING,
    $ERR_OUTPUT_ALVIS,       $ERR_OUTPUT_SET_OF_RECORDS,
    $ERR_AINODUMP,           $ERR_OPEN_AINODUMP,    $ERR_AINODUMP_PROCESS,
    $ERR_DOC_TYPE_WIZARD,    $ERR_TYPE_GUESS,       $ERR_UNK_FILE_TYPE,
    $ERR_WIKIPEDIA,          $ERR_OPEN_WIKIPEDIA,   $ERR_WIKIPEDIA_CONV,
) = (0 .. 40);

# Canned message for every error code; _set_err_state() refuses codes
# that have no entry here.
my %ErrMsgs = (
    $ERR_OK                => '',
    $ERR_CANONICAL         => 'Could not instantiate Alvis::Canonical.',
    $ERR_ASSEMBLER         => 'Could not instantiate Alvis::Document.',
    $ERR_CANDOC_CONV       => 'Conversion to canonicalDocument failed.',
    $ERR_META              => 'Could not instantiate Alvis::Document::Meta.',
    $ERR_LINKS             => 'Could not instantiate Alvis::Document::Links.',
    $ERR_LINK_ADD          => 'Adding a link failed.',
    $ERR_ASSEMBLE          => 'Assembling a document failed.',
    $ERR_NO_NEWS_XML_TEXT  =>
        'Unable to extract the content from News XML format.',
    $ERR_XML_PARSER        => 'Could not instantiate XML::LibXML.',
    $ERR_XML_PARSE         => 'Parsing the XML failed.',
    $ERR_NO_URL            => 'No URL.',
    $ERR_ENCODING_WIZARD   =>
        'Unable to instantiate Alvis::Document::Encoding.',
    $ERR_UTF8_CONV         => 'Trying to convert to UTF-8 failed.',
    $ERR_ENCODING_CONV     =>
        'Converting from the supposed source encoding to UTF-8 failed.',
    $ERR_TYPE_SUFFIX       => 'No suffix given for a type.',
    $ERR_READ_HTML         => 'Reading the HTML failed.',
    $ERR_READ_NEWS_XML     => 'Reading the news XML failed.',
    $ERR_ALVIS_CONV        => 'Conversion to Alvis format failed.',
    $ERR_ALVIS_SUFFIX      => 'No Alvis suffix defined.',
    $ERR_NO_OUTPUT_ROOT_DIR => 'No output root directory.',
    $ERR_WRITING_OUTPUT    => 'Writing the output failed.',
    $ERR_DIR_CONV          => 'Converting a directory failed.',
    $ERR_NO_HTML_F         => 'No HTML file.',
    $ERR_META_F            => 'Opening the meta file failed.',
    $ERR_HTML_F            => 'Opening the HTML file failed.',
    $ERR_NEWS_XML_F        => 'Opening the news XML file failed.',
    $ERR_DOC_ALVIS_CONV    => 'Converting a document to Alvis format failed.',
    $ERR_NEWS_XML_PARSE    => 'Parsing the news XML failed.',
    $ERR_MULTIPLE_SUFFIX_MEANING =>
        'Multiple meanings for a single suffix.',
    $ERR_OUTPUT_ALVIS      => 'Outputting the Alvis records failed.',
    $ERR_OUTPUT_SET_OF_RECORDS =>
        'Outputting a set of records to a file as a documentCollection  failed.',
    $ERR_AINODUMP          => 'Instantiating Alvis::AinoDump failed.',
    $ERR_OPEN_AINODUMP     => 'Opening an ainodump file failed.',
    $ERR_AINODUMP_PROCESS  => 'Processing an ainodump file failed.',
    $ERR_DOC_TYPE_WIZARD   => 'Instantiating Alvis::Document::Type failed.',
    $ERR_TYPE_GUESS        => q{Guessing the document's type failed.},
    $ERR_UNK_FILE_TYPE     => 'Unrecognized file type.',
    $ERR_WIKIPEDIA         =>
        'Instantiating Alvis::Wikipedia::XMLDump failed.',
    $ERR_OPEN_WIKIPEDIA    => 'Opening the Wikipedia XML dump file failed.',
    $ERR_WIKIPEDIA_CONV    =>
        'Extracting the articles from the Wikipedia XML dump failed.',
);
| 148 |  |  |  |  |  |  |  | 
# Record the outcome of an operation in the object's error string.
#
#   $errcode  one of the $ERR_* codes; must be a key of %ErrMsgs
#   $errmsg   optional detail text appended after the canned message
#
# $ERR_OK empties the error string. Any other known code appends
# " <canned message>" and, if given, " <detail>", so messages from
# successive failures accumulate until the state is reset.
# An undefined or unknown code is treated as a programming error and
# aborts with a stack trace (confess).
sub _set_err_state {
    my ($self, $errcode, $errmsg) = @_;

    confess("set_err_state() called with an undefined argument.")
        if !defined($errcode);

    confess("Internal error: set_err_state() called with an " .
            "unrecognized argument ($errcode).")
        if !exists($ErrMsgs{$errcode});

    return $self->{errstr} = "" if $errcode == $ERR_OK;

    $self->{errstr} .= " " . $ErrMsgs{$errcode};
    $self->{errstr} .= " " . $errmsg if defined($errmsg);
}
| 183 |  |  |  |  |  |  |  | 
# Discard any error text accumulated on this object.
sub clearerr {
    my ($self) = @_;

    $self->{errstr} = "";
}
| 190 |  |  |  |  |  |  |  | 
# Return the error text accumulated since the error state was last reset.
sub errmsg {
    my ($self) = @_;

    return $self->{errstr};
}
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  | ############################################################################ | 
| 199 |  |  |  |  |  |  | # | 
| 200 |  |  |  |  |  |  | #          Public methods | 
| 201 |  |  |  |  |  |  | # | 
| 202 |  |  |  |  |  |  | ############################################################################ | 
| 203 |  |  |  |  |  |  |  | 
# Constructor.
#
# Accepts the name/value pairs understood by _init() and builds the helper
# objects the converter relies on (canonical converter, document
# assembler, XML parser, encoding wizard, Wikipedia converter and document
# type wizard).
#
# Returns the new object, or undef if any helper could not be instantiated.
# NOTE(review): on failure the error text is stored on the object that is
# then thrown away, so the caller cannot retrieve it via errmsg().
sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;

    my $self = bless({}, $class);

    $self->_set_err_state($ERR_OK);
    $self->_init(@_);

    # A URL base, when given, always ends with a slash.
    if (defined($self->{urlBase}) && $self->{urlBase} !~ /\/$/) {
        $self->{urlBase} .= '/';
    }

    # Helpers in construction order: [ object field, error code on
    # failure, builder ]. Builders run lazily, one at a time, so the
    # first failure stops construction.
    my @helpers = (
        [ canonicalConverter => $ERR_CANONICAL,
          sub { Alvis::Canonical->new() } ],
        [ documentAssembler => $ERR_ASSEMBLER,
          sub {
              Alvis::Document->new(
                  includeOriginalDocument => $self->{includeOriginalDocument});
          } ],
        [ XMLParser => $ERR_XML_PARSER,
          sub { XML::LibXML->new() } ],
        [ encodingWizard => $ERR_ENCODING_WIZARD,
          sub { Alvis::Document::Encoding->new(defaultEncoding => undef) } ],
        [ wikipediaConverter => $ERR_WIKIPEDIA,
          sub {
              Alvis::Wikipedia::XMLDump->new(expandVariables  => 1,
                                             skipRedirects    => 0,
                                             dumpCategoryData => 1,
                                             dumpTemplateData => 1);
          } ],
        [ docTypeWizard => $ERR_DOC_TYPE_WIZARD,
          sub {
              Alvis::Document::Type->new(
                  defaultType    => $self->{defaultDocType},
                  defaultSubType => $self->{defaultDocSubType});
          } ],
    );

    for my $helper (@helpers) {
        my ($field, $errcode, $build) = @$helper;

        $self->{$field} = $build->();
        if (!defined($self->{$field})) {
            $self->_set_err_state($errcode);
            return undef;
        }
    }

    return $self;
}
| 280 |  |  |  |  |  |  |  | 
# Populate the object with default settings, then overlay the caller's
# name/value pairs (as passed to new()) on top of them.
#
# Any key may be supplied; keys that are not defaults (e.g. urlBase) are
# simply stored on the object.
sub _init {
    my $self = shift;

    my %defaults = (
        fileType                => undef,
        sourceEncoding          => undef,
        urlFromBasename         => 0,
        outputAtSameLocation    => 0,
        alvisSuffix             => 'alvis',
        outputRootDir           => '.',
        outputNPerSubdir        => 1000,
        defaultDocType          => 'text',
        defaultDocSubType       => 'html',
        defaultEncoding         => 'iso-8859-1',
        includeOriginalDocument => 1,
        ainodumpWarnings        => 1,
        sourceEncodingFromMeta  => 0,
    );
    @$self{ keys %defaults } = values(%defaults);

    # Plain truth test on @_: defined(@array) was deprecated for years and
    # is a fatal compile-time error from Perl 5.22 on.
    if (@_) {
        my %args = @_;
        @$self{ keys %args } = values(%args);
    }

    return;
}
| 306 |  |  |  |  |  |  |  | 
#
# Convert one HTML document into an Alvis XML record.
#
#   $html      the HTML text
#   $meta_txt  meta information text, handed to Alvis::Document::Meta;
#              it must yield a 'url'
#   $opts      optional hash ref:
#                sourceEncoding          encoding of $html. When the key is
#                                        present it takes precedence over
#                                        the object's sourceEncoding
#                                        setting (a false value means
#                                        "no encoding given").
#                sourceEncodingFromMeta  if true (here or on the object), a
#                                        'detectedCharSet' value in the
#                                        meta information wins.
#
# Returns the assembled Alvis XML, or undef on failure (see errmsg()).
#
# NOTE(review): the previous header comment said only "in UTF-8" --
# presumably describing the returned record; confirm.
#
sub HTML {
    my ($self, $html, $meta_txt, $opts) = @_;

    $self->_set_err_state($ERR_OK);

    my $meta = Alvis::Document::Meta->new(text => $meta_txt);
    if (!defined($meta)) {
        $self->_set_err_state($ERR_META,
                              "Meta text:\"$meta_txt\".");
        return undef;
    }

    # Decide on the source encoding: per-call option, else object setting,
    # else leave it undefined.
    my $src_enc;
    if ($opts->{sourceEncoding}) {
        $src_enc = $opts->{sourceEncoding};
    }
    elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding}) {
        $src_enc = $self->{sourceEncoding};
    }

    if ($opts->{sourceEncodingFromMeta} || $self->{sourceEncodingFromMeta}) {
        my $detected = $meta->get('detectedCharSet');
        $src_enc = $detected if $detected;
    }

    my ($can_doc, $header) =
        $self->{canonicalConverter}->HTML($html,
                                          { title          => 1,
                                            baseURL        => 1,
                                            sourceEncoding => $src_enc });
    if (!defined($can_doc)) {
        $self->_set_err_state($ERR_CANDOC_CONV,
                              $self->{canonicalConverter}->errmsg());
        return undef;
    }

    # The meta information wins over what was found in the HTML header.
    if (!defined($meta->get('title'))) {
        $meta->set('title', $header->{title});
    }

    if (!defined($meta->get('url'))) {
        $self->_set_err_state($ERR_NO_URL);
        return undef;
    }

    if (!defined($meta->get('baseURL'))) {
        if (defined($header->{baseURL})) {
            $meta->set('baseURL', $header->{baseURL});
        }
        else {
            # Derive the base from the document URL.
            my $base_URL = $meta->get('url');
            if ($base_URL =~ m{^[A-Za-z][A-Za-z0-9+.\-]*://[^/]+$}) {
                # Scheme and authority only ("http://host"): there is no
                # path segment to drop. Stripping after the last slash
                # here would eat the host and leave just "http://".
                $base_URL .= '/';
            }
            else {
                # Drop the last path segment, keeping the trailing slash.
                $base_URL =~ s{/[^/]+?$}{/};
            }
            $meta->set('baseURL', $base_URL);
        }
    }

    my $links = Alvis::Document::Links->new();
    if (!defined($links)) {
        $self->_set_err_state($ERR_LINKS);
        return undef;
    }

    for my $link (@{$header->{links}}) {
        my ($url, $text, $type);

        $url  = $link->{url}  if exists($link->{url});
        $text = $link->{text} if exists($link->{text});

        # Normalise the link type; unrecognised types stay undefined.
        if (exists($link->{type})) {
            if ($link->{type} =~ /^\s*a\s*$/i) {
                $type = 'a';
            }
            elsif ($link->{type} =~ /^\s*i?frame\s*$/i) {
                $type = 'frame';
            }
            elsif ($link->{type} =~ /^\s*img\s*$/i) {
                $type = 'img';
            }
        }

        if (!$links->add($url, $text, $type)) {
            $self->_set_err_state($ERR_LINK_ADD,
                                  $links->errmsg());
            return undef;
        }
    }

    my $alvisXML =
        $self->{documentAssembler}->assemble({ canDoc   => $can_doc,
                                               links    => $links,
                                               meta     => $meta,
                                               origText => $html });
    if (!defined($alvisXML)) {
        $self->_set_err_state($ERR_ASSEMBLE,
                              $self->{documentAssembler}->errmsg());
        return undef;
    }

    return $alvisXML;
}
| 443 |  |  |  |  |  |  |  | 
# Convert a news XML document into Alvis XML records, one per article.
#
#   $newsXML   the news XML text, parsed with _parse_newsXML()
#   $meta_txt  meta information text, handed to Alvis::Document::Meta;
#              it must yield a 'url'
#   $orig_txt  text passed to the assembler as the original document
#
# Returns a reference to the list of assembled records, or undef on
# failure (see errmsg()). Articles without text are skipped; the error
# text still mentions them.
sub newsXML {
    my ($self, $newsXML, $meta_txt, $orig_txt) = @_;

    $self->_set_err_state($ERR_OK);

    # Validate the meta text up front so that bad meta information is
    # reported even when the XML holds no usable article.
    if (!defined(Alvis::Document::Meta->new(text => $meta_txt))) {
        $self->_set_err_state($ERR_META,
                              "Meta text:\"$meta_txt\".");
        return undef;
    }

    my $articles = $self->_parse_newsXML($newsXML);
    if (!defined($articles)) {
        $self->_set_err_state($ERR_NEWS_XML_PARSE);
        return undef;
    }

    my @alvisXMLs = ();

    for my $article (@$articles) {
        my ($text, $iso_date, $title, $links) = @$article;

        if (!defined($text)) {
            $self->_set_err_state($ERR_NO_NEWS_XML_TEXT,
                                  "News XML text:\"$newsXML\".");
            # OK, ignore
            next;
        }
        # NOTE(review): concatenating empty strings only stringifies
        # $text; it looks as if wrapper markup may once have been added
        # here -- confirm.
        $text = '' . $text . '';

        # Each article gets its own meta object. A shared one would carry
        # the title and date set for one article over to later articles
        # that lack them.
        my $meta = Alvis::Document::Meta->new(text => $meta_txt);
        if (!defined($meta)) {
            $self->_set_err_state($ERR_META,
                                  "Meta text:\"$meta_txt\".");
            return undef;
        }

        # TODO: check that the ISO date actually is in ISO format.
        if (defined($iso_date)) {
            $meta->set('dc:date', $iso_date);
        }

        my ($can_doc, $header) =
            $self->{canonicalConverter}->HTML($text,
                                              { sourceEncoding => 'utf8' });
        if (!defined($can_doc)) {
            $self->_set_err_state($ERR_CANDOC_CONV,
                                  $self->{canonicalConverter}->errmsg());
            return undef;
        }

        if (defined($title)) {
            $meta->set('title', $title);
        }

        if (!defined($meta->get('url'))) {
            $self->_set_err_state($ERR_NO_URL);
            return undef;
        }

        if (!defined($meta->get('baseURL'))) {
            my $base_URL = $meta->get('url');
            if ($base_URL =~ m{^[A-Za-z][A-Za-z0-9+.\-]*://[^/]+$}) {
                # Scheme and authority only ("http://host"): there is no
                # path segment to drop. Stripping after the last slash
                # here would eat the host and leave just "http://".
                $base_URL .= '/';
            }
            else {
                # Drop the last path segment, keeping the trailing slash.
                $base_URL =~ s{/[^/]+?$}{/};
            }
            $meta->set('baseURL', $base_URL);
        }

        my $alvisXML =
            $self->{documentAssembler}->assemble({ canDoc   => $can_doc,
                                                   meta     => $meta,
                                                   links    => $links,
                                                   origText => $orig_txt });
        if (!defined($alvisXML)) {
            $self->_set_err_state($ERR_ASSEMBLE,
                                  $self->{documentAssembler}->errmsg());
            return undef;
        }
        push(@alvisXMLs, $alvisXML);
    }

    return \@alvisXMLs;
}
| 533 |  |  |  |  |  |  |  | 
# Process an ainodump file, handing each record in it to
# _process_ainodump_doc() via the ainodump converter.
#
#   $f  path of the dump file (read as raw bytes)
#
# Returns 1 on success, 0 on failure (see errmsg()).
#
# NOTE(review): $self->{ainodumpConverter} is not created in the part of
# new() visible here -- confirm it is set before this method is called.
sub ainodump {
    my ($self, $f) = @_;

    # No meta needed -- one per record in the dump
    #
    # A lexical handle is closed automatically when it goes out of scope,
    # so it cannot stay open on an error path the way a bareword handle
    # would.
    my $fh;
    if (!open($fh, "<:raw", $f)) {
        $self->_set_err_state($ERR_OPEN_AINODUMP,
                              "File: \"$f\"");
        return 0;
    }

    # Pass the glob itself (not a reference), as the converter has always
    # been given.
    my $ok = $self->{ainodumpConverter}
                  ->process_dump(*$fh,
                                 [\&_process_ainodump_doc, $self]);
    close($fh);

    if (!$ok) {
        $self->_set_err_state($ERR_AINODUMP_PROCESS,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}
| 559 |  |  |  |  |  |  |  | 
#
# Extract records from a Wikipedia XML dump file.
#
# Arguments, in order:
#
# f:  path of the dump file (opened with the :utf8 layer)
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
#               will be called like this:
#          _output_wikipedia_article($arg1,$arg2,...,
#                                    $title,$output_format,
#                                    $record_txt,$is_redir)
#
#  where $output_format is a global defined in Alvis::Wikipedia::XMLDump
#  as $OUTPUT_*
#
# opts:  a hash of options with these possible fields:
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records
#                             ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...]     OPTIONAL
#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#
# Returns 1 on success, 0 on failure (see errmsg()).
#
sub wikipedia {
    my ($self, $f, $output_cb, $opts, $progress_cb) = @_;

    # A lexical handle is closed automatically when it goes out of scope,
    # so it cannot stay open on an error path the way a bareword handle
    # would.
    my $fh;
    if (!open($fh, "<:utf8", $f)) {
        $self->_set_err_state($ERR_OPEN_WIKIPEDIA,
                              "File: \"$f\"");
        return 0;
    }

    my $ok = $self->{wikipediaConverter}->extract_records($fh,
                                                          $output_cb,
                                                          $opts,
                                                          $progress_cb);
    close($fh);

    if (!$ok) {
        $self->_set_err_state($ERR_WIKIPEDIA_CONV,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}
| 624 |  |  |  |  |  |  |  | 
# Store a single setting on the object: $obj->set($param, $value).
# No validation is done; any key is accepted.
sub set {
    my ($self, $param, $value) = @_;

    $self->{$param} = $value;
}
| 633 |  |  |  |  |  |  |  | 
# Reads the HTML file $f and returns its contents as one string.
#
#   $f         path of the HTML file
#   $meta_txt  meta information text, parsed with Alvis::Document::Meta
#
# The source encoding is taken from $self->{sourceEncoding}; when
# $self->{sourceEncodingFromMeta} is set, a non-empty 'detectedCharSet'
# meta value overrides it. If the resulting encoding is some spelling of
# UTF-8 the file is read through the :utf8 layer, otherwise it is read
# without any layer.
#
# Returns the text, or undef after recording the error with
# _set_err_state().
sub read_HTML
{
    my $self=shift;
    my $f=shift;
    my $meta_txt=shift;

    my $meta=Alvis::Document::Meta->new(text=>$meta_txt);
    if (!defined($meta))
    {
        $self->_set_err_state($ERR_META,
                              "Meta text:\"$meta_txt\".");
        return undef;
    }

    my $src_enc;
    if ($self->{sourceEncoding})
    {
        $src_enc=$self->{sourceEncoding};
    }
    if ($self->{sourceEncodingFromMeta})
    {
        my $detected=$meta->get('detectedCharSet');
        if ($detected)
        {
            $src_enc=$detected;
        }
    }

    # Only the open mode differs between the UTF-8 and the other case, so
    # choose it once instead of duplicating the whole read loop.
    my $mode="<";
    if (defined($src_enc) && $src_enc=~/^\s*utf\s*\-?\s*8\s*$/i)
    {
        $mode="<:utf8";
    }

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as part of the mode.
    my $html_fh;
    if (!defined(open($html_fh,$mode,$f)))
    {
        $self->_set_err_state($ERR_HTML_F,
                              "File: \"$f\".");
        return undef;
    }

    my $html_txt="";
    while (my $line=<$html_fh>)
    {
        $html_txt.=$line;
    }
    close($html_fh);

    return $html_txt;
}
| 696 |  |  |  |  |  |  |  | 
# Reads the meta information file $f and returns its text.
#
# Handling depends on $self->{metaEncoding}:
#   * some spelling of UTF-8: the file is read through the :utf8 layer and
#     returned as read;
#   * any other defined value: the file is read without a layer and
#     converted in place to 'utf-8' with Encode::from_to();
#   * undefined: the file is read without a layer and passed to the
#     encoding wizard's try_to_convert_to_utf8().
#
# Returns the text, or undef after recording the error with
# _set_err_state().
sub read_meta
{
    my $self=shift;
    my $f=shift;

    my $enc=$self->{metaEncoding};
    my $is_utf8=0;
    if (defined($enc) && $enc=~/^\s*utf\s*\-?\s*8\s*$/i)
    {
        $is_utf8=1;
    }

    my $meta_fh;
    if (!defined(open($meta_fh,($is_utf8 ? "<:utf8" : "<"),$f)))
    {
        $self->_set_err_state($ERR_META_F,
                              "File: \"$f\".");
        return undef;
    }

    my $meta_txt="";
    while (my $line=<$meta_fh>)
    {
        $meta_txt.=$line;
    }
    close($meta_fh);

    if ($is_utf8)
    {
        return $meta_txt;
    }

    if (defined($enc)) # known, non-UTF8
    {
        # The eval catches fatal failures such as an unknown encoding name.
        eval
        {
            Encode::from_to($meta_txt,
                            $enc,'utf-8',Encode::FB_WARN);
        };
        if ($@)
        {
            $self->_set_err_state($ERR_ENCODING_CONV,
                                  "$@. Supposed source encoding of \"$f\":" .
                                  "\"$self->{metaEncoding}\".");
            return undef;
        }
    }
    else # encoding unknown
    {
        # The converted text must go into the variable that is returned;
        # a second, inner declaration here would make the sub return "".
        $meta_txt=$self->{encodingWizard}->try_to_convert_to_utf8($meta_txt,
                                                                  'text',
                                                                  'plain');
        if (!defined($meta_txt))
        {
            $self->_set_err_state($ERR_UTF8_CONV,
                                  $self->{encodingWizard}->errmsg());
            return undef;
        }
    }

    return $meta_txt;
}
| 776 |  |  |  |  |  |  |  | 
# Reads the news XML file $f through the :utf8 layer and returns its
# contents as one string. Returns undef after recording the error with
# _set_err_state() if the file cannot be opened.
sub read_news_XML
{
    my $self=shift;
    my $f=shift;

    my $xml_fh;
    if (!defined(open($xml_fh,"<:utf8",$f)))
    {
        $self->_set_err_state($ERR_NEWS_XML_F,
                              "File: \"$f\".");
        return undef;
    }

    my $txt="";
    while (my $line=<$xml_fh>)
    {
        $txt.=$line;
    }
    close($xml_fh);

    return $txt;
}
| 797 |  |  |  |  |  |  |  | 
# Resets the counter of written output files ($self->{outputN}) to zero.
sub init_output
{
    my ($self)=@_;

    return $self->{outputN}=0;
}
| 804 |  |  |  |  |  |  |  | 
# Writes each Alvis record of @$alvis_records to its own file.
#
#   $alvis_records  reference to an array of record texts
#   $base_name      base of the output file name when
#                   $self->{outputAtSameLocation} is set
#
# With outputAtSameLocation the file is
#   <base_name>.<articleN>.<alvisSuffix>
# otherwise it is
#   <outputRootDir>/<int(outputN/outputNPerSubdir)>/<outputN>.<alvisSuffix>
# NOTE(review): assumes $self->{outputNPerSubdir} is a positive number set
# elsewhere -- confirm against the constructor.
#
# $self->{recordN} holds the index of the record being handled, so the
# error for an undefined record names the right one. The running count of
# written files is printed to STDOUT followed by "\r".
#
# Returns 1 on success. On failure the error is recorded with
# _set_err_state() and 0 is returned.
sub output_Alvis
{
    my $self=shift;
    my $alvis_records=shift;
    my $base_name=shift;

    $self->{recordN}=0;
    for my $alvis_record (@$alvis_records)
    {
        if (!defined($alvis_record))
        {
            $self->_set_err_state($ERR_DOC_ALVIS_CONV,
                                  "Base name:\"$base_name\"," .
                                  "# of record: $self->{recordN}");
            return 0;
        }

        if (!defined($self->{alvisSuffix}))
        {
            $self->_set_err_state($ERR_ALVIS_SUFFIX);
            return 0;
        }

        my $out_f;
        if ($self->{outputAtSameLocation})
        {
            $out_f=$base_name . "." . $self->{articleN} . '.' .
                $self->{alvisSuffix};
            $self->{articleN}++;
        }
        else
        {
            if (!defined($self->{outputRootDir}))
            {
                $self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
                return 0;
            }
            my $dir=$self->{outputRootDir} . '/' .
                int($self->{outputN} / $self->{outputNPerSubdir});
            # Create the subdirectory whenever it is missing, not only when
            # outputN is an exact multiple of outputNPerSubdir. A failed
            # mkdir surfaces as a write error just below.
            if (!-d $dir)
            {
                mkdir($dir);
            }
            $out_f=$dir . '/' . $self->{outputN} . '.' .
                $self->{alvisSuffix};
        }

        if (!$self->_output_set_of_records($alvis_record,$out_f))
        {
            $self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
            return 0;
        }

        $self->{outputN}++;
        print "$self->{outputN}\r";

        $self->{recordN}++;
    }

    return 1;
}
| 870 |  |  |  |  |  |  |  | 
| 871 |  |  |  |  |  |  | ############################################################################ | 
| 872 |  |  |  |  |  |  | # | 
| 873 |  |  |  |  |  |  | #          Private methods | 
| 874 |  |  |  |  |  |  | # | 
| 875 |  |  |  |  |  |  | ############################################################################ | 
| 876 |  |  |  |  |  |  |  | 
# Handles one document of an Aino dump.
#
#   $text    the document text
#   $header  hash reference; the keys id, url and time are used
#
# The document type is guessed with the document type wizard. Only
# text/html documents are converted: a meta text is built from the url and
# time header fields, the text is converted with HTML() and the result is
# written with output_Alvis() using the header id as base name. Documents
# without an id and documents of other types are skipped, with a warning
# when $self->{ainodumpWarnings} is set.
#
# Returns 1 when the document was converted or skipped. On failure the
# error is recorded with _set_err_state() and 0 is returned.
sub _process_ainodump_doc
{
    my $self=shift;
    my $text=shift;
    my $header=shift;

    my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
    if (!(defined($type) && defined($sub_type)))
    {
        $self->_set_err_state($ERR_TYPE_GUESS,
                              $self->{docTypeWizard}->errmsg());
        return 0;
    }

    if ($type eq 'text' && $sub_type eq 'html')
    {
        # Stays undefined when the header has neither url nor time.
        my $meta_txt;
        if (defined($header->{url}))
        {
            $meta_txt.="url\t$header->{url}\n";
        }
        if (defined($header->{time}))
        {
            $meta_txt.="date\t$header->{time}\n";
        }

        my $base_name;
        if (defined($header->{id}))
        {
            $base_name=$header->{id};
        }
        else
        {
            if ($self->{ainodumpWarnings})
            {
                # Missing fields are shown as empty strings instead of
                # triggering "uninitialized value" warnings.
                my $url=defined($header->{url}) ? $header->{url} : '';
                my $time=defined($header->{time}) ? $header->{time} : '';
                warn "Ainodump document had no ID. URL,time:" .
                    "($url,$time)\n";
            }
            return 1;
        }

        # The configured source encoding is switched off for this one
        # conversion only. local() puts the setting back when the block is
        # left, even if HTML() dies.
        my $alvisXML=do
        {
            local $self->{sourceEncoding}=undef;
            $self->HTML($text,$meta_txt);
        };
        if (!defined($alvisXML))
        {
            $self->_set_err_state($ERR_ALVIS_CONV);
            return 0;
        }

        if (!$self->output_Alvis([$alvisXML],$base_name))
        {
            $self->_set_err_state($ERR_OUTPUT_ALVIS,
                                  "Base name: \"$base_name\"");
            return 0;
        }
    }
    else
    {
        if ($self->{ainodumpWarnings})
        {
            my $id=defined($header->{id}) ? $header->{id} : '';
            warn "Ainodump document $id was not of a convertible " .
                "type: $type/$sub_type.\n";
        }
    }

    return 1;
}
| 945 |  |  |  |  |  |  |  | 
# Writes the record set text $set_of_records_txt to the file $path through
# the :utf8 layer, preceded by two "\n" strings and followed by one.
#
# NOTE(review): the three bare "\n" strings look like literals whose markup
# was lost (presumably an XML declaration and a wrapping element) -- confirm
# against the upstream module before relying on this output.
#
# Returns 1 on success. If the file cannot be opened, or a print or the
# final close fails, the error is recorded with _set_err_state() and 0 is
# returned.
sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    my $out_fh;
    if (!defined(open($out_fh,">:utf8",$path)))
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    my $ok=1;
    for my $chunk ("\n","\n",$set_of_records_txt,"\n")
    {
        print {$out_fh} $chunk or $ok=0;
    }
    # Buffered write errors only show up at close time, so its result
    # counts as well.
    close($out_fh) or $ok=0;

    if (!$ok)
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    return 1;
}
| 966 |  |  |  |  |  |  |  | 
# Returns the text of the HTML version of the document $base_name, i.e. of
# the file "<base_name>.<html_suffix>", read with _read_HTML().
#
# $file_versions is a hash of hashes keyed by base name and then by suffix.
# If no suffix is given or no such version is listed, or if reading fails,
# the error is recorded with _set_err_state() and undef is returned.
sub _get_HTML_txt
{
    my ($self,$file_versions,$base_name,$html_suffix)=@_;

    # No HTML file known for this document.
    unless (defined($html_suffix) &&
            exists($file_versions->{$base_name}{$html_suffix}))
    {
        $self->_set_err_state($ERR_NO_HTML_F,"Base name:\"$base_name\"");
        return undef;
    }

    my $html_path="$base_name.$html_suffix";
    my $contents=$self->_read_HTML($html_path);
    return $contents if defined($contents);

    $self->_set_err_state($ERR_READ_HTML,"File:\"$html_path\"");
    return undef;
}
| 994 |  |  |  |  |  |  |  | 
| 995 |  |  |  |  |  |  |  | 
# Reads the file $f without any I/O layer and returns its contents as one
# string. Returns undef after recording the error with _set_err_state() if
# the file cannot be opened.
sub _read_HTML
{
    my $self=shift;
    my $f=shift;

    # Three-argument open: the file name can never be taken as part of the
    # mode.
    my $html_fh;
    if (!defined(open($html_fh,"<",$f)))
    {
        $self->_set_err_state($ERR_HTML_F,
                              "File: \"$f\".");
        return undef;
    }

    my $txt="";
    while (my $line=<$html_fh>)
    {
        $txt.=$line;
    }
    close($html_fh);

    return $txt;
}
| 1016 |  |  |  |  |  |  |  | 
# Parses a news XML text into a list of articles.
#
#   $newsXML  the XML text; an undefined or whitespace-only text yields an
#             empty list
#
# For every 'article' child of the document element one entry
#   [text, ISO date, title, links]
# is produced from its 'content', 'iso-date', 'title' and 'links' children,
# where links is an Alvis::Document::Links object filled from the 'link'
# elements (attribute 'type', children 'anchorText' and 'location'). When
# an element occurs several times the last occurrence wins. A field whose
# element is missing is undef.
#
# Returns a reference to the array of entries, or undef after recording
# the error with _set_err_state(). A link that cannot be added is recorded
# as an error but does not abort the parse.
sub _parse_newsXML
{
    my $self=shift;
    my $newsXML=shift;

    if (!defined($newsXML) || $newsXML=~/^\s*$/)
    {
        return [];
    }

    my $doc;
    eval
    {
        $doc=$self->{XMLParser}->parse_string($newsXML);
    };
    if ($@)
    {
        $self->_set_err_state($ERR_XML_PARSE,"$@");
        return undef;
    }

    my $root=$doc->documentElement();

    my @articles=();
    for my $article ($root->getChildrenByTagName('article'))
    {
        # Declared per article so that a missing element cannot inherit the
        # value of the previous article.
        my ($text,$iso_date,$title);

        my $links=Alvis::Document::Links->new();
        if (!defined($links))
        {
            $self->_set_err_state($ERR_LINKS);
            return undef;
        }

        for my $t ($article->getChildrenByTagName('title'))
        {
            $title=$t->textContent();
        }
        for my $i_d ($article->getChildrenByTagName('iso-date'))
        {
            $iso_date=$i_d->textContent();
        }
        for my $c ($article->getChildrenByTagName('content'))
        {
            $text=$c->textContent();
        }
        for my $ls ($article->getChildrenByTagName('links'))
        {
            for my $link ($ls->getChildrenByTagName('link'))
            {
                my ($l_text,$l_url);
                my $l_type=$link->getAttribute('type');
                for my $l_t ($link->getChildrenByTagName('anchorText'))
                {
                    $l_text=$l_t->textContent();
                }
                for my $l_u ($link->getChildrenByTagName('location'))
                {
                    $l_url=$l_u->textContent();
                }

                if (!$links->add($l_url,$l_text,$l_type))
                {
                    my $shown_title=defined($title) ? $title : '';
                    $self->_set_err_state($ERR_LINK_ADD,
                                          "Title:\"$shown_title\", " .
                                          $links->errmsg());
                    next;
                }
            }
        }
        push(@articles,[$text,$iso_date,$title,$links]);
    }

    return \@articles;
}
| 1093 |  |  |  |  |  |  |  | 
| 1094 |  |  |  |  |  |  |  | 
| 1095 |  |  |  |  |  |  | 1; | 
| 1096 |  |  |  |  |  |  |  | 
| 1097 |  |  |  |  |  |  | __END__ |