| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::BookBot; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 3 |  |  | 3 |  | 1067 | use 5.008; | 
|  | 3 |  |  |  |  | 9 |  | 
|  | 3 |  |  |  |  | 123 |  | 
| 4 | 3 |  |  | 3 |  | 15 | use strict; | 
|  | 3 |  |  |  |  | 5 |  | 
|  | 3 |  |  |  |  | 80 |  | 
| 5 | 3 |  |  | 3 |  | 24 | use warnings; | 
|  | 3 |  |  |  |  | 5 |  | 
|  | 3 |  |  |  |  | 79 |  | 
| 6 | 3 |  |  | 3 |  | 11 | no warnings qw(uninitialized); | 
|  | 3 |  |  |  |  | 4 |  | 
|  | 3 |  |  |  |  | 106 |  | 
| 7 | 3 |  |  | 3 |  | 13 | use base qw(Exporter); | 
|  | 3 |  |  |  |  | 18 |  | 
|  | 3 |  |  |  |  | 260 |  | 
| 8 | 3 |  |  | 3 |  | 14 | use vars qw($VERSION @EXPORT @EXPORT_OK); | 
|  | 3 |  |  |  |  | 5 |  | 
|  | 3 |  |  |  |  | 269 |  | 
| 9 |  |  |  |  |  |  | $VERSION = "0.12"; | 
| 10 |  |  |  |  |  |  | @EXPORT = qw(); | 
| 11 |  |  |  |  |  |  | @EXPORT_OK = @EXPORT; | 
| 12 |  |  |  |  |  |  |  | 
| 13 | 3 |  |  | 3 |  | 3000 | use Encode; | 
|  | 3 |  |  |  |  | 39672 |  | 
|  | 3 |  |  |  |  | 276 |  | 
| 14 | 3 |  |  | 3 |  | 28 | use File::Basename; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 320 |  | 
| 15 | 3 |  |  | 3 |  | 18 | use File::Spec::Functions; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 287 |  | 
| 16 | 3 |  |  | 3 |  | 16 | use File::Path; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 142 |  | 
| 17 | 3 |  |  | 3 |  | 15 | use Fcntl; | 
|  | 3 |  |  |  |  | 7 |  | 
|  | 3 |  |  |  |  | 893 |  | 
| 18 | 3 |  |  | 3 |  | 1594 | use WWW::BookBot::FakeCookies; | 
|  | 3 |  |  |  |  | 9 |  | 
|  | 3 |  |  |  |  | 132 |  | 
| 19 | 3 |  |  | 3 |  | 3222 | use LWP::UserAgent; | 
|  | 3 |  |  |  |  | 144437 |  | 
|  | 3 |  |  |  |  | 121 |  | 
| 20 | 3 |  |  | 3 |  | 14022 | use HTTP::Request::Common; | 
|  | 3 |  |  |  |  | 8062 |  | 
|  | 3 |  |  |  |  | 413 |  | 
| 21 | 3 |  |  | 3 |  | 37 | use HTTP::Response; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 185 |  | 
| 22 | 3 |  |  | 3 |  | 20 | use HTTP::Date; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 187 |  | 
| 23 | 3 |  |  | 3 |  | 18 | use URI; | 
|  | 3 |  |  |  |  | 223 |  | 
|  | 3 |  |  |  |  | 103 |  | 
| 24 | 3 |  |  | 3 |  | 19 | use Carp; | 
|  | 3 |  |  |  |  | 7 |  | 
|  | 3 |  |  |  |  | 614 |  | 
| 25 | 3 |  |  | 3 |  | 23 | use Data::Dumper; | 
|  | 3 |  |  |  |  | 7 |  | 
|  | 3 |  |  |  |  | 185 |  | 
| 26 | 3 |  |  | 3 |  | 4314 | use POSIX qw(ceil floor); | 
|  | 3 |  |  |  |  | 33267 |  | 
|  | 3 |  |  |  |  | 25 |  | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | our %entity2char;	#for html entities decoding | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 31 |  |  |  |  |  |  | # Create objects and initialize | 
| 32 |  |  |  |  |  |  | #	$bot->new(\%args)								=> $bot | 
| 33 |  |  |  |  |  |  | #	$class->default_settings						=> \%settings | 
| 34 |  |  |  |  |  |  | #	$bot->initialize								=> N/A | 
| 35 |  |  |  |  |  |  | #	$bot->work_dir($dir)							=> $work_dir | 
| 36 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 37 |  |  |  |  |  |  | sub new { | 
| 38 | 21 |  | 33 | 21 | 0 | 121 | my $class = ref($_[0]) || $_[0]; | 
| 39 | 21 | 50 |  |  |  | 83 | my $pargs = ref($_[1]) ? $_[1] : {} ; | 
| 40 | 21 |  |  |  |  | 104 | my $self = $class->default_settings; | 
| 41 | 21 |  |  |  |  | 55 | bless($self, $class); | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | # Set default work dir | 
| 44 | 21 |  |  |  |  | 65 | my $dirname=$self->{classname}; | 
| 45 | 21 |  |  |  |  | 103 | $dirname=~s/^WWW:://; | 
| 46 | 21 |  |  |  |  | 215 | $self->{work_dir}=catfile($ENV{HOME}, split(/::/,$dirname)); | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | # Set user defined args | 
| 49 | 21 |  |  |  |  | 77 | foreach (keys %$pargs) { | 
| 50 | 27 |  |  |  |  | 80 | $self->{$_} = $pargs->{$_}; | 
| 51 |  |  |  |  |  |  | } | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | # initialize and return | 
| 54 | 21 |  |  |  |  | 91 | $self->initialize; | 
| 55 | 21 |  |  |  |  | 115 | return $self; | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  | sub default_settings { | 
| 58 |  |  |  |  |  |  | { | 
| 59 | 21 |  |  | 21 | 0 | 797 | classname				=> shift,	#cureent classname | 
| 60 |  |  |  |  |  |  | book_get_num			=> 0,		#statistics of books, to be used in file title | 
| 61 |  |  |  |  |  |  | book_has_chapters		=> 1,		#0-only 1 chapter, 1-multiple chapters | 
| 62 |  |  |  |  |  |  | book_max_levels			=> 5,		#max levels of book - chapters - chapters - chapters .. | 
| 63 |  |  |  |  |  |  | book_max_chapters		=> 500,		#max chapters in 1 book | 
| 64 |  |  |  |  |  |  | catalog_max_pages		=> 500,		#max catalog pages | 
| 65 |  |  |  |  |  |  | get_agent_name			=> "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0; AIRF)", | 
| 66 |  |  |  |  |  |  | get_agent_proxy			=> "Default", | 
| 67 |  |  |  |  |  |  | #Default			Use default IE proxy | 
| 68 |  |  |  |  |  |  | #No					Don't use proxy | 
| 69 |  |  |  |  |  |  | #196.23.147.34:80	Use given proxy | 
| 70 |  |  |  |  |  |  | #Default;202.105.138.19:8080;202.110.220.14:80;... | 
| 71 |  |  |  |  |  |  | #					Use multiple proxy, one visit use one proxy in turn | 
| 72 |  |  |  |  |  |  | get_delay_second		=> 0,		#if get_delay_second>0 then delay get_delay_second+rand seconds | 
| 73 |  |  |  |  |  |  | get_delay_second_rand	=> 2, | 
| 74 |  |  |  |  |  |  | get_fail_showtype		=> '',		#''-show simplified info, 'Detail'-show detailed info | 
| 75 |  |  |  |  |  |  | get_file_directory		=> './saved',	#debug save and read file from this directory | 
| 76 |  |  |  |  |  |  | get_from_file			=> 0,		#0-normal operation, 1-get from file only | 
| 77 |  |  |  |  |  |  | get_language			=> "en",	#get headers: language | 
| 78 |  |  |  |  |  |  | get_max_retries			=> 5,		#max retries of 1 get | 
| 79 |  |  |  |  |  |  | get_save_file			=> 0,		#0-normal operation, 1-save to file for latter debug | 
| 80 |  |  |  |  |  |  | get_skip_zip			=> 1,		#skip fetch zip files | 
| 81 |  |  |  |  |  |  | get_skip_media			=> 1,		#skip fetch media files | 
| 82 |  |  |  |  |  |  | get_timeout				=> 40,		#get timeout | 
| 83 |  |  |  |  |  |  | get_trunk_size			=> 50000,	#define 1 trunk = xxxBytes for display | 
| 84 |  |  |  |  |  |  | get_trunk_fresh_size	=> 5000,	#if get size > xxxBytes, then refresh trunk display | 
| 85 |  |  |  |  |  |  | get_visited_url_num		=> 0,		#statistics of visted urls, to be used in get_from_file/get_save_file | 
| 86 |  |  |  |  |  |  | language_decode			=> "utf8",	#to read with encoding | 
| 87 |  |  |  |  |  |  | language_encode			=> "utf8",	#to save with encoding | 
| 88 |  |  |  |  |  |  | process_all				=> 0,		#process all pages of catalog | 
| 89 |  |  |  |  |  |  | result_no_crlf			=> 1,		#0-with crlf, 1-no crlf | 
| 90 |  |  |  |  |  |  | space_leading_remove 	=> 1,		#remove leading spaces | 
| 91 |  |  |  |  |  |  | space_leading_max		=> 20,		#max leading spaces | 
| 92 |  |  |  |  |  |  | space_inner_remove		=> 1,		#remove inner spaces | 
| 93 |  |  |  |  |  |  | space_inner_min_words	=> 5,		#minimal length of word with inner spaces | 
| 94 |  |  |  |  |  |  | text_paragraph_type		=> 'br',	#type of paragraph split methods | 
| 95 |  |  |  |  |  |  | # br		 one br as end of paragraph | 
| 96 |  |  |  |  |  |  | # brbr		 two br as end of paragraph | 
| 97 |  |  |  |  |  |  | # cr		 one cr as end of paragraph | 
| 98 |  |  |  |  |  |  | # crcr		 two cr as end of paragraph | 
| 99 |  |  |  |  |  |  | # crandspace one cr and followed with space as end of paragraph | 
| 100 |  |  |  |  |  |  | screen_limit_trunk		=> 25,		#max trunks to be displayed | 
| 101 |  |  |  |  |  |  | screen_limit_title		=> 14,		#max title to be displayed | 
| 102 |  |  |  |  |  |  | }; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  | sub initialize { | 
| 105 | 21 |  |  | 21 | 0 | 39 | my $self = shift; | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | # Initialize languages | 
| 108 | 21 | 50 |  |  |  | 180 | $self->{lang_encode}=find_encoding($self->{language_encode}) | 
| 109 |  |  |  |  |  |  | if $self->{language_encode} ne ''; | 
| 110 | 21 | 50 |  |  |  | 10903 | $self->{lang_decode}=find_encoding($self->{language_decode}) | 
| 111 |  |  |  |  |  |  | if $self->{language_decode} ne ''; | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | # Initialize messages | 
| 114 | 21 |  |  |  |  | 327 | $self->trandict_init; | 
| 115 | 21 |  |  |  |  | 97 | $self->msg_init; | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | # Create work directory | 
| 118 | 21 |  |  |  |  | 103 | $self->work_dir( $self->{work_dir} ); | 
| 119 | 21 |  |  |  |  | 29 | eval {mkpath($self->{work_dir})}; | 
|  | 21 |  |  |  |  | 1542 |  | 
| 120 | 21 | 50 |  |  |  | 67 | $self->fatal_error("FailMkDir", dir=>$self->{work_dir}, errmsg=>$@) if $@; | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | # Check debug directory | 
| 123 | 21 |  |  |  |  | 105 | $self->{get_file_directory_save}=catfile($self->{get_file_directory}, $self->get_alias()); | 
| 124 | 21 |  |  |  |  | 70 | $self->{get_file_directory_read}=$self->{get_file_directory_save}; | 
| 125 | 21 |  |  |  |  | 97 | $self->{get_file_directory_read}=~s,\\,/,sg; | 
| 126 | 21 |  |  |  |  | 69 | $self->{get_file_directory_read}=~s,/+$,,sg; | 
| 127 | 21 | 50 | 33 |  |  | 147 | eval {mkpath($self->{get_file_directory_save})} | 
|  | 0 |  |  |  |  | 0 |  | 
| 128 |  |  |  |  |  |  | if $self->{get_from_file} or $self->{get_save_file}; | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | # Initialize patterns | 
| 131 | 21 |  |  |  |  | 53 | $self->{patterns} = {}; | 
| 132 | 21 |  |  |  |  | 141 | foreach ($self->getpattern_lists) { | 
| 133 | 546 |  |  |  |  | 941 | my $sub="getpattern_".$_; | 
| 134 | 546 |  |  |  |  | 753 | my $sub_data=$sub."_data"; | 
| 135 | 546 | 100 |  |  |  | 3526 | $self->{patterns}->{$_} = $self->can($sub) ? | 
| 136 |  |  |  |  |  |  | $self->$sub : $self->parse_patterns($self->$sub_data); | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | # Content Type Initialize | 
| 140 | 21 |  |  |  |  | 127 | $self->contenttype_init; | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | # Initialize LWP user agents | 
| 143 | 21 |  |  |  |  | 265 | $self->agent_init; | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  | # Initialize DB | 
| 146 | 21 |  |  |  |  | 94 | $self->db_init; | 
| 147 | 21 |  |  |  |  | 97 | $self->db_load; | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | # Try to login | 
| 150 | 21 |  |  |  |  | 78 | $self->go_login; | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  | sub work_dir { | 
| 153 | 21 |  |  | 21 | 0 | 40 | my ($self, $work_dir) = @_; | 
| 154 | 21 | 50 |  |  |  | 69 | return $self->{work_dir} if $work_dir eq ''; | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | # Reset default work directory, log file, db file | 
| 157 | 21 |  |  |  |  | 41 | $self->{work_dir}=$work_dir; | 
| 158 | 21 |  |  |  |  | 142 | $self->{file_log}=catfile($work_dir, "0000log.txt"); | 
| 159 | 21 |  |  |  |  | 110 | $self->{file_DB}=catfile($work_dir, "0000db.txt"); | 
| 160 | 21 |  |  |  |  | 117 | return $work_dir; | 
| 161 |  |  |  |  |  |  | } | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 164 |  |  |  |  |  |  | # Debug functions | 
| 165 |  |  |  |  |  |  | #	$bot->dump_class									=> N/A | 
| 166 |  |  |  |  |  |  | #	$bot->dump_var(@vars)								=> N/A | 
| 167 |  |  |  |  |  |  | #	$bot->test_pattern($pattern_name, $content)			=> 1-match 0-no match | 
| 168 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 169 |  |  |  |  |  |  | sub dump_class { | 
| 170 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 171 | 0 |  |  |  |  | 0 | local $Data::Dumper::Maxdepth=1;	#only show 1 level | 
| 172 | 0 |  |  |  |  | 0 | local $Data::Dumper::Sortkeys=1;	#sort keys | 
| 173 | 0 |  |  |  |  | 0 | local $Data::Dumper::Quotekeys=0;	#no quote | 
| 174 | 0 |  |  |  |  | 0 | local $Data::Dumper::Varname=$self->{classname}; | 
| 175 | 0 |  |  |  |  | 0 | print Dumper($self); | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  | sub dump_var { | 
| 178 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 179 | 0 |  |  |  |  | 0 | foreach (@_) { | 
| 180 | 0 |  |  |  |  | 0 | local $Data::Dumper::Sortkeys=1;	#sort keys | 
| 181 | 0 |  |  |  |  | 0 | local $Data::Dumper::Quotekeys=0;	#no quote | 
| 182 | 0 |  |  |  |  | 0 | local $Data::Dumper::Varname=$self->{classname}."->{$_}"; | 
| 183 | 0 |  |  |  |  | 0 | print Dumper($self->{$_}); | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  | } | 
| 186 |  |  |  |  |  |  | sub test_pattern { | 
| 187 | 0 |  |  | 0 | 0 | 0 | my ($self, $pattern_name) = (shift, shift); | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | # get pattern, verify and print its encoded content | 
| 190 | 0 |  |  |  |  | 0 | my $pattern = $self->{patterns}->{$pattern_name}; | 
| 191 | 0 | 0 |  |  |  | 0 | croak "Invalid pattern name: $pattern" if not(defined($pattern)); | 
| 192 | 0 |  |  |  |  | 0 | printf "[Pattern $pattern_name]=\"%s\"\n", $self->en_code($pattern); | 
| 193 |  |  |  |  |  |  |  | 
| 194 | 0 |  |  |  |  | 0 | my $result=1; | 
| 195 | 0 | 0 |  |  |  | 0 | if( $_[0] ne '' ) { | 
| 196 |  |  |  |  |  |  | # test content if specified | 
| 197 | 0 |  |  |  |  | 0 | my $str=$self->de_code($_[0]); | 
| 198 | 0 |  |  |  |  | 0 | my $result=($str=~/$pattern/); | 
| 199 | 0 | 0 |  |  |  | 0 | printf "  Test with \"%s\": %s\n", | 
| 200 |  |  |  |  |  |  | $_[0], | 
| 201 |  |  |  |  |  |  | $result ? 'match' : 'no match'; | 
| 202 |  |  |  |  |  |  | } | 
| 203 | 0 |  |  |  |  | 0 | return $result; | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 207 |  |  |  |  |  |  | # Pattern utility functions | 
| 208 |  |  |  |  |  |  | #	$bot->parse_patterns("a\n'b\nc\n")				=> [aA]|\'[bB]|[cC] | 
| 209 |  |  |  |  |  |  | #		add \ before /"'`$& automatically | 
| 210 |  |  |  |  |  |  | #		encoding -> decoding | 
| 211 |  |  |  |  |  |  | #		auto-case insensitive conversion except 2: | 
| 212 |  |  |  |  |  |  | #			first line is (case) | 
| 213 |  |  |  |  |  |  | #			[...] | 
| 214 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 215 |  |  |  |  |  |  | sub parse_patterns { | 
| 216 | 540 |  |  | 540 | 0 | 861 | my ($self, $str) = @_; | 
| 217 | 540 | 50 |  |  |  | 1180 | $str='' if not(defined($str)); | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | # simplify pattern construction by add \ before /"'`$& automatically | 
| 220 | 540 |  |  |  |  | 4541 | $str=~s,(?=\/|\"|\'|\`|\$|\&),\\,g; | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | # multiplie lines -> | | 
| 223 | 540 |  |  |  |  | 690 | my $pattern = ""; | 
| 224 | 540 |  |  |  |  | 4718 | foreach ( split /\r\n|\r|\n/, $str ) { | 
| 225 | 4009 | 100 |  |  |  | 8936 | $pattern .= $_.'|' if $_ ne ''; | 
| 226 |  |  |  |  |  |  | } | 
| 227 | 540 |  |  |  |  | 2104 | $pattern=~s/\|$//; | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | # parse \n | 
| 230 | 540 |  |  |  |  | 740 | $pattern=~s/\\n/\n/sg; | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | # decode | 
| 233 | 540 |  |  |  |  | 1024 | $str=$self->de_code($pattern); | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | # case sensitive and insensitive | 
| 236 | 540 | 100 |  |  |  | 2997 | if( $str=~/^\(case\)\|/s ) { | 
| 237 |  |  |  |  |  |  | # case sensitive | 
| 238 | 10 |  |  |  |  | 69 | $str=~s/^\(case\)\|//s; | 
| 239 | 10 |  |  |  |  | 17 | $pattern=$str; | 
| 240 |  |  |  |  |  |  | } else { | 
| 241 |  |  |  |  |  |  | # change to case insensitive form | 
| 242 | 530 |  |  |  |  | 600 | $pattern=""; | 
| 243 | 530 |  |  |  |  | 638 | my $meet_left=0; | 
| 244 | 530 |  |  |  |  | 6683 | foreach (split /(\\.|[\[\]])/, $str) { | 
| 245 | 4806 | 100 |  |  |  | 8290 | $meet_left=1 if $_ eq '['; | 
| 246 | 4806 | 100 |  |  |  | 7513 | $meet_left=0 if $_ eq ']'; | 
| 247 | 4806 | 100 | 100 |  |  | 19049 | s,([a-zA-Z]),'['.lc($1).uc($1).']',eg if $meet_left==0 and not(/^\\/); | 
|  | 11598 |  |  |  |  | 36722 |  | 
| 248 |  |  |  |  |  |  | # skip [...] and \d | 
| 249 | 4806 |  |  |  |  | 8358 | $pattern.=$_; | 
| 250 |  |  |  |  |  |  | } | 
| 251 |  |  |  |  |  |  | } | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | # return | 
| 254 | 540 |  |  |  |  | 2733 | return $pattern; | 
| 255 |  |  |  |  |  |  | } | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 258 |  |  |  |  |  |  | # Message functions | 
| 259 |  |  |  |  |  |  | #	$bot->trandict_init								=> $bot->{translate_dict} | 
| 260 |  |  |  |  |  |  | #	$bot->msg_init									=> $bot->{messages} | 
| 261 |  |  |  |  |  |  | #	$bot->msg_format($msgid, \%args)				=> $msgstr | 
| 262 |  |  |  |  |  |  | #	$bot->fatal_error($msgid, \%args)				=> N/A | 
| 263 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 264 |  |  |  |  |  |  | sub trandict_init { | 
| 265 | 14 |  |  | 14 | 0 | 154 | shift->{translate_dict} = { | 
| 266 |  |  |  |  |  |  | 'log'		=> "log", | 
| 267 |  |  |  |  |  |  | 'result'	=> "result", | 
| 268 |  |  |  |  |  |  | 'DB'		=> "DB", | 
| 269 |  |  |  |  |  |  | 'debug'		=> 'debug', | 
| 270 |  |  |  |  |  |  | } | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  | sub msg_init { | 
| 273 | 14 |  |  | 14 | 0 | 32 | my $skip_info="\n".'$pargs->{levelspace}  url=$pargs->{url}'."\n"; | 
| 274 | 14 |  |  |  |  | 483 | shift->{messages} = { | 
| 275 |  |  |  |  |  |  | TestMsg			=> 'Test: $pargs->{TestInfo} $pargs->{TestNum}', | 
| 276 |  |  |  |  |  |  | BookBinaryOK	=> '$pargs->{data_len_KB} $pargs->{write_file}'."\n", | 
| 277 |  |  |  |  |  |  | BookChapterErr	=> ' cannot parse'.$skip_info, | 
| 278 |  |  |  |  |  |  | BookChapterMany	=> '[$pargs->{chapter_num}CH]', | 
| 279 |  |  |  |  |  |  | BookChapterOne	=> '[0001CH]', | 
| 280 |  |  |  |  |  |  | BookChapterOK	=> '$pargs->{data_len_KB}'."\n", | 
| 281 |  |  |  |  |  |  | BookStart		=> '$pargs->{levelspace} [$pargs->{bpos_limit}/$pargs->{book_num}] $pargs->{title_limit} ', | 
| 282 |  |  |  |  |  |  | BookTOCFinish	=> '$pargs->{TOC_len_KB}'."\n", | 
| 283 |  |  |  |  |  |  | CatalogInfo		=> 'Get Catalog: ', | 
| 284 |  |  |  |  |  |  | CatalogResultErr=> ' 0 books'."\n", | 
| 285 |  |  |  |  |  |  | CatalogResultOK	=> ' $pargs->{book_num} books'."\n", | 
| 286 |  |  |  |  |  |  | CatalogURL		=> '$pargs->{url}', | 
| 287 |  |  |  |  |  |  | CatalogURLEmpty	=> '[Fail] catalog url is empty'."\n", | 
| 288 |  |  |  |  |  |  | DBHead			=> <<'DATA', | 
| 289 |  |  |  |  |  |  | #!$pargs->{perlcmd} | 
| 290 |  |  |  |  |  |  | ##====================================== | 
| 291 |  |  |  |  |  |  | ## Auto-generated DB File of $pargs->{classname} | 
| 292 |  |  |  |  |  |  | ##    Create time: $pargs->{createtime} | 
| 293 |  |  |  |  |  |  | ##====================================== | 
| 294 |  |  |  |  |  |  |  | 
| 295 |  |  |  |  |  |  | use $pargs->{classname}; | 
| 296 |  |  |  |  |  |  | my \$bot = new $pargs->{classname}; | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | DATA | 
| 299 |  |  |  |  |  |  | DBCatalogErr	=> ' \$bot->go_catalog({$pargs->{allargs}});'."\t#Err\n", | 
| 300 |  |  |  |  |  |  | DBCatalogOK		=> '#\$bot->go_catalog({$pargs->{allargs}});'."\n", | 
| 301 |  |  |  |  |  |  | DBBookErr		=> "\t".' \$bot->go_book({$pargs->{allargs}});'."\t#Err\n", | 
| 302 |  |  |  |  |  |  | DBBookOK		=> "\t".'#\$bot->go_book({$pargs->{allargs}});'."\n", | 
| 303 |  |  |  |  |  |  | FailClearDB		=> 'Fail to clear DB file $pargs->{filename}: $pargs->{errmsg}', | 
| 304 |  |  |  |  |  |  | FailClose	 	=> 'Fail to close $self->{translate_dict}->{$pargs->{filetype}} file $pargs->{filename}: $pargs->{errmsg}', | 
| 305 |  |  |  |  |  |  | FailMkDir		=> 'Fail to mkdir $pargs->{dir}: $pargs->{errmsg}', | 
| 306 |  |  |  |  |  |  | FailOpen	 	=> 'Fail to open $self->{translate_dict}->{$pargs->{filetype}} file $pargs->{filename}: $pargs->{errmsg}', | 
| 307 |  |  |  |  |  |  | FailWrite	 	=> 'Fail to write $self->{translate_dict}->{$pargs->{filetype}} file $pargs->{filename}: $pargs->{errmsg}', | 
| 308 |  |  |  |  |  |  | GetFail404		=> <<'DATA', | 
| 309 |  |  |  |  |  |  | [$pargs->{code},Fail] No such file | 
| 310 |  |  |  |  |  |  | $pargs->{url_real} | 
| 311 |  |  |  |  |  |  | DATA | 
| 312 |  |  |  |  |  |  | GetFail404Detail=> <<'DATA', | 
| 313 |  |  |  |  |  |  | [$pargs->{code},Fail] No such file | 
| 314 |  |  |  |  |  |  | >>>>Request | 
| 315 |  |  |  |  |  |  | $pargs->{req_content}<<< | 
| 316 |  |  |  |  |  |  | $pargs->{status_line} | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | DATA | 
| 319 |  |  |  |  |  |  | GetFailRetries	=> <<'DATA', | 
| 320 |  |  |  |  |  |  | [$pargs->{code},Fail] Exceed retry limits | 
| 321 |  |  |  |  |  |  | $pargs->{url_real} | 
| 322 |  |  |  |  |  |  | DATA | 
| 323 |  |  |  |  |  |  | GetFailRetriesDetail	=> <<'DATA', | 
| 324 |  |  |  |  |  |  | [$pargs->{code},Fail] Exceed retry limits | 
| 325 |  |  |  |  |  |  | >>>>Request | 
| 326 |  |  |  |  |  |  | $pargs->{req_content}<<< | 
| 327 |  |  |  |  |  |  | $pargs->{status_line} | 
| 328 |  |  |  |  |  |  | $pargs->{res_content} | 
| 329 |  |  |  |  |  |  |  | 
| 330 |  |  |  |  |  |  | DATA | 
| 331 |  |  |  |  |  |  | GetURLRetry		=> '[$pargs->{code}, Retry] ', | 
| 332 |  |  |  |  |  |  | GetURLSuccess	=> '$pargs->{len_KB} ', | 
| 333 |  |  |  |  |  |  | GetWait			=> 'Wait..', | 
| 334 |  |  |  |  |  |  | SkipMaxLevel	=> '[Skip]level>$self->{book_max_levels}'.$skip_info, | 
| 335 |  |  |  |  |  |  | SkipMedia		=> '[Skip]media files'.$skip_info, | 
| 336 |  |  |  |  |  |  | SkipTitleEmpty	=> '[Skip]title is empty'.$skip_info, | 
| 337 |  |  |  |  |  |  | SkipUrlEmpty	=> '[Skip]url is empty'."\n", | 
| 338 |  |  |  |  |  |  | SkipVisited		=> '[Skip]visited'."\n", | 
| 339 |  |  |  |  |  |  | SkipZip			=> '[Skip]zip files'.$skip_info, | 
| 340 |  |  |  |  |  |  | }; | 
| 341 |  |  |  |  |  |  | } | 
| 342 |  |  |  |  |  |  | sub msg_format { | 
| 343 | 27 |  |  | 27 | 0 | 476 | my ($self, $id, $pargs) = @_; | 
| 344 | 27 |  |  |  |  | 2566 | return eval('"'.$self->{messages}->{$id}.'"'); | 
| 345 |  |  |  |  |  |  | } | 
| 346 |  |  |  |  |  |  | sub fatal_error { | 
| 347 | 0 |  |  | 0 | 0 | 0 | croak shift->msg_format(@_); | 
| 348 |  |  |  |  |  |  | } | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 351 |  |  |  |  |  |  | # Encode and decode functions | 
| 352 |  |  |  |  |  |  | #	$bot->en_code($string)							=> $octets | 
| 353 |  |  |  |  |  |  | #	$bot->de_code($octets)							=> $contents | 
| 354 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 355 |  |  |  |  |  |  | sub en_code { | 
| 356 | 84 | 50 |  | 84 | 0 | 693 | return ($_[0]->{language_encode} ne '') | 
| 357 |  |  |  |  |  |  | ? $_[0]->{lang_encode}->encode($_[1]) : $_[1]; | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  | sub de_code { | 
| 360 | 636 | 50 |  | 636 | 0 | 4536 | return ($_[0]->{language_decode} ne '') | 
| 361 |  |  |  |  |  |  | ? $_[0]->{lang_decode}->decode($_[1]) : $_[1]; | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 365 |  |  |  |  |  |  | # File I/O functions | 
| 366 |  |  |  |  |  |  | #	$bot->file_init($filetype, $filename, @contents)	=> N/A | 
| 367 |  |  |  |  |  |  | #	$bot->file_writebin($filetype, $filename, $buf)		=> N/A | 
| 368 |  |  |  |  |  |  | #	$bot->file_add($filetype, $filename, @contents)		=> N/A | 
| 369 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 370 |  |  |  |  |  |  | sub file_init { | 
| 371 | 18 |  |  | 18 | 0 | 53 | my ($self, $filetype, $filename) = (shift, shift, shift); | 
| 372 | 18 |  |  |  |  | 49 | local(*WORK); | 
| 373 | 18 | 50 |  |  |  | 1861 | open(WORK, ">$filename") | 
| 374 |  |  |  |  |  |  | or $self->fatal_error("FailOpen", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 375 | 18 | 100 | 66 |  |  | 198 | binmode(WORK) if (($filetype eq 'result') and $self->{result_no_crlf}) or $filetype eq 'debug'; | 
|  |  |  | 66 |  |  |  |  | 
| 376 | 18 | 50 |  |  |  | 142 | (print WORK @_) | 
| 377 |  |  |  |  |  |  | or $self->fatal_error("FailWrite", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 378 | 18 | 50 |  |  |  | 4992 | close(WORK) | 
| 379 |  |  |  |  |  |  | or $self->fatal_error("FailClose", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 380 |  |  |  |  |  |  | } | 
| 381 |  |  |  |  |  |  | sub file_add { | 
| 382 | 42 |  |  | 42 | 0 | 157 | my ($self, $filetype, $filename) = (shift, shift, shift); | 
| 383 | 42 |  |  |  |  | 106 | local(*WORK); | 
| 384 | 42 | 50 |  |  |  | 2004 | open(WORK, ">>$filename") | 
| 385 |  |  |  |  |  |  | or $self->fatal_error("FailOpen", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 386 | 42 | 100 | 66 |  |  | 322 | binmode(WORK) if (($filetype eq 'result') and $self->{result_no_crlf}) or $filetype eq 'debug'; | 
|  |  |  | 66 |  |  |  |  | 
| 387 | 42 | 50 |  |  |  | 301 | (print WORK @_) | 
| 388 |  |  |  |  |  |  | or $self->fatal_error("FailWrite", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 389 | 42 | 50 |  |  |  | 30275 | close(WORK) | 
| 390 |  |  |  |  |  |  | or $self->fatal_error("FailClose", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 391 |  |  |  |  |  |  | } | 
| 392 |  |  |  |  |  |  | sub file_writebin { | 
| 393 | 6 |  |  | 6 | 0 | 18 | my ($self, $filetype, $filename) = (shift, shift, shift); | 
| 394 | 6 |  |  |  |  | 18 | local(*WORK); | 
| 395 | 6 | 50 |  |  |  | 785 | sysopen(WORK, $filename, O_WRONLY|O_TRUNC|O_CREAT) | 
| 396 |  |  |  |  |  |  | or $self->fatal_error("FailOpen", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 397 | 6 | 50 |  |  |  | 4314 | syswrite(WORK, $_[0], 200000000) | 
| 398 |  |  |  |  |  |  | or $self->fatal_error("FailWrite", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 399 | 6 | 50 |  |  |  | 98 | close(WORK) | 
| 400 |  |  |  |  |  |  | or $self->fatal_error("FailClose", {filetype=>$filetype, filename=>$filename, errmsg=>$!}); | 
| 401 |  |  |  |  |  |  | } | 
| 402 |  |  |  |  |  |  |  | 
| 403 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 404 |  |  |  |  |  |  | # Logging functions | 
| 405 |  |  |  |  |  |  | #	$bot->log_msg($contents)						=> N/A | 
| 406 |  |  |  |  |  |  | #	$bot->log_msgen($contents)						=> N/A | 
| 407 |  |  |  |  |  |  | #	$bot->log_add($msgid, \%args)					=> N/A | 
| 408 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 409 |  |  |  |  |  |  | sub log_msg { | 
| 410 | 24 |  |  | 24 | 0 | 43 | my $self = shift; | 
| 411 | 24 |  |  |  |  | 50 | $|=1; | 
| 412 | 24 |  |  |  |  | 2598 | print @_; | 
| 413 | 24 |  |  |  |  | 107 | $self->file_add( "log", $self->{file_log}, @_ ); | 
| 414 |  |  |  |  |  |  | } | 
| 415 |  |  |  |  |  |  | sub log_msgen { | 
| 416 | 3 |  |  | 3 | 0 | 37 | my $self = shift; | 
| 417 | 3 |  |  |  |  | 25 | $self->log_msg( $self->en_code($_[0]) ); | 
| 418 |  |  |  |  |  |  | } | 
| 419 |  |  |  |  |  |  | sub log_add { | 
| 420 | 12 |  |  | 12 | 0 | 26 | my $self = shift; | 
| 421 | 12 |  |  |  |  | 44 | $self->log_msg(	$self->msg_format(@_) ); | 
| 422 |  |  |  |  |  |  | } | 
| 423 |  |  |  |  |  |  |  | 
| 424 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 425 |  |  |  |  |  |  | # Result output functions | 
| 426 |  |  |  |  |  |  | #	$bot->result_filename(\%args)					=> filename | 
| 427 |  |  |  |  |  |  | #	$bot->result_init(\%args)						=> filename | 
| 428 |  |  |  |  |  |  | #	$bot->result_add($filename, $content_en)		=> N/A | 
| 429 |  |  |  |  |  |  | #	$bot->result_adden($filename, $content_de)		=> N/A | 
| 430 |  |  |  |  |  |  | #	$bot->result_settime($filename, $curtime)		=> N/A | 
| 431 |  |  |  |  |  |  | #	$bot->string2time($str)							=> $time | 
| 432 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 433 |  |  |  |  |  |  | sub result_filename { | 
| 434 | 18 |  |  | 18 | 0 | 87 | my ($self, $pargs) = @_; | 
| 435 | 18 |  |  |  |  | 82 | my $filename=$self->result_filestem($pargs).".".$pargs->{ext_save}; | 
| 436 | 18 |  |  |  |  | 96 | $filename=$self->de_code($filename); | 
| 437 | 18 |  |  |  |  | 161 | $filename=~s/[\\\/\:\*\?\"<>\|]//g;	# remove banned characters | 
| 438 | 18 |  |  |  |  | 64 | $filename=$self->en_code($filename); | 
| 439 | 18 |  |  |  |  | 147 | return catfile($self->{work_dir}, $filename); | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  | sub result_init { | 
| 442 | 6 |  |  | 6 | 0 | 19 | my $self = shift; | 
| 443 | 6 |  |  |  |  | 26 | my $filename = $self->result_filename(@_); | 
| 444 | 6 |  |  |  |  | 33 | $self->file_init( "result", $filename ); | 
| 445 | 6 |  |  |  |  | 21 | return $filename; | 
| 446 |  |  |  |  |  |  | } | 
| 447 |  |  |  |  |  |  | sub result_add { | 
| 448 | 9 |  |  | 9 | 0 | 24 | my $self = shift; | 
| 449 | 9 |  |  |  |  | 17 | my $filename = shift; | 
| 450 | 9 |  |  |  |  | 36 | $self->file_add( "result", $filename, $_[0] ); | 
| 451 |  |  |  |  |  |  | } | 
| 452 |  |  |  |  |  |  | sub result_adden { | 
| 453 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 454 | 0 |  |  |  |  | 0 | my $filename = shift; | 
| 455 | 0 |  |  |  |  | 0 | $self->file_add( "result", $filename, $self->en_code($_[0]) ); | 
| 456 |  |  |  |  |  |  | } | 
| 457 |  |  |  |  |  |  | sub result_settime { | 
| 458 | 3 |  |  | 3 | 0 | 11 | my ($self, $filename, $curtime) = @_; | 
| 459 | 3 |  |  |  |  | 126 | utime $curtime, $curtime, $filename; | 
| 460 |  |  |  |  |  |  | } | 
| 461 |  |  |  |  |  |  | sub string2time { | 
| 462 | 0 |  |  | 0 | 0 | 0 | str2time($_[1]); | 
| 463 |  |  |  |  |  |  | } | 
| 464 |  |  |  |  |  |  |  | 
| 465 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 466 |  |  |  |  |  |  | # DB functions | 
| 467 |  |  |  |  |  |  | #	$bot->db_init									=> N/A | 
| 468 |  |  |  |  |  |  | #	$bot->db_clear									=> N/A | 
| 469 |  |  |  |  |  |  | #	$bot->db_load									=> N/A | 
| 470 |  |  |  |  |  |  | #	$bot->db_add($type, $result, \%args)			=> N/A | 
| 471 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 472 |  |  |  |  |  |  | sub db_init { | 
| 473 | 27 |  |  | 27 | 0 | 48 | my $self = shift; | 
| 474 | 27 |  |  |  |  | 72 | $self->{DB_visited_book}={}; | 
| 475 | 27 | 100 |  |  |  | 820 | return if -f $self->{file_DB}; | 
| 476 |  |  |  |  |  |  |  | 
| 477 | 9 |  |  |  |  | 20 | my $perlcmd = $^X; | 
| 478 | 9 |  |  |  |  | 24 | $perlcmd=~s/\\/\//g; | 
| 479 | 9 |  |  |  |  | 81 | $self->file_init( "DB", $self->{file_DB}, $self->msg_format( "DBHead", { | 
| 480 |  |  |  |  |  |  | perlcmd=>$perlcmd, | 
| 481 |  |  |  |  |  |  | classname=>$self->{classname}, | 
| 482 |  |  |  |  |  |  | createtime=>HTTP::Date::time2iso(time), | 
| 483 |  |  |  |  |  |  | })); | 
| 484 |  |  |  |  |  |  | } | 
| 485 |  |  |  |  |  |  | sub db_clear { | 
| 486 | 3 |  |  | 3 | 0 | 8 | my $self = shift; | 
| 487 | 3 |  |  |  |  | 10 | my $filename = $self->{file_DB}; | 
| 488 | 3 | 50 |  |  |  | 234 | unlink($filename) | 
| 489 |  |  |  |  |  |  | or $self->fatal_error("FailClearDB", {filename=>$filename, errmsg=>$!}); | 
| 490 |  |  |  |  |  |  | } | 
| 491 |  |  |  |  |  |  | sub db_load { | 
| 492 | 24 |  |  | 24 | 0 | 41 | my $self = shift; | 
| 493 | 24 |  |  |  |  | 51 | my $filename = $self->{file_DB}; | 
| 494 | 24 |  |  |  |  | 75 | local(*WORK); | 
| 495 | 24 | 50 |  |  |  | 1878 | open(WORK, $filename) | 
| 496 |  |  |  |  |  |  | or $self->fatal_error("FailOpen", {filetype=>"DB", filename=>$filename, errmsg=>$!}); | 
| 497 | 24 |  |  |  |  | 46 | my ($type, $result, $url, $paras); | 
| 498 | 24 |  |  |  |  | 625 | while() { | 
| 499 | 231 | 100 |  |  |  | 793 | if( /(.)\$bot->go_([^\(]*)\((.*)\)/ ) { | 
| 500 | 15 |  |  |  |  | 56 | $type=$2; | 
| 501 | 15 | 50 |  |  |  | 61 | $result=($1 eq '#') ? 'OK' : 'Err'; | 
| 502 | 15 | 50 |  |  |  | 107 | $url=($3=~/url=>\'([^\']*)\'/) ? $1 : ''; | 
| 503 | 15 |  |  |  |  | 156 | $self->{"DB_visited_$type"}->{$url}=$result; | 
| 504 |  |  |  |  |  |  | } | 
| 505 |  |  |  |  |  |  | } | 
| 506 | 24 | 50 |  |  |  | 355 | close(WORK) | 
| 507 |  |  |  |  |  |  | or $self->fatal_error("FailClose", {filetype=>"DB", filename=>$filename, errmsg=>$!}); | 
| 508 |  |  |  |  |  |  | } | 
| 509 |  |  |  |  |  |  | sub db_add { | 
| 510 | 3 |  |  | 3 | 0 | 9 | my ($self, $type, $result, $pargs) = @_; | 
| 511 | 3 | 50 |  |  |  | 16 | return if not defined($pargs->{url}); | 
| 512 | 3 |  |  |  |  | 10 | my $url = $pargs->{url}; | 
| 513 | 3 |  |  |  |  | 99 | my $allargs = ""; | 
| 514 | 3 |  |  |  |  | 7 | my $str; | 
| 515 | 3 |  |  |  |  | 24 | foreach (sort keys %$pargs) { | 
| 516 | 6 |  |  |  |  | 12 | $str=$pargs->{$_}; | 
| 517 | 6 |  |  |  |  | 13 | $str=~s/\'/\\\'/g; | 
| 518 | 6 |  |  |  |  | 17 | $allargs .= "$_=>'$str', "; | 
| 519 |  |  |  |  |  |  | } | 
| 520 | 3 |  |  |  |  | 17 | $self->{"DB_visited_$type"}->{$url}=$result; | 
| 521 | 3 |  |  |  |  | 25 | $self->file_add( "DB", $self->{file_DB}, | 
| 522 |  |  |  |  |  |  | $self->msg_format( "DB".ucfirst($type)."$result", {allargs=>$allargs}) ); | 
| 523 |  |  |  |  |  |  | } | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 526 |  |  |  |  |  |  | # Agent functions | 
| 527 |  |  |  |  |  |  | #	$bot->agent_init								=> N/A | 
| 528 |  |  |  |  |  |  | #	$bot->agent_setproxy($ua, $proxy_name)			=> $proxy_settings | 
| 529 |  |  |  |  |  |  | #		$proxy_name eq "No"				Don't use proxy | 
| 530 |  |  |  |  |  |  | #		$proxy_name eq "Default"		suggest IE's default proxy | 
| 531 |  |  |  |  |  |  | #		$proxy_name eq "192.168.1.5"	defined proxy | 
| 532 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 533 |  |  |  |  |  |  | sub agent_init { | 
| 534 | 24 |  |  | 24 | 0 | 46 | my $self = shift; | 
| 535 | 24 |  |  |  |  | 431 | my $cookies=new WWW::BookBot::FakeCookies; | 
| 536 | 24 |  |  |  |  | 768 | $self->{get_agent_cur} = 0; | 
| 537 | 24 | 100 |  |  |  | 115 | $self->{get_lasturl} = "" if not(defined($self->{get_lasturl})); | 
| 538 | 24 |  |  |  |  | 74 | $self->{get_agent_array} = []; | 
| 539 | 24 |  |  |  |  | 354 | foreach ( split /;/, $self->{get_agent_proxy} ) { | 
| 540 | 27 |  |  |  |  | 501 | my $ua=new LWP::UserAgent; | 
| 541 | 27 |  |  |  |  | 19637 | $ua->agent( $self->{get_agent_name} ); | 
| 542 | 27 |  |  |  |  | 1447 | $self->agent_setproxy( $ua, $_ ); | 
| 543 | 27 |  |  |  |  | 121 | $ua->timeout( $self->{get_timeout} ); | 
| 544 | 27 |  |  |  |  | 374 | $ua->cookie_jar( $cookies ); | 
| 545 | 27 |  |  |  |  | 2365 | push @{ $ua->requests_redirectable }, 'POST'; | 
|  | 27 |  |  |  |  | 100 |  | 
| 546 | 27 |  |  |  |  | 229 | push @{$self->{get_agent_array}}, $ua; | 
|  | 27 |  |  |  |  | 120 |  | 
| 547 |  |  |  |  |  |  | } | 
| 548 |  |  |  |  |  |  | } | 
| 549 |  |  |  |  |  |  | sub agent_setproxy { | 
| 550 | 27 |  |  | 27 | 0 | 63 | my ($self, $ua, $proxy_name) = @_; | 
| 551 | 27 |  |  |  |  | 71 | $proxy_name=~s/ //g;	# Remove spaces | 
| 552 | 27 | 100 | 66 |  |  | 193 | return '' if $proxy_name eq 'No' or $proxy_name eq '';	#Don't use proxy | 
| 553 |  |  |  |  |  |  |  | 
| 554 |  |  |  |  |  |  | # Check assigned proxy | 
| 555 | 24 | 100 |  |  |  | 83 | if( $proxy_name ne 'Default' ) { | 
| 556 | 3 |  |  |  |  | 26 | $ua->proxy(['http','https','ftp'],"http://$proxy_name/"); | 
| 557 | 3 |  |  |  |  | 882 | return $proxy_name; | 
| 558 |  |  |  |  |  |  | } | 
| 559 |  |  |  |  |  |  |  | 
| 560 |  |  |  |  |  |  | # Check Win32::TieRegistry module | 
| 561 | 21 |  |  |  |  | 34 | my $ie_proxy_ok=0; | 
| 562 | 21 |  |  |  |  | 39 | my %RegHash; | 
| 563 | 21 |  |  |  |  | 30 | my $win32_registry='use Win32::TieRegistry(Delimiter=>"/", TiedHash => \%RegHash);'; | 
| 564 | 21 | 50 |  |  |  | 101 | if( $^O eq 'MSWin32' ) { | 
| 565 | 0 |  |  |  |  | 0 | eval $win32_registry; | 
| 566 | 0 | 0 |  |  |  | 0 | $ie_proxy_ok=1 if not $@; | 
| 567 |  |  |  |  |  |  | } | 
| 568 | 21 | 50 |  |  |  | 70 | return "" if not $ie_proxy_ok; | 
| 569 |  |  |  |  |  |  |  | 
| 570 |  |  |  |  |  |  | # Get IE registry | 
| 571 | 0 |  |  |  |  | 0 | my($iekey, $ie_proxy_enable, $ie_proxy_server); | 
| 572 | 0 | 0 |  |  |  | 0 | $iekey=$RegHash{"CUser/Software/Microsoft/Windows/CurrentVersion/Internet Settings/"} | 
| 573 |  |  |  |  |  |  | or return ""; | 
| 574 | 0 | 0 |  |  |  | 0 | $ie_proxy_enable=$iekey->{"/ProxyEnable"} or return ""; | 
| 575 | 0 | 0 |  |  |  | 0 | $ie_proxy_server=$iekey->{"/ProxyServer"} or return ""; | 
| 576 | 0 | 0 |  |  |  | 0 | return "" if !($ie_proxy_enable=~/1$/); | 
| 577 |  |  |  |  |  |  |  | 
| 578 |  |  |  |  |  |  | # Set LWP proxy | 
| 579 | 0 | 0 |  |  |  | 0 | if($ie_proxy_server=~/;/) { | 
| 580 |  |  |  |  |  |  | #Multiple proxies, such as ftp=192.168.1.3:8080;...;https=192.168.1.3:8080 | 
| 581 | 0 |  |  |  |  | 0 | foreach (split(/;/, $ie_proxy_server)) { | 
| 582 | 0 | 0 |  |  |  | 0 | next if $_ eq ''; | 
| 583 | 0 | 0 |  |  |  | 0 | $ua->proxy($1,"http://$2/") if /^(.*?)=(.*?)$/; | 
| 584 |  |  |  |  |  |  | } | 
| 585 |  |  |  |  |  |  | }else{ | 
| 586 |  |  |  |  |  |  | #Single proxy, such as 192.168.1.3:8080 | 
| 587 | 0 |  |  |  |  | 0 | $ua->proxy(['http','https','ftp'],"http://$ie_proxy_server/"); | 
| 588 |  |  |  |  |  |  | } | 
| 589 | 0 |  |  |  |  | 0 | return $ie_proxy_server; | 
| 590 |  |  |  |  |  |  | } | 
| 591 |  |  |  |  |  |  |  | 
| 592 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 593 |  |  |  |  |  |  | # string utility functions | 
| 594 |  |  |  |  |  |  | #	$bot->url_rel2abs($relative, $baseurl)			=> $absolute_url | 
| 595 |  |  |  |  |  |  | #	$bot->len_KB(95632)								=> "   93K" | 
| 596 |  |  |  |  |  |  | #	$bot->string_limit($str, $limit_len)			=> $str_limit | 
| 597 |  |  |  |  |  |  | #		'abc', 4	-> 'abc ' | 
| 598 |  |  |  |  |  |  | #		'abcdef', 4	-> 'abcd' | 
| 599 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 600 |  |  |  |  |  |  | sub url_rel2abs { | 
| 601 | 9 |  |  | 9 | 0 | 97 | return URI->new_abs($_[1], URI->new($_[2]))->as_string; | 
| 602 |  |  |  |  |  |  | } | 
| 603 |  |  |  |  |  |  | sub len_KB { | 
| 604 | 9 |  |  | 9 | 0 | 19 | my ($self, $len)=@_; | 
| 605 | 9 | 50 |  |  |  | 26 | if($len<1024) { | 
|  |  | 0 |  |  |  |  |  | 
| 606 | 9 |  |  |  |  | 50 | return sprintf("%5dB", $len); | 
| 607 |  |  |  |  |  |  | } elsif($len<9999*1024) { | 
| 608 | 0 |  |  |  |  | 0 | return sprintf("%5dK", $len/1024); | 
| 609 |  |  |  |  |  |  | } else { | 
| 610 | 0 |  |  |  |  | 0 | return sprintf("%5dM", $len/(1024*1024)); | 
| 611 |  |  |  |  |  |  | } | 
| 612 |  |  |  |  |  |  | } | 
| 613 |  |  |  |  |  |  | sub string_limit { | 
| 614 | 0 |  |  | 0 | 0 | 0 | my ($self, $str, $limit_len)=@_; | 
| 615 | 0 |  |  |  |  | 0 | $str=$self->de_code($str); | 
| 616 | 0 |  |  |  |  | 0 | my ($i, $j); | 
| 617 | 0 |  |  |  |  | 0 | for($i=0, $j=0; $i | 
| 618 | 0 | 0 |  |  |  | 0 | if( ord(substr($str, $i, 1))>=128 ) { | 
| 619 | 0 | 0 |  |  |  | 0 | last if $j+2>$limit_len; | 
| 620 | 0 |  |  |  |  | 0 | $j+=2; | 
| 621 |  |  |  |  |  |  | }else{ | 
| 622 | 0 | 0 |  |  |  | 0 | last if $j+1>$limit_len; | 
| 623 | 0 |  |  |  |  | 0 | $j++; | 
| 624 |  |  |  |  |  |  | } | 
| 625 |  |  |  |  |  |  | } | 
| 626 | 0 |  |  |  |  | 0 | $str=substr($str, 0, $i).(" " x ($limit_len-$j)); | 
| 627 | 0 |  |  |  |  | 0 | return $self->en_code($str); | 
| 628 |  |  |  |  |  |  | } | 
| 629 |  |  |  |  |  |  |  | 
| 630 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 631 |  |  |  |  |  |  | # fetch functions | 
| 632 |  |  |  |  |  |  | #	$bot->get_url($url)								=> $res | 
| 633 |  |  |  |  |  |  | #	$bot->get_url_request($url, $pargs)				=> $res | 
| 634 |  |  |  |  |  |  | #		$bot->get_url_request($url) | 
| 635 |  |  |  |  |  |  | #		$bot->get_url_request($url, {method='post', form={var=>value, var=>value}}) | 
| 636 |  |  |  |  |  |  | #	$bot->get_fail($msgid, $res)					=> N/A | 
| 637 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 638 |  |  |  |  |  |  | sub get_url { | 
| 639 | 0 |  |  | 0 | 0 | 0 | my ($self, $url) = @_; | 
| 640 | 0 |  |  |  |  | 0 | my $res; | 
| 641 | 0 |  |  |  |  | 0 | $self->get_url_verify($url);	#verify or change $url before real work | 
| 642 | 0 |  |  |  |  | 0 | my %info=(url=>$url);			#for display messages | 
| 643 | 0 |  |  |  |  | 0 | my $wait_msg=$self->msg_format('GetWait', {});	#perpare for wait message | 
| 644 |  |  |  |  |  |  |  | 
| 645 | 0 |  |  |  |  | 0 | for(my $i=$self->{get_max_retries}; $i>0; $i--) { | 
| 646 |  |  |  |  |  |  | #issues real request | 
| 647 | 0 |  |  |  |  | 0 | $res=$self->get_url_request($url); | 
| 648 |  |  |  |  |  |  |  | 
| 649 |  |  |  |  |  |  | #record infos for display messages | 
| 650 | 0 |  |  |  |  | 0 | $info{retry}=$i; | 
| 651 | 0 |  |  |  |  | 0 | $info{code}=$res->code; | 
| 652 | 0 |  |  |  |  | 0 | $info{len}=length($res->content); | 
| 653 | 0 |  |  |  |  | 0 | $info{len_KB}=$self->len_KB($info{len}); | 
| 654 |  |  |  |  |  |  |  | 
| 655 |  |  |  |  |  |  | # display before sleep | 
| 656 | 0 | 0 |  |  |  | 0 | $self->log_add("GetURLSuccess", \%info) if $res->is_success; | 
| 657 | 0 | 0 |  |  |  | 0 | $self->get_fail("GetFail404", $res) if $res->code==404; | 
| 658 |  |  |  |  |  |  |  | 
| 659 | 0 | 0 |  |  |  | 0 | if($self->{get_delay_second}>0) { | 
| 660 |  |  |  |  |  |  | # sleep if needed | 
| 661 | 0 |  |  |  |  | 0 | print $wait_msg; | 
| 662 | 0 |  |  |  |  | 0 | sleep $self->{get_delay_second}+rand($self->{get_delay_second_rand}); | 
| 663 | 0 |  |  |  |  | 0 | print "\b" x length($wait_msg); | 
| 664 |  |  |  |  |  |  | } | 
| 665 |  |  |  |  |  |  |  | 
| 666 |  |  |  |  |  |  | # return or display after sleep | 
| 667 | 0 | 0 | 0 |  |  | 0 | return $res if $res->is_success or $res->code==404; | 
| 668 | 0 | 0 |  |  |  | 0 | $self->log_add("GetURLRetry", \%info) if $i>1; | 
| 669 |  |  |  |  |  |  | } | 
| 670 | 0 |  |  |  |  | 0 | $self->get_fail("GetFailRetries", $res); | 
| 671 | 0 |  |  |  |  | 0 | return $res; | 
| 672 |  |  |  |  |  |  | } | 
| 673 |  |  |  |  |  |  | sub get_url_request { | 
| 674 | 0 |  |  | 0 | 0 | 0 | my ($self, $url, $pargs) = @_; | 
| 675 |  |  |  |  |  |  |  | 
| 676 |  |  |  |  |  |  | # prepare for parameters | 
| 677 | 0 | 0 |  |  |  | 0 | $pargs={} if not(ref($pargs)); | 
| 678 | 0 | 0 |  |  |  | 0 | $self->{get_lasturl}=$url if $self->{get_lasturl} eq ''; | 
| 679 | 0 |  |  |  |  | 0 | $self->{get_visited_url_num}++; | 
| 680 | 0 | 0 |  |  |  | 0 | $url='file:'.$self->{get_file_directory_read}.'/'.$self->{get_visited_url_num}.'.htm' if $self->{get_from_file}; | 
| 681 | 0 |  |  |  |  | 0 | my $agent=$self->{get_agent_array}->[$self->{get_agent_cur}]; | 
| 682 | 0 | 0 |  |  |  | 0 | my $method=defined($pargs->{method}) ? $pargs->{method} : 'get'; | 
| 683 | 0 |  |  |  |  | 0 | my %header=( | 
| 684 |  |  |  |  |  |  | 'Accept'			=>	'*/*', | 
| 685 |  |  |  |  |  |  | 'Referer'			=>	$self->{get_lasturl}, | 
| 686 |  |  |  |  |  |  | 'Accept-Language'	=>	$self->{get_language}, | 
| 687 |  |  |  |  |  |  | ); | 
| 688 |  |  |  |  |  |  |  | 
| 689 |  |  |  |  |  |  | # trunk display vars | 
| 690 | 0 |  |  |  |  | 0 | my $get_trunk_size=$self->{get_trunk_size}; | 
| 691 | 0 |  |  |  |  | 0 | my $screen_limit_trunk=$self->{screen_limit_trunk}; | 
| 692 | 0 |  |  |  |  | 0 | my $get_trunk_fresh_size=$self->{get_trunk_fresh_size}; | 
| 693 | 0 |  |  |  |  | 0 | my ($first_run, $expected_length, $expected_trunks, $bytes_received, $trunk_received)=(1,0,0,0,0); | 
| 694 | 0 |  |  |  |  | 0 | my @trunk_statuschar=qw(- \ | /); | 
| 695 | 0 |  |  |  |  | 0 | my ($trunk_status, $trunk_now, $trunk_refresh_now, $trunk_ceil, $trunk_floor)=(0, 0,0,0,0); | 
| 696 | 0 |  |  |  |  | 0 | my ($str_display, $bak_number); | 
| 697 |  |  |  |  |  |  |  | 
| 698 |  |  |  |  |  |  | # get it | 
| 699 |  |  |  |  |  |  | my $res = $agent->request( | 
| 700 |  |  |  |  |  |  | $method eq 'get' ? GET($url, %header) : POST($url, $pargs->{form}, %header) , | 
| 701 |  |  |  |  |  |  | sub { | 
| 702 | 0 | 0 |  | 0 |  | 0 | if($first_run) { | 
| 703 |  |  |  |  |  |  | # first get, then caculate expected length and print | 
| 704 | 0 |  |  |  |  | 0 | $first_run=0; | 
| 705 | 0 |  |  |  |  | 0 | $expected_length=$_[1]->headers->content_length; | 
| 706 | 0 | 0 |  |  |  | 0 | if($expected_length>0) { | 
| 707 | 0 |  |  |  |  | 0 | $expected_trunks=ceil($expected_length/$get_trunk_size); | 
| 708 | 0 | 0 |  |  |  | 0 | $expected_trunks=$screen_limit_trunk if $expected_trunks>$screen_limit_trunk; | 
| 709 | 0 |  |  |  |  | 0 | $str_display="." x $expected_trunks; | 
| 710 | 0 |  |  |  |  | 0 | $str_display.=" " x ($screen_limit_trunk-$expected_trunks); | 
| 711 | 0 | 0 |  |  |  | 0 | $str_display.=$self->len_KB($expected_length) if $expected_length>0; | 
| 712 | 0 |  |  |  |  | 0 | $str_display.="\b" x length($str_display); | 
| 713 | 0 |  |  |  |  | 0 | print $str_display; | 
| 714 |  |  |  |  |  |  | } | 
| 715 |  |  |  |  |  |  | } | 
| 716 | 0 |  |  |  |  | 0 | $bytes_received += length($_[0]); | 
| 717 | 0 |  |  |  |  | 0 | $trunk_ceil=ceil($bytes_received/$get_trunk_size); | 
| 718 | 0 | 0 |  |  |  | 0 | $trunk_ceil=$screen_limit_trunk if $trunk_ceil>$screen_limit_trunk; | 
| 719 | 0 |  |  |  |  | 0 | $trunk_floor=floor($bytes_received/$get_trunk_size); | 
| 720 | 0 | 0 |  |  |  | 0 | $trunk_floor=$screen_limit_trunk-1 if $trunk_floor>$screen_limit_trunk-1; | 
| 721 | 0 |  |  |  |  | 0 | $str_display=""; | 
| 722 | 0 |  |  |  |  | 0 | $bak_number=0; | 
| 723 | 0 | 0 |  |  |  | 0 | if($trunk_floor>$trunk_received) { | 
| 724 | 0 |  |  |  |  | 0 | $str_display.= ">" x ($trunk_floor - $trunk_received); | 
| 725 |  |  |  |  |  |  | } | 
| 726 | 0 | 0 | 0 |  |  | 0 | if($trunk_floor<$trunk_ceil and $bytes_received>=$trunk_refresh_now+$get_trunk_fresh_size) { | 
| 727 | 0 | 0 |  |  |  | 0 | $trunk_status=0 if ++$trunk_status>=scalar(@trunk_statuschar); | 
| 728 | 0 |  |  |  |  | 0 | $str_display.= $trunk_statuschar[$trunk_status]; | 
| 729 | 0 |  |  |  |  | 0 | $bak_number++; | 
| 730 |  |  |  |  |  |  | } | 
| 731 | 0 | 0 |  |  |  | 0 | if($bytes_received>=$trunk_refresh_now+$get_trunk_fresh_size) { | 
| 732 | 0 | 0 | 0 |  |  | 0 | if($expected_length>0 and $trunk_ceil>=$screen_limit_trunk) { | 
| 733 | 0 |  |  |  |  | 0 | my $trunk_percent=int(100*$bytes_received/$expected_length); | 
| 734 | 0 | 0 |  |  |  | 0 | $trunk_percent=100 if $trunk_percent>100; | 
| 735 | 0 |  |  |  |  | 0 | $str_display.= "   $trunk_percent%"; | 
| 736 | 0 |  |  |  |  | 0 | $bak_number+=length($trunk_percent)+4; | 
| 737 |  |  |  |  |  |  | } | 
| 738 | 0 |  |  |  |  | 0 | $trunk_refresh_now=$bytes_received; | 
| 739 |  |  |  |  |  |  | } | 
| 740 | 0 |  |  |  |  | 0 | print $str_display. "\b" x $bak_number; | 
| 741 | 0 |  |  |  |  | 0 | $trunk_received=$trunk_floor; | 
| 742 | 0 |  |  |  |  | 0 | $_[1]->add_content(\$_[0]); | 
| 743 |  |  |  |  |  |  | }, | 
| 744 | 0 | 0 |  |  |  | 0 | ); | 
| 745 |  |  |  |  |  |  |  | 
| 746 |  |  |  |  |  |  | # print rest trunks | 
| 747 | 0 |  |  |  |  | 0 | print ">" x ($trunk_ceil - $trunk_received); | 
| 748 | 0 | 0 |  |  |  | 0 | print " " x ($screen_limit_trunk - $trunk_ceil) if $res->is_success; | 
| 749 |  |  |  |  |  |  |  | 
| 750 |  |  |  |  |  |  | # status processing | 
| 751 | 0 | 0 |  |  |  | 0 | $self->{get_lasturl}=$url if $res->is_success; | 
| 752 | 0 | 0 |  |  |  | 0 | $self->{get_agent_cur}=0 if ++$self->{get_agent_cur} >= @{$self->{get_agent_array}}; | 
|  | 0 |  |  |  |  | 0 |  | 
| 753 | 0 | 0 | 0 |  |  | 0 | if( $res->is_success and $self->{get_save_file} ){ | 
| 754 |  |  |  |  |  |  | #debug writing | 
| 755 | 0 |  |  |  |  | 0 | $self->file_writebin('debug', | 
| 756 |  |  |  |  |  |  | catfile($self->{get_file_directory_save}, $self->{get_visited_url_num}.'.htm'), | 
| 757 |  |  |  |  |  |  | $res->content); | 
| 758 |  |  |  |  |  |  | } | 
| 759 | 0 |  |  |  |  | 0 | return $res; | 
| 760 |  |  |  |  |  |  | } | 
| 761 |  |  |  |  |  |  | sub get_fail { | 
| 762 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 763 | 0 |  |  |  |  | 0 | my $msgid = shift; | 
| 764 | 0 |  |  |  |  | 0 | my $url_real=""; | 
| 765 | 0 | 0 |  |  |  | 0 | $url_real=$_[0]->request->uri->as_string if defined($_[0]->request->uri); | 
| 766 | 0 |  |  |  |  | 0 | $self->log_add($msgid.$self->{get_fail_showtype}, { | 
| 767 |  |  |  |  |  |  | code		=>	$_[0]->code, | 
| 768 |  |  |  |  |  |  | req_content	=>	$_[0]->request->as_string, | 
| 769 |  |  |  |  |  |  | status_line	=>	$_[0]->status_line, | 
| 770 |  |  |  |  |  |  | res_content	=>	$_[0]->as_string, | 
| 771 |  |  |  |  |  |  | url_real	=>	$url_real, | 
| 772 |  |  |  |  |  |  | }); | 
| 773 |  |  |  |  |  |  | } | 
| 774 |  |  |  |  |  |  |  | 
| 775 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 776 |  |  |  |  |  |  | # Parser utility functions | 
| 777 |  |  |  |  |  |  | #	$bot->normalize_space($content_dein_deout)			=> N/A | 
| 778 |  |  |  |  |  |  | #	$bot->remove_html($content_dein_deout)				=> N/A | 
| 779 |  |  |  |  |  |  | #	$bot->decode_entity($content_dein_deout)			=> N/A | 
| 780 |  |  |  |  |  |  | #	$bot->normalize_paragraph_1($content_dein_deout)	=> N/A | 
| 781 |  |  |  |  |  |  | #	$bot->parse_title($content_dein_deout)				=> $content_deout | 
| 782 |  |  |  |  |  |  | #	$bot->parse_titleen($content_dein_deout)			=> $content_enout | 
| 783 |  |  |  |  |  |  | #	$bot->normalize_paragraph($content_dein_deout)		=> N/A | 
| 784 |  |  |  |  |  |  | #		$bot->remove_line_by_end($content_dein_deout) | 
| 785 |  |  |  |  |  |  | #		$bot->parse_paragraph_br($content_dein_deout) | 
| 786 |  |  |  |  |  |  | #		$bot->parse_paragraph_brbr($content_dein_deout) | 
| 787 |  |  |  |  |  |  | #		$bot->parse_paragraph_brandspace($content_dein_deout) | 
| 788 |  |  |  |  |  |  | #		$bot->parse_paragraph_brbr_or_brandspace($content_dein_deout) | 
| 789 |  |  |  |  |  |  | #		$bot->parse_paragraph_cr($content_dein_deout) | 
| 790 |  |  |  |  |  |  | #		$bot->parse_paragraph_crcr($content_dein_deout) | 
| 791 |  |  |  |  |  |  | #		$bot->parse_paragraph_crandspace($content_dein_deout) | 
| 792 |  |  |  |  |  |  | #		$bot->remove_leadingspace($content_dein_deout) | 
| 793 |  |  |  |  |  |  | #		$bot->remove_innerspace($content_dein_deout) | 
| 794 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 795 |  |  |  |  |  |  | sub normalize_space { | 
| 796 | 24 |  |  | 24 | 0 | 134 | $_[1]=~s/$_[0]->{patterns}->{space}/ /osg; | 
| 797 | 24 |  |  |  |  | 106 | $_[1]=~s/$_[0]->{patterns}->{space2}/  /osg; | 
| 798 |  |  |  |  |  |  | } | 
| 799 |  |  |  |  |  |  | sub remove_html { | 
| 800 | 24 |  |  | 24 | 0 | 413 | $_[1]=~s/$_[0]->{patterns}->{remove_html}//osg; | 
| 801 | 24 |  |  |  |  | 93 | $_[1]=~s/<[^<>]*>//osg; | 
| 802 |  |  |  |  |  |  | } | 
| 803 |  |  |  |  |  |  | sub decode_entity { | 
| 804 | 16 |  |  | 16 | 0 | 43 | $_[1]=~s/(?:&\#(\d{1,5});?)/chr($1)/esg; | 
|  | 4 |  |  |  |  | 20 |  | 
| 805 | 16 |  |  |  |  | 30 | $_[1]=~s/(?:&\#[xX]([0-9a-fA-F]{1,5});?)/chr(hex($1))/esg; | 
|  | 0 |  |  |  |  | 0 |  | 
| 806 | 16 | 50 |  |  |  | 36 | $_[1]=~s/(&([0-9a-zA-Z]{1,9});?)/$entity2char{$2} or $1/esg; | 
|  | 2 |  |  |  |  | 18 |  | 
| 807 |  |  |  |  |  |  | } | 
| 808 |  |  |  |  |  |  | sub normalize_paragraph_1 { | 
| 809 | 33 |  |  | 33 | 0 | 118 | $_[1]=~s/^ +/ /mg;								#normalize spaces before paragraph | 
| 810 | 33 |  |  |  |  | 94 | $_[1]=~s/ +$//mg;								#remove spaces after paragraph | 
| 811 | 33 |  |  |  |  | 256 | $_[1]=~s/^ ?(?:$_[0]->{patterns}->{mark_dash} *){2,}/ ---/omg; | 
| 812 |  |  |  |  |  |  | #normalize repeated dash | 
| 813 | 33 |  |  |  |  | 198 | $_[1]=~s/\n{2,}/\n/sg;							#remove empty paragraph | 
| 814 | 33 |  |  |  |  | 66 | $_[1]=~s/(?: ---\n?){2,}/ ---\n/sg;				#remove too much dash line | 
| 815 | 33 |  |  |  |  | 110 | $_[1]=~s/(?:^\n|\n$)//s;						#remove leading and ending \n | 
| 816 | 33 |  |  |  |  | 528 | $_[1]=~s/$_[0]->{patterns}->{word_finish}//os;	#remove finish words | 
| 817 | 33 |  |  |  |  | 79 | $_[1]=~s/\n$//s;								#remove ending \n | 
| 818 |  |  |  |  |  |  | } | 
| 819 |  |  |  |  |  |  | sub parse_title { | 
| 820 | 12 |  |  | 12 | 0 | 43 | $_[0]->normalize_space($_[1]); | 
| 821 | 12 |  |  |  |  | 39 | $_[0]->remove_html($_[1]); | 
| 822 | 12 |  |  |  |  | 36 | $_[0]->decode_entity($_[1]); | 
| 823 | 12 |  |  |  |  | 42 | $_[1]=~s/\n+/ /sg;	# CRLF as space | 
| 824 | 12 |  |  |  |  | 35 | $_[0]->normalize_paragraph_1($_[1]); | 
| 825 | 12 |  |  |  |  | 46 | $_[1]=~s/ +/ /sg;	#remove extra spaces | 
| 826 |  |  |  |  |  |  |  | 
| 827 |  |  |  |  |  |  | #remove ending space or wordsplit mark | 
| 828 | 12 |  |  |  |  | 37 | my $p1=$_[0]->{patterns}->{mark_wordsplit}; | 
| 829 | 12 |  |  |  |  | 90 | $p1=~s/(?:^\[|\]$)//sg; | 
| 830 | 12 |  |  |  |  | 36 | $p1="[".$p1." ]"; | 
| 831 | 12 |  |  |  |  | 110 | $_[1]=~s/$p1+$//os; | 
| 832 |  |  |  |  |  |  |  | 
| 833 |  |  |  |  |  |  | #remove paraentheses | 
| 834 | 12 |  |  |  |  | 59 | $_[1]=~s/(?:^ +| +$)//sg; | 
| 835 | 12 |  |  |  |  | 294 | while($_[1]=~/^(?:$_[0]->{patterns}->{parentheses})$/os) { | 
| 836 | 3 |  |  |  |  | 10 | $_[1]=$^N; | 
| 837 | 3 |  |  |  |  | 25 | $_[1]=~s/(?:^ +| +$)//sg; | 
| 838 |  |  |  |  |  |  | } | 
| 839 |  |  |  |  |  |  |  | 
| 840 | 12 |  |  |  |  | 50 | $_[1]; | 
| 841 |  |  |  |  |  |  | } | 
| 842 |  |  |  |  |  |  | sub parse_titleen { | 
| 843 | 6 |  |  | 6 | 0 | 28 | $_[0]->en_code($_[0]->parse_title($_[1])); | 
| 844 |  |  |  |  |  |  | } | 
| 845 |  |  |  |  |  |  | sub normalize_paragraph { | 
| 846 | 9 |  |  | 9 | 0 | 42 | $_[0]->normalize_space($_[1]); | 
| 847 | 9 |  |  |  |  | 44 | $_[0]->parse_paragraph_begin($_[1]); | 
| 848 | 9 |  |  |  |  | 35 | my $sub="parse_paragraph_".$_[0]->{text_paragraph_type}; | 
| 849 | 9 |  |  |  |  | 50 | $_[0]->$sub($_[1]); | 
| 850 | 9 |  |  |  |  | 28 | $_[0]->remove_html($_[1]); | 
| 851 | 9 |  |  |  |  | 37 | $_[0]->decode_entity($_[1]); | 
| 852 | 9 |  |  |  |  | 31 | $_[0]->normalize_paragraph_1($_[1]); | 
| 853 | 9 |  |  |  |  | 38 | $_[0]->remove_line_by_end($_[1]); | 
| 854 | 9 |  |  |  |  | 26 | $_[0]->normalize_paragraph_1($_[1]); | 
| 855 | 9 |  |  |  |  | 42 | $_[0]->parse_paragraph_end($_[1]); | 
| 856 | 9 |  |  |  |  | 15 | $_[1]=~s/ ?\$BOOKBOTRETURN\$//sg;		#remove for reserved return | 
| 857 | 9 |  |  |  |  | 42 | $sub='$_[1]=~s/^ /'.$_[0]->{patterns}->{line_head}.'/mg;'; #normalize with 4 spaces | 
| 858 | 9 |  |  |  |  | 913 | eval $sub; | 
| 859 |  |  |  |  |  |  | } | 
| 860 |  |  |  |  |  |  | sub remove_line_by_end { | 
| 861 | 9 |  |  | 9 | 0 | 467 | $_[1]=~s/(?:---\n|\n).*(?:$_[0]->{patterns}->{remove_line_by_end})$_[0]->{patterns}->{parentheses_right}?$//omg; | 
| 862 | 9 |  |  |  |  | 124 | $_[1]=~s/\n $_[0]->{patterns}->{remove_line_by_end_special}$//osg; | 
| 863 |  |  |  |  |  |  | } | 
| 864 |  |  |  |  |  |  | sub parse_paragraph_br { | 
| 865 | 9 |  |  | 9 | 0 | 68 | $_[1]=~s/\n//sg; | 
| 866 | 9 |  |  |  |  | 37 | $_[1]=~s/<[bB][rR]> */\n /sg; | 
| 867 |  |  |  |  |  |  | } | 
| 868 |  |  |  |  |  |  | sub parse_paragraph_brbr { | 
| 869 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/\n//sg; | 
| 870 | 0 |  |  |  |  | 0 | $_[1]=~s/(?:<[bB][rR]> *){2,}/\n /sg; | 
| 871 |  |  |  |  |  |  | } | 
| 872 |  |  |  |  |  |  | sub parse_paragraph_brandspace { | 
| 873 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/\n//sg; | 
| 874 | 0 |  |  |  |  | 0 | $_[1]=~s/<[bB][rR]>(?=[^ ])//sg; | 
| 875 | 0 |  |  |  |  | 0 | $_[1]=~s/<[bB][rR]> */\n /sg; | 
| 876 |  |  |  |  |  |  | } | 
| 877 |  |  |  |  |  |  | sub parse_paragraph_brbr_or_brandspace { | 
| 878 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/\n//sg; | 
| 879 | 0 |  |  |  |  | 0 | $_[1]=~s/<[bB][rR]>(?=[^ <])//sg; | 
| 880 | 0 |  |  |  |  | 0 | $_[1]=~s/(?:<[bB][rR]> *){1,}/\n /sg; | 
| 881 |  |  |  |  |  |  | } | 
| 882 |  |  |  |  |  |  | sub parse_paragraph_br_or_p { | 
| 883 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/\n/ /sg; | 
| 884 | 0 |  |  |  |  | 0 | $_[1]=~s/<[bB\/][rRpP]>/\n/sg; | 
| 885 |  |  |  |  |  |  | } | 
| 886 | 0 |  |  | 0 | 0 | 0 | sub parse_paragraph_cr { | 
| 887 |  |  |  |  |  |  | } | 
| 888 |  |  |  |  |  |  | sub parse_paragraph_crcr { | 
| 889 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/(?<=[^\n])\n(?=[^\n])//sg;		#remove single \n | 
| 890 | 0 |  |  |  |  | 0 | $_[1]=~s/\n{2,}/\n/sg;					#change multiple \n into one \n | 
| 891 |  |  |  |  |  |  | } | 
| 892 |  |  |  |  |  |  | sub parse_paragraph_crandspace { | 
| 893 | 0 |  |  | 0 | 0 | 0 | $_[1]=~s/\n(?=[^ ])//sg;				#remove \n without " " followed | 
| 894 |  |  |  |  |  |  | } | 
| 895 |  |  |  |  |  |  | sub remove_leadingspace { | 
| 896 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 897 | 0 | 0 |  |  |  | 0 | return if not $self->{space_leading_remove}; | 
| 898 | 0 |  |  |  |  | 0 | for(my $i=$self->{space_leading_max}; $i>0; $i--) { | 
| 899 | 0 |  |  |  |  | 0 | my $spaces=" " x $i; | 
| 900 | 0 |  |  |  |  | 0 | my $linefollow=$spaces."[^ ].*?\n"; | 
| 901 | 0 | 0 |  |  |  | 0 | if( $_[0]=~/\n$spaces .*?\n$linefollow$linefollow$linefollow/o ) { | 
| 902 | 0 |  |  |  |  | 0 | $_[0]=~s/\n$spaces/\n/og; | 
| 903 | 0 |  |  |  |  | 0 | return; | 
| 904 |  |  |  |  |  |  | } | 
| 905 |  |  |  |  |  |  | } | 
| 906 |  |  |  |  |  |  | } | 
| 907 |  |  |  |  |  |  | sub remove_innerspace { | 
| 908 | 0 |  |  | 0 | 0 | 0 | my $self = shift; | 
| 909 | 0 | 0 |  |  |  | 0 | return if not $self->{space_inner_remove}; | 
| 910 |  |  |  |  |  |  |  | 
| 911 | 0 |  |  |  |  | 0 | my $pattern='\w ' x $self->{space_inner_min_words}; | 
| 912 | 0 | 0 |  |  |  | 0 | return if not $_[0]=~/$pattern/o; | 
| 913 |  |  |  |  |  |  |  | 
| 914 | 0 |  |  |  |  | 0 | my ($str) = @_; | 
| 915 | 0 |  |  |  |  | 0 | $pattern=$self->{patterns}->{mark_wordsplit}; | 
| 916 | 0 |  |  |  |  | 0 | $pattern=~s/\|//og; | 
| 917 | 0 |  |  |  |  | 0 | $_[0]=~s/(?<=[^$pattern\s])\s(?=\S)//omg; | 
| 918 | 0 |  |  |  |  | 0 | $_[0]=~s/(?<=\S)\s\s(?=\S)/ /omg; | 
| 919 |  |  |  |  |  |  | } | 
| 920 |  |  |  |  |  |  |  | 
| 921 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 922 |  |  |  |  |  |  | # Parser main functions | 
| 923 |  |  |  |  |  |  | #	$bot->go_catalog(\%args)							=> $booknum | 
| 924 |  |  |  |  |  |  | #		url | 
| 925 |  |  |  |  |  |  | #	$bot->catalog_get_book(\%args, $content_dein)		=> @books | 
| 926 |  |  |  |  |  |  | #	$bot->catalog_get_next(\%args, $content_dein)		=> $booknum, 0-no next | 
| 927 |  |  |  |  |  |  | #	$bot->go_book(\%args)								=> $content_de | 
| 928 |  |  |  |  |  |  | #		level, bpos, book_num, url, title | 
| 929 |  |  |  |  |  |  | #	$bot->book_html(\%args, $content_enin_deout)		=> N/A | 
| 930 |  |  |  |  |  |  | #	$bot->book_text(\%args, $content_enin_deout)		=> N/A | 
| 931 |  |  |  |  |  |  | #	$bot->book_bin(\%args, $content_enin_deout)			=> N/A | 
| 932 |  |  |  |  |  |  | #	$bot->book_writebin(\%args, $content_bin)			=> $filebasename_de | 
| 933 |  |  |  |  |  |  | #	$bot->book_chapters(\%args, $content_dein)			=> @chapters | 
| 934 |  |  |  |  |  |  | #		() means this is a chapter or wrong | 
| 935 |  |  |  |  |  |  | #		({url=>..., title=>..., ...}, {}) means chapters | 
| 936 |  |  |  |  |  |  | #	$bot->TOC_parser(\%args, $content_dein_deout)		=> N/A | 
| 937 |  |  |  |  |  |  | #	$bot->chapter_process(\%args, $content_dein_deout)	=> N/A | 
| 938 |  |  |  |  |  |  | #	$bot->chapter_parser(\%args, $content_dein_deout)	=> N/A | 
| 939 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 940 |  |  |  |  |  |  | sub go_catalog { | 
| 941 | 0 |  |  | 0 | 0 | 0 | my ($self, $pargs) = @_; | 
| 942 | 0 | 0 |  |  |  | 0 | $pargs={} if not(ref($pargs)); | 
| 943 | 0 |  |  |  |  | 0 | my %args_orig=%$pargs;							#keep original args | 
| 944 | 0 |  |  |  |  | 0 | $pargs->{url}=$self->msg_format('CatalogURL', $pargs); | 
| 945 | 0 |  |  |  |  | 0 | $self->log_add('CatalogInfo', $pargs); | 
| 946 |  |  |  |  |  |  |  | 
| 947 |  |  |  |  |  |  | # Get it | 
| 948 | 0 | 0 |  |  |  | 0 | if( $pargs->{url} eq '' ) { | 
| 949 | 0 |  |  |  |  | 0 | $self->log_add('CatalogURLEmpty', $pargs); | 
| 950 | 0 |  |  |  |  | 0 | return 0; | 
| 951 |  |  |  |  |  |  | } | 
| 952 | 0 |  |  |  |  | 0 | my $res=$self->get_url($pargs->{url}); | 
| 953 | 0 | 0 |  |  |  | 0 | if( not($res->is_success) ) { | 
| 954 | 0 |  |  |  |  | 0 | $self->db_add("catalog", "Err", \%args_orig); | 
| 955 | 0 |  |  |  |  | 0 | return 0; | 
| 956 |  |  |  |  |  |  | } | 
| 957 | 0 |  |  |  |  | 0 | $pargs->{url_real}=$res->request->uri->as_string; | 
| 958 | 0 |  |  |  |  | 0 | $pargs->{url_base}=$res->base->as_string; | 
| 959 | 0 |  |  |  |  | 0 | my $str=$self->de_code($res->content); | 
| 960 | 0 |  |  |  |  | 0 | undef $res; | 
| 961 | 0 |  |  |  |  | 0 | $str=~s/\r\n|\r/\n/sg; | 
| 962 | 0 |  |  |  |  | 0 | my $str_all=$str; | 
| 963 |  |  |  |  |  |  |  | 
| 964 |  |  |  |  |  |  | # Parse books | 
| 965 | 0 | 0 |  |  |  | 0 | $str=~s/^.*?$self->{patterns}->{catalog_head}//os if $self->{patterns}->{catalog_head} ne ''; | 
| 966 | 0 | 0 |  |  |  | 0 | $str=~s/$self->{patterns}->{catalog_end}.*$//os if $self->{patterns}->{catalog_end} ne ''; | 
| 967 | 0 |  |  |  |  | 0 | my @books=$self->catalog_get_book($pargs, $str); | 
| 968 | 0 |  |  |  |  | 0 | undef $str; | 
| 969 | 0 |  |  |  |  | 0 | $pargs->{book_num}=scalar(@books); | 
| 970 | 0 | 0 |  |  |  | 0 | $self->log_add('CatalogResult'.($pargs->{book_num}>0 ? 'OK' : 'Err'), $pargs); | 
| 971 | 0 | 0 |  |  |  | 0 | $self->db_add("catalog", $pargs->{book_num}>0 ? 'OK' : 'Err', \%args_orig); | 
| 972 |  |  |  |  |  |  |  | 
| 973 |  |  |  |  |  |  | # Parse next area | 
| 974 | 0 |  |  |  |  | 0 | my $go_next=$self->catalog_get_next($pargs, $str_all); | 
| 975 | 0 |  |  |  |  | 0 | undef $str_all; | 
| 976 |  |  |  |  |  |  |  | 
| 977 |  |  |  |  |  |  | # Get books | 
| 978 | 0 |  |  |  |  | 0 | for(my $bpos=0; $bpos<$pargs->{book_num}; $bpos++) { | 
| 979 | 0 |  |  |  |  | 0 | $books[$bpos]->{book_num}=$pargs->{book_num}; | 
| 980 | 0 |  |  |  |  | 0 | $books[$bpos]->{bpos}=$bpos+1; | 
| 981 | 0 |  |  |  |  | 0 | $self->go_book($books[$bpos]); | 
| 982 |  |  |  |  |  |  | } | 
| 983 |  |  |  |  |  |  |  | 
| 984 | 0 |  |  |  |  | 0 | return $go_next; | 
| 985 |  |  |  |  |  |  | } | 
| 986 |  |  |  |  |  |  | sub catalog_get_book { | 
| 987 | 6 |  |  | 6 | 0 | 40 | my ($self, $pargs) = (shift, shift); | 
| 988 | 6 |  |  |  |  | 12 | my @books = (); | 
| 989 | 6 |  |  |  |  | 237 | while($_[0]=~/$self->{patterns}->{catalog_get_bookargs}/osg) { | 
| 990 | 3 |  |  |  |  | 8 | my $pargs1={}; | 
| 991 | 3 | 50 |  |  |  | 27 | next if $self->catalog_get_bookargs($pargs1, $1, $2, $3, $4, $5, $6, $7, $8, $9) eq 'Skip'; | 
| 992 | 3 |  |  |  |  | 18 | $pargs1->{url}=$self->url_rel2abs($pargs1->{url}, $pargs->{url_base}); | 
| 993 | 3 |  |  |  |  | 1197 | push @books, $pargs1; | 
| 994 |  |  |  |  |  |  | } | 
| 995 | 6 |  |  |  |  | 29 | return @books; | 
| 996 |  |  |  |  |  |  | } | 
| 997 |  |  |  |  |  |  | sub catalog_get_next { | 
| 998 | 0 |  |  | 0 | 0 | 0 | my ($self, $pargs) = (shift, shift); | 
| 999 | 0 | 0 |  |  |  | 0 | return $pargs->{book_num} if $self->{patterns}->{catalog_get_next} eq ''; | 
| 1000 | 0 | 0 |  |  |  | 0 | return $_[0]=~/$self->{patterns}->{catalog_get_next}/os ? $pargs->{book_num} : 0; | 
| 1001 |  |  |  |  |  |  | } | 
| 1002 |  |  |  |  |  |  | sub go_book { | 
| 1003 | 0 |  |  | 0 | 0 | 0 | my ($self, $pargs) = @_; | 
| 1004 | 0 | 0 |  |  |  | 0 | $pargs->{level}=0 if $pargs->{level} eq ''; | 
| 1005 | 0 | 0 |  |  |  | 0 | my %args_orig=%$pargs if $pargs->{level}==0;	#keep original args | 
| 1006 | 0 | 0 |  |  |  | 0 | $pargs->{bpos}=0 if $pargs->{bpos} eq ''; | 
| 1007 | 0 |  |  |  |  | 0 | $pargs->{url}=~s/\#.*$//;						#Remove references in url | 
| 1008 | 0 |  |  |  |  | 0 | $pargs->{levelspace} = "  " x $pargs->{level};	#caculate spaces to purify log output | 
| 1009 | 0 |  |  |  |  | 0 | $pargs->{bpos_limit}=sprintf("%0".length($pargs->{book_num})."d", $pargs->{bpos}); | 
| 1010 | 0 |  |  |  |  | 0 | $pargs->{title_limit}=$self->string_limit($pargs->{title}, $self->{screen_limit_title}); | 
| 1011 | 0 |  |  |  |  | 0 | $self->log_add("BookStart", $pargs); | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 |  |  |  |  |  |  | # Skip some special urls | 
| 1014 | 0 | 0 |  |  |  | 0 | if($pargs->{level}>$self->{book_max_levels}) | 
|  | 0 |  |  |  |  | 0 |  | 
| 1015 | 0 |  |  |  |  | 0 | {$self->log_add("SkipMaxLevel", $pargs); return "";} | 
| 1016 | 0 | 0 |  |  |  | 0 | if($pargs->{title} eq '') | 
|  | 0 |  |  |  |  | 0 |  | 
| 1017 | 0 |  |  |  |  | 0 | {$self->log_add("SkipTitleEmpty", $pargs); return "";} | 
| 1018 | 0 | 0 |  |  |  | 0 | if($pargs->{url} eq '') | 
|  | 0 |  |  |  |  | 0 |  | 
| 1019 | 0 |  |  |  |  | 0 | {$self->log_add("SkipUrlEmpty", $pargs); return "";} | 
| 1020 | 0 | 0 | 0 |  |  | 0 | if($self->{get_skip_zip} and $pargs->{url}=~/(?:$self->{patterns}->{postfix_zip})$/i) | 
|  | 0 |  |  |  |  | 0 |  | 
| 1021 | 0 |  |  |  |  | 0 | {$self->log_add("SkipZip", $pargs); return "";} | 
| 1022 | 0 | 0 | 0 |  |  | 0 | if($self->{get_skip_media} and $pargs->{url}=~/(?:$self->{patterns}->{postfix_media})$/i) | 
|  | 0 |  |  |  |  | 0 |  | 
| 1023 | 0 |  |  |  |  | 0 | {$self->log_add("SkipMedia", $pargs); return "";} | 
| 1024 | 0 | 0 |  |  |  | 0 | if(defined($self->{DB_visited_book}->{$pargs->{url}})) | 
|  | 0 |  |  |  |  | 0 |  | 
| 1025 | 0 |  |  |  |  | 0 | {$self->log_add("SkipVisited", $pargs); return "";} | 
| 1026 |  |  |  |  |  |  |  | 
| 1027 |  |  |  |  |  |  | # Get URL | 
| 1028 | 0 |  |  |  |  | 0 | my $res=$self->get_url($pargs->{url}); | 
| 1029 | 0 | 0 |  |  |  | 0 | if(not($res->is_success)) { | 
| 1030 | 0 | 0 |  |  |  | 0 | $self->db_add("book", "Err", \%args_orig) if $pargs->{level}==0; | 
| 1031 | 0 |  |  |  |  | 0 | return ""; | 
| 1032 |  |  |  |  |  |  | } | 
| 1033 | 0 |  |  |  |  | 0 | my $str=$res->content; | 
| 1034 | 0 |  |  |  |  | 0 | $pargs->{content_type}=$res->headers->content_type; | 
| 1035 | 0 |  |  |  |  | 0 | $pargs->{content_len}=length($str); | 
| 1036 | 0 |  |  |  |  | 0 | $pargs->{last_modified}=$res->headers->last_modified; | 
| 1037 | 0 |  |  |  |  | 0 | $pargs->{last_modified_str}=HTTP::Date::time2iso($res->headers->last_modified); | 
| 1038 | 0 |  |  |  |  | 0 | $pargs->{url_real}=$res->request->uri->as_string; | 
| 1039 | 0 |  |  |  |  | 0 | $pargs->{url_base}=$res->base->as_string; | 
| 1040 | 0 |  |  |  |  | 0 | my $url1=$pargs->{url_real}; | 
| 1041 | 0 |  |  |  |  | 0 | $url1=~s/\?.*$//; | 
| 1042 | 0 | 0 |  |  |  | 0 | $pargs->{ext_real}=($url1=~/\.([^\.]+)$/) ? lc($1) : ""; | 
| 1043 | 0 |  |  |  |  | 0 | $pargs->{ext_save}=$pargs->{ext_real}; | 
| 1044 | 0 | 0 |  |  |  | 0 | if( $pargs->{ext_save}=~/^(?:$self->{patterns}->{postfix_free}|)$/ ) { | 
| 1045 |  |  |  |  |  |  | # file extension cannot be confirmed, since it's cgi | 
| 1046 | 0 |  |  |  |  | 0 | $pargs->{ext_save}=$self->{Content_Type}->{$pargs->{content_type}};	#try to new_bot via content-type | 
| 1047 | 0 | 0 |  |  |  | 0 | $pargs->{ext_save}='txt' if $pargs->{ext_save} eq '';				#add default txt if fail | 
| 1048 |  |  |  |  |  |  | } | 
| 1049 | 0 |  |  |  |  | 0 | undef $res; | 
| 1050 |  |  |  |  |  |  |  | 
| 1051 |  |  |  |  |  |  | # html/text/bin | 
| 1052 | 0 | 0 |  |  |  | 0 | if( $pargs->{content_len}>0 ) { | 
| 1053 | 0 | 0 |  |  |  | 0 | if($pargs->{ext_real} eq 'txt') { | 
|  |  | 0 |  |  |  |  |  | 
| 1054 | 0 |  |  |  |  | 0 | $self->book_text($pargs, $str); | 
| 1055 |  |  |  |  |  |  | } elsif($pargs->{content_type} eq 'text/html') { | 
| 1056 | 0 |  |  |  |  | 0 | $self->book_html($pargs, $str); | 
| 1057 |  |  |  |  |  |  | } else { | 
| 1058 | 0 |  |  |  |  | 0 | $self->book_bin($pargs, $str); | 
| 1059 |  |  |  |  |  |  | } | 
| 1060 |  |  |  |  |  |  | } | 
| 1061 | 0 | 0 |  |  |  | 0 | $self->db_add("book", (length($str)==0) ? 'Err' : 'OK', \%args_orig) if $pargs->{level}==0; | 
|  |  | 0 |  |  |  |  |  | 
| 1062 | 0 |  |  |  |  | 0 | return $str; | 
| 1063 |  |  |  |  |  |  | } | 
| 1064 |  |  |  |  |  |  | sub book_html { | 
| 1065 | 0 |  |  | 0 | 0 | 0 | my ($self, $pargs) = (shift, shift); | 
| 1066 |  |  |  |  |  |  |  | 
| 1067 |  |  |  |  |  |  | # check 1 chapter or more, return if 1 chapter | 
| 1068 | 0 |  |  |  |  | 0 | $_[0]=$self->de_code($_[0]); | 
| 1069 | 0 |  |  |  |  | 0 | $_[0]=~s/\r\n|\r/\n/og; | 
| 1070 | 0 |  |  |  |  | 0 | my @chapters=$self->book_chapters($pargs, $_[0]); | 
| 1071 | 0 |  |  |  |  | 0 | $pargs->{chapter_num}=scalar(@chapters); | 
| 1072 | 0 |  |  |  |  | 0 | $pargs->{chapter_num_limit}=sprintf("%04d", $pargs->{chapter_num}); | 
| 1073 | 0 | 0 |  |  |  | 0 | if( $pargs->{chapter_num}==0 ){ | 
| 1074 | 0 |  |  |  |  | 0 | $self->log_add("BookChapterOne", $pargs); | 
| 1075 | 0 |  |  |  |  | 0 | $self->chapter_process($pargs, $_[0]); | 
| 1076 | 0 |  |  |  |  | 0 | return; | 
| 1077 |  |  |  |  |  |  | } | 
| 1078 | 0 |  |  |  |  | 0 | $self->log_add("BookChapterMany", $pargs); | 
| 1079 |  |  |  |  |  |  |  | 
| 1080 |  |  |  |  |  |  | # initialize result file to put TOC | 
| 1081 | 0 |  |  |  |  | 0 | $self->{book_get_num}++; | 
| 1082 | 0 |  |  |  |  | 0 | my $filename=$self->result_init($pargs); | 
| 1083 | 0 |  |  |  |  | 0 | $pargs->{filename}=$filename; | 
| 1084 |  |  |  |  |  |  |  | 
| 1085 |  |  |  |  |  |  | # parse TOC and save it | 
| 1086 | 0 |  |  |  |  | 0 | $self->TOC_parser($pargs, $_[0]); | 
| 1087 | 0 |  |  |  |  | 0 | my $out_en=$self->en_code($_[0]); | 
| 1088 | 0 |  |  |  |  | 0 | $self->result_add($filename, $out_en); | 
| 1089 | 0 |  |  |  |  | 0 | $pargs->{TOC_len}=length($out_en); | 
| 1090 | 0 |  |  |  |  | 0 | $pargs->{TOC_len_KB}=$self->len_KB($pargs->{TOC_len}); | 
| 1091 | 0 |  |  |  |  | 0 | $self->log_add("BookTOCFinish", $pargs); | 
| 1092 |  |  |  |  |  |  |  | 
| 1093 |  |  |  |  |  |  | # parse other chapters | 
| 1094 | 0 |  |  |  |  | 0 | for(my $bpos=0; $bpos<$pargs->{chapter_num}; $bpos++) { | 
| 1095 | 0 |  |  |  |  | 0 | $chapters[$bpos]->{level}=$pargs->{level}+1; | 
| 1096 | 0 |  |  |  |  | 0 | $chapters[$bpos]->{book_num}=$pargs->{chapter_num}; | 
| 1097 | 0 |  |  |  |  | 0 | $chapters[$bpos]->{bpos}=$bpos+1; | 
| 1098 | 0 |  |  |  |  | 0 | $self->result_add( $filename, "\n\n" ); | 
| 1099 | 0 |  |  |  |  | 0 | $self->result_adden( $filename, $self->go_book($chapters[$bpos]) ); | 
| 1100 |  |  |  |  |  |  | } | 
| 1101 | 0 |  |  |  |  | 0 | $self->book_finish($pargs); | 
| 1102 |  |  |  |  |  |  |  | 
| 1103 |  |  |  |  |  |  | # finish work | 
| 1104 | 0 |  |  |  |  | 0 | my $result_time=$self->result_time($pargs); | 
| 1105 | 0 | 0 |  |  |  | 0 | $self->result_settime($filename, $result_time) if defined($result_time); | 
| 1106 |  |  |  |  |  |  | } | 
| 1107 |  |  |  |  |  |  | sub book_text { | 
| 1108 | 6 |  |  | 6 | 0 | 14 | my ($self, $pargs) = (shift, shift); | 
| 1109 | 6 |  |  |  |  | 22 | $_[0]=$self->de_code($_[0]); | 
| 1110 | 6 |  |  |  |  | 44 | $_[0]=~s/\r\n|\r/\n/og; | 
| 1111 | 6 |  |  |  |  | 32 | $self->chapter_process($pargs, $_[0]); | 
| 1112 |  |  |  |  |  |  | } | 
| 1113 |  |  |  |  |  |  | sub book_bin { | 
| 1114 | 3 |  |  | 3 | 0 | 9 | my ($self, $pargs) = (shift, shift); | 
| 1115 | 3 |  |  |  |  | 10 | $pargs->{data_len}=length($_[0]); | 
| 1116 | 3 |  |  |  |  | 20 | $pargs->{data_len_KB}=$self->len_KB($pargs->{data_len}); | 
| 1117 | 3 | 50 |  |  |  | 15 | return if $pargs->{data_len}==0; | 
| 1118 | 3 |  |  |  |  | 14 | $pargs->{write_file}=$self->book_writebin($pargs, $_[0]); | 
| 1119 | 3 |  |  |  |  | 28 | $self->log_add("BookBinaryOK", $pargs); | 
| 1120 | 3 |  |  |  |  | 39 | $_[0]="[$pargs->{write_file}]"; | 
| 1121 |  |  |  |  |  |  | } | 
| 1122 |  |  |  |  |  |  | sub book_writebin { | 
| 1123 | 6 |  |  | 6 | 0 | 19 | my ($self, $pargs) = (shift, shift); | 
| 1124 | 6 |  |  |  |  | 24 | my $filename=$self->result_filename($pargs); | 
| 1125 | 6 |  |  |  |  | 41 | $self->file_writebin("result", $filename, $_[0]); | 
| 1126 | 6 |  |  |  |  | 39 | my $result_time=$self->result_time($pargs); | 
| 1127 | 6 | 50 |  |  |  | 21 | $self->result_settime($filename, $result_time) if defined($result_time); | 
| 1128 | 6 |  |  |  |  | 306 | return $self->de_code(basename($filename)); | 
| 1129 |  |  |  |  |  |  | } | 
| 1130 |  |  |  |  |  |  | sub book_chapters { | 
| 1131 | 6 |  |  | 6 | 0 | 54 | my ($self, $pargs, $str) = @_; | 
| 1132 | 6 | 50 |  |  |  | 28 | return () if $self->{book_has_chapters}==0; | 
| 1133 | 6 | 50 | 33 |  |  | 31 | return () if $self->{patterns}->{TOC_exists} ne '' and not($str=~/$self->{patterns}->{TOC_exists}/os); | 
| 1134 | 6 | 50 |  |  |  | 132 | $str=~s/^.*?$self->{patterns}->{chapters_head}//os if $self->{patterns}->{chapters_head} ne ''; | 
| 1135 | 6 | 50 |  |  |  | 89 | $str=~s/$self->{patterns}->{chapters_end}.*$//os if $self->{patterns}->{chapters_end} ne ''; | 
| 1136 |  |  |  |  |  |  |  | 
| 1137 | 6 |  |  |  |  | 15 | my @chapters = (); | 
| 1138 | 6 |  |  |  |  | 272 | while($str=~/$self->{patterns}->{chapters_get_chapterargs}/oisg) { | 
| 1139 | 3 |  |  |  |  | 12 | my $pargs1={}; | 
| 1140 | 3 | 50 |  |  |  | 32 | next if $self->chapters_get_chapterargs($pargs1, $1, $2, $3, $4, $5, $6, $7, $8, $9) eq 'Skip'; | 
| 1141 | 3 |  |  |  |  | 15 | $pargs1->{url}=$self->url_rel2abs($pargs1->{url}, $pargs->{url_base}); | 
| 1142 | 3 |  |  |  |  | 852 | push @chapters, $pargs1; | 
| 1143 |  |  |  |  |  |  | } | 
| 1144 | 6 |  |  |  |  | 30 | return @chapters; | 
| 1145 |  |  |  |  |  |  | } | 
| 1146 |  |  |  |  |  |  | sub TOC_parser { | 
| 1147 | 0 | 0 |  | 0 | 0 | 0 | $_[2]=~s/^.*?$_[0]->{patterns}->{TOC_head}//os if $_[0]->{patterns}->{TOC_head} ne ''; | 
| 1148 | 0 | 0 |  |  |  | 0 | $_[2]=~s/$_[0]->{patterns}->{TOC_end}.*$//os if $_[0]->{patterns}->{TOC_end} ne ''; | 
| 1149 | 0 |  |  |  |  | 0 | $_[0]->normalize_paragraph($_[2]); | 
| 1150 |  |  |  |  |  |  | } | 
| 1151 |  |  |  |  |  |  | sub chapter_process { | 
| 1152 | 6 |  |  | 6 | 0 | 14 | my ($self, $pargs) = (shift, shift); | 
| 1153 | 6 |  |  |  |  | 27 | $self->chapter_parser($pargs, $_[0]); | 
| 1154 | 6 |  |  |  |  | 30 | my $out_en=$self->en_code($_[0]); | 
| 1155 | 6 |  |  |  |  | 19 | $pargs->{data_len}=length($out_en); | 
| 1156 | 6 |  |  |  |  | 23 | $pargs->{data_len_KB}=$self->len_KB($pargs->{data_len}); | 
| 1157 | 6 | 100 | 66 |  |  | 45 | if( $pargs->{level}==0 and $pargs->{data_len}>0 ) {	# save single chapter as a book | 
| 1158 | 3 |  |  |  |  | 11 | $self->{book_get_num}++; | 
| 1159 | 3 |  |  |  |  | 15 | my $filename=$self->result_init($pargs); | 
| 1160 | 3 |  |  |  |  | 12 | $pargs->{filename}=$filename; | 
| 1161 | 3 |  |  |  |  | 16 | $self->result_add($filename, $out_en); | 
| 1162 | 3 |  |  |  |  | 22 | $self->book_finish($pargs); | 
| 1163 | 3 |  |  |  |  | 13 | my $result_time=$self->result_time($pargs); | 
| 1164 | 3 | 50 |  |  |  | 17 | $self->result_settime($filename, $result_time) if defined($result_time); | 
| 1165 | 3 |  |  |  |  | 124 | $pargs->{write_file}=$self->de_code(basename($filename)); | 
| 1166 |  |  |  |  |  |  | } else { | 
| 1167 | 3 |  |  |  |  | 8 | $pargs->{write_file}=""; | 
| 1168 |  |  |  |  |  |  | } | 
| 1169 | 6 | 50 |  |  |  | 48 | $self->log_add("BookChapter".(($pargs->{data_len}>0)?"OK":"Err"), $pargs); | 
| 1170 | 6 | 100 | 66 |  |  | 71 | $_[0]="[$pargs->{write_file}]" if $pargs->{level}==0 and $pargs->{data_len}>0; | 
| 1171 |  |  |  |  |  |  | } | 
| 1172 |  |  |  |  |  |  | sub chapter_parser { | 
| 1173 | 6 | 50 |  | 6 | 0 | 175 | $_[2]=~s/^.*?$_[0]->{patterns}->{chapter_head}//os if $_[0]->{patterns}->{chapter_head} ne ''; | 
| 1174 | 6 | 50 |  |  |  | 98 | $_[2]=~s/$_[0]->{patterns}->{chapter_end}.*$//os if $_[0]->{patterns}->{chapter_end} ne ''; | 
| 1175 | 6 |  |  |  |  | 29 | $_[0]->normalize_paragraph($_[2]); | 
| 1176 |  |  |  |  |  |  | } | 
| 1177 |  |  |  |  |  |  |  | 
| 1178 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 1179 |  |  |  |  |  |  | # pattern initialize functions | 
| 1180 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 1181 |  |  |  |  |  |  | sub getpattern_lists { | 
| 1182 | 21 |  |  | 21 | 0 | 186 | qw( | 
| 1183 |  |  |  |  |  |  | space space2 line_head parentheses mark_dash mark_wordsplit | 
| 1184 |  |  |  |  |  |  | remove_html remove_line_by_end remove_line_by_end_special word_finish | 
| 1185 |  |  |  |  |  |  | postfix_zip postfix_media postfix_img postfix_free | 
| 1186 |  |  |  |  |  |  | catalog_head catalog_end catalog_get_bookargs catalog_get_next | 
| 1187 |  |  |  |  |  |  | chapters_head chapters_end chapters_get_chapterargs | 
| 1188 |  |  |  |  |  |  | TOC_exists TOC_head TOC_end | 
| 1189 |  |  |  |  |  |  | chapter_head chapter_end | 
| 1190 |  |  |  |  |  |  | ); | 
| 1191 |  |  |  |  |  |  | } | 
| 1192 |  |  |  |  |  |  | sub getpattern_space_data { | 
| 1193 | 21 |  |  | 21 | 0 | 128 | <<"DATA"; | 
| 1194 |  |  |  |  |  |  | [\000-\011\013-\037] | 
| 1195 |  |  |  |  |  |  | DATA | 
| 1196 |  |  |  |  |  |  | } | 
| 1197 |  |  |  |  |  |  | sub getpattern_space2_data { | 
| 1198 | 14 |  |  | 14 | 0 | 38 | <<"DATA"; | 
| 1199 |  |  |  |  |  |  | ^\000\000 | 
| 1200 |  |  |  |  |  |  | DATA | 
| 1201 |  |  |  |  |  |  | } | 
| 1202 |  |  |  |  |  |  | sub getpattern_line_head_data { | 
| 1203 | 14 |  |  | 14 | 0 | 36 | '    '; | 
| 1204 |  |  |  |  |  |  | } | 
| 1205 |  |  |  |  |  |  | sub getpattern_parentheses { | 
| 1206 | 21 |  |  | 21 | 0 | 44 | my $self = shift; | 
| 1207 | 21 |  |  |  |  | 24 | my ($left, $right); | 
| 1208 | 21 |  |  |  |  | 31 | my $pattern=''; | 
| 1209 | 21 |  |  |  |  | 42 | my $pattern_left=''; | 
| 1210 | 21 |  |  |  |  | 30 | my $pattern_right=''; | 
| 1211 | 21 |  |  |  |  | 94 | foreach(split /\r\n|\r|\n/, $self->de_code($self->getpattern_parentheses_data)) { | 
| 1212 | 371 | 50 |  |  |  | 805 | next if $_ eq ''; | 
| 1213 | 371 |  |  |  |  | 860 | ($left, $right)=split(/ /, $_); | 
| 1214 | 371 |  |  |  |  | 720 | $pattern.=$left."(.*)".$right."|"; | 
| 1215 | 371 |  |  |  |  | 445 | $pattern_left.=$left; | 
| 1216 | 371 |  |  |  |  | 443 | $pattern_right.=$right; | 
| 1217 |  |  |  |  |  |  | } | 
| 1218 | 21 |  |  |  |  | 174 | $pattern=~s/\|$//; | 
| 1219 | 21 |  |  |  |  | 86 | $self->{patterns}->{parentheses_left}='['.$pattern_left.']'; | 
| 1220 | 21 |  |  |  |  | 77 | $self->{patterns}->{parentheses_right}='['.$pattern_right.']'; | 
| 1221 | 21 |  |  |  |  | 292 | return $pattern; | 
| 1222 |  |  |  |  |  |  | } | 
| 1223 |  |  |  |  |  |  | sub getpattern_parentheses_data { | 
| 1224 | 21 |  |  | 21 | 0 | 86 | <<'DATA'; | 
| 1225 |  |  |  |  |  |  | \( \) | 
| 1226 |  |  |  |  |  |  | \[ \] | 
| 1227 |  |  |  |  |  |  | \{ \} | 
| 1228 |  |  |  |  |  |  | \" \" | 
| 1229 |  |  |  |  |  |  | \' \' | 
| 1230 |  |  |  |  |  |  | \` \` | 
| 1231 |  |  |  |  |  |  | DATA | 
| 1232 |  |  |  |  |  |  | } | 
| 1233 |  |  |  |  |  |  | sub getpattern_mark_dash_data { | 
| 1234 | 14 |  |  | 14 | 0 | 36 | <<'DATA'; | 
| 1235 |  |  |  |  |  |  | [#-&\*\+\-=@_~] | 
| 1236 |  |  |  |  |  |  | DATA | 
| 1237 |  |  |  |  |  |  | } | 
| 1238 |  |  |  |  |  |  | sub getpattern_mark_wordsplit_data { | 
| 1239 | 14 |  |  | 14 | 0 | 33 | <<'DATA'; | 
| 1240 |  |  |  |  |  |  | [\.\,\?\!\:\;] | 
| 1241 |  |  |  |  |  |  | DATA | 
| 1242 |  |  |  |  |  |  | } | 
| 1243 |  |  |  |  |  |  | sub getpattern_remove_html_data { | 
| 1244 | 21 |  |  | 21 | 0 | 56 | <<'DATA'; | 
| 1245 |  |  |  |  |  |  |  | 
| 1246 |  |  |  |  |  |  |  | 
| 1247 |  |  |  |  |  |  |  | 
| 1248 |  |  |  |  |  |  |  | 
| 1249 |  |  |  |  |  |  |  | 
| 1250 |  |  |  |  |  |  | DATA | 
| 1251 |  |  |  |  |  |  | } | 
| 1252 |  |  |  |  |  |  | sub getpattern_remove_line_by_end_data { | 
| 1253 | 14 |  |  | 14 | 0 | 37 | <<'DATA'; | 
| 1254 |  |  |  |  |  |  | \000 | 
| 1255 |  |  |  |  |  |  | DATA | 
| 1256 |  |  |  |  |  |  | } | 
| 1257 |  |  |  |  |  |  | sub getpattern_remove_line_by_end_special { | 
| 1258 | 21 |  |  | 21 | 0 | 40 | my $self=shift; | 
| 1259 | 21 |  |  |  |  | 84 | my $special=$self->parse_patterns($self->getpattern_remove_line_by_end_special_data); | 
| 1260 | 21 |  |  |  |  | 57 | my $left=$self->{patterns}->{parentheses_left}; | 
| 1261 | 21 |  |  |  |  | 182 | $left=~s/(?:^\[|\]$)//sg; | 
| 1262 | 21 |  |  |  |  | 124 | "[^ \n]{1,8}?[$special][ $left][^\n]*"; | 
| 1263 |  |  |  |  |  |  | } | 
| 1264 |  |  |  |  |  |  | sub getpattern_remove_line_by_end_special_data { | 
| 1265 | 14 |  |  | 14 | 0 | 42 | <<'DATA'; | 
| 1266 |  |  |  |  |  |  | \000 | 
| 1267 |  |  |  |  |  |  | DATA | 
| 1268 |  |  |  |  |  |  | } | 
| 1269 |  |  |  |  |  |  | sub getpattern_word_finish { | 
| 1270 | 21 |  |  | 21 | 0 | 40 | my $self = shift; | 
| 1271 | 21 |  |  |  |  | 26 | my ($t, $result); | 
| 1272 | 21 |  |  |  |  | 38 | $result=" *"; | 
| 1273 | 21 |  |  |  |  | 46 | $t=$self->{patterns}->{parentheses_left}; | 
| 1274 | 21 |  |  |  |  | 104 | $t=~s/\]$//; | 
| 1275 | 21 |  |  |  |  | 52 | $result.=$t; | 
| 1276 | 21 |  |  |  |  | 43 | $t=$self->{patterns}->{mark_dash}; | 
| 1277 | 21 |  |  |  |  | 88 | $t=~s/^\[//; | 
| 1278 | 21 |  |  |  |  | 63 | $result.=$t."? *"; | 
| 1279 |  |  |  |  |  |  |  | 
| 1280 | 21 |  |  |  |  | 84 | $result.="(?:".$self->parse_patterns($self->getpattern_word_finish_data).") *"; | 
| 1281 |  |  |  |  |  |  |  | 
| 1282 | 21 |  |  |  |  | 60 | $t=$self->{patterns}->{parentheses_right}; | 
| 1283 | 21 |  |  |  |  | 100 | $t=~s/\]$//; | 
| 1284 | 21 |  |  |  |  | 48 | $result.=$t; | 
| 1285 | 21 |  |  |  |  | 47 | $t=$self->{patterns}->{mark_dash}; | 
| 1286 | 21 |  |  |  |  | 86 | $t=~s/^\[//; | 
| 1287 | 21 |  |  |  |  | 132 | $result.=$t."?\$"; | 
| 1288 |  |  |  |  |  |  | } | 
| 1289 |  |  |  |  |  |  | sub getpattern_word_finish_data { | 
| 1290 | 14 |  |  | 14 | 0 | 44 | <<'DATA'; | 
| 1291 |  |  |  |  |  |  | \000 | 
| 1292 |  |  |  |  |  |  | DATA | 
| 1293 |  |  |  |  |  |  | } | 
| 1294 |  |  |  |  |  |  | sub getpattern_postfix_zip { | 
| 1295 | 21 |  |  | 21 | 0 | 52 | return shift->parse_patterns(<<'DATA'); | 
| 1296 |  |  |  |  |  |  | ^case$ | 
| 1297 |  |  |  |  |  |  | zip | 
| 1298 |  |  |  |  |  |  | r(?:a[rx]|\d\d) | 
| 1299 |  |  |  |  |  |  | z | 
| 1300 |  |  |  |  |  |  | gz | 
| 1301 |  |  |  |  |  |  | t[ga]z | 
| 1302 |  |  |  |  |  |  | 7z | 
| 1303 |  |  |  |  |  |  | a\d\d | 
| 1304 |  |  |  |  |  |  | ace | 
| 1305 |  |  |  |  |  |  | ain | 
| 1306 |  |  |  |  |  |  | akt | 
| 1307 |  |  |  |  |  |  | ap[qx] | 
| 1308 |  |  |  |  |  |  | ar(?:[jc]|) | 
| 1309 |  |  |  |  |  |  | asd | 
| 1310 |  |  |  |  |  |  | bh | 
| 1311 |  |  |  |  |  |  | bi[nx] | 
| 1312 |  |  |  |  |  |  | bz2 | 
| 1313 |  |  |  |  |  |  | cab | 
| 1314 |  |  |  |  |  |  | cfd | 
| 1315 |  |  |  |  |  |  | class | 
| 1316 |  |  |  |  |  |  | com | 
| 1317 |  |  |  |  |  |  | cru | 
| 1318 |  |  |  |  |  |  | cpio | 
| 1319 |  |  |  |  |  |  | cpt | 
| 1320 |  |  |  |  |  |  | dcf | 
| 1321 |  |  |  |  |  |  | ddi | 
| 1322 |  |  |  |  |  |  | dpa | 
| 1323 |  |  |  |  |  |  | dsk | 
| 1324 |  |  |  |  |  |  | dup | 
| 1325 |  |  |  |  |  |  | dwc | 
| 1326 |  |  |  |  |  |  | eli | 
| 1327 |  |  |  |  |  |  | enc | 
| 1328 |  |  |  |  |  |  | esp | 
| 1329 |  |  |  |  |  |  | exe | 
| 1330 |  |  |  |  |  |  | f | 
| 1331 |  |  |  |  |  |  | ha(?:p|) | 
| 1332 |  |  |  |  |  |  | hex | 
| 1333 |  |  |  |  |  |  | hp[ak] | 
| 1334 |  |  |  |  |  |  | hqx | 
| 1335 |  |  |  |  |  |  | hyp | 
| 1336 |  |  |  |  |  |  | ice | 
| 1337 |  |  |  |  |  |  | im[gp] | 
| 1338 |  |  |  |  |  |  | is[co] | 
| 1339 |  |  |  |  |  |  | jar | 
| 1340 |  |  |  |  |  |  | jrc | 
| 1341 |  |  |  |  |  |  | lbr | 
| 1342 |  |  |  |  |  |  | lha | 
| 1343 |  |  |  |  |  |  | lz[ahwx] | 
| 1344 |  |  |  |  |  |  | mar | 
| 1345 |  |  |  |  |  |  | mime | 
| 1346 |  |  |  |  |  |  | pak | 
| 1347 |  |  |  |  |  |  | pk3 | 
| 1348 |  |  |  |  |  |  | pz[hk] | 
| 1349 |  |  |  |  |  |  | q | 
| 1350 |  |  |  |  |  |  | qfc | 
| 1351 |  |  |  |  |  |  | saif | 
| 1352 |  |  |  |  |  |  | sar | 
| 1353 |  |  |  |  |  |  | sbx | 
| 1354 |  |  |  |  |  |  | sdn | 
| 1355 |  |  |  |  |  |  | sea | 
| 1356 |  |  |  |  |  |  | shar | 
| 1357 |  |  |  |  |  |  | sit | 
| 1358 |  |  |  |  |  |  | sqz | 
| 1359 |  |  |  |  |  |  | td0 | 
| 1360 |  |  |  |  |  |  | uc2 | 
| 1361 |  |  |  |  |  |  | ufa | 
| 1362 |  |  |  |  |  |  | uu(?:u|) | 
| 1363 |  |  |  |  |  |  | xxe | 
| 1364 |  |  |  |  |  |  | zoo | 
| 1365 |  |  |  |  |  |  | DATA | 
| 1366 |  |  |  |  |  |  | } | 
| 1367 |  |  |  |  |  |  | sub getpattern_postfix_media { | 
| 1368 | 21 |  |  | 21 | 0 | 72 | return shift->parse_patterns(<<'DATA'); | 
| 1369 |  |  |  |  |  |  | avi | 
| 1370 |  |  |  |  |  |  | as[fx] | 
| 1371 |  |  |  |  |  |  | r(?:m|am|a) | 
| 1372 |  |  |  |  |  |  | mov(?:ie|) | 
| 1373 |  |  |  |  |  |  | mp(?:\d|eg|ga|g) | 
| 1374 |  |  |  |  |  |  | wma | 
| 1375 |  |  |  |  |  |  | wav | 
| 1376 |  |  |  |  |  |  | 3ds | 
| 1377 |  |  |  |  |  |  | aif(?:[cf]|) | 
| 1378 |  |  |  |  |  |  | au | 
| 1379 |  |  |  |  |  |  | cd(?:a|) | 
| 1380 |  |  |  |  |  |  | code | 
| 1381 |  |  |  |  |  |  | d[cix]r | 
| 1382 |  |  |  |  |  |  | fl[cit] | 
| 1383 |  |  |  |  |  |  | fon | 
| 1384 |  |  |  |  |  |  | kar | 
| 1385 |  |  |  |  |  |  | m3u | 
| 1386 |  |  |  |  |  |  | mid(?:i|) | 
| 1387 |  |  |  |  |  |  | qt | 
| 1388 |  |  |  |  |  |  | r[fp] | 
| 1389 |  |  |  |  |  |  | scr | 
| 1390 |  |  |  |  |  |  | snd | 
| 1391 |  |  |  |  |  |  | spl | 
| 1392 |  |  |  |  |  |  | swf | 
| 1393 |  |  |  |  |  |  | tt[cf] | 
| 1394 |  |  |  |  |  |  | DATA | 
| 1395 |  |  |  |  |  |  | } | 
| 1396 |  |  |  |  |  |  | sub getpattern_postfix_img { | 
| 1397 | 21 |  |  | 21 | 0 | 54 | return shift->parse_patterns(<<'DATA'); | 
| 1398 |  |  |  |  |  |  | gif | 
| 1399 |  |  |  |  |  |  | jp(?:eg|e|g) | 
| 1400 |  |  |  |  |  |  | png | 
| 1401 |  |  |  |  |  |  | ani | 
| 1402 |  |  |  |  |  |  | ai | 
| 1403 |  |  |  |  |  |  | ais | 
| 1404 |  |  |  |  |  |  | art | 
| 1405 |  |  |  |  |  |  | bmp | 
| 1406 |  |  |  |  |  |  | bw | 
| 1407 |  |  |  |  |  |  | ddf | 
| 1408 |  |  |  |  |  |  | dib | 
| 1409 |  |  |  |  |  |  | col | 
| 1410 |  |  |  |  |  |  | crw | 
| 1411 |  |  |  |  |  |  | cur | 
| 1412 |  |  |  |  |  |  | dcx | 
| 1413 |  |  |  |  |  |  | djv(?:u|) | 
| 1414 |  |  |  |  |  |  | dwg | 
| 1415 |  |  |  |  |  |  | dxf | 
| 1416 |  |  |  |  |  |  | emf | 
| 1417 |  |  |  |  |  |  | fpx | 
| 1418 |  |  |  |  |  |  | ic[lno] | 
| 1419 |  |  |  |  |  |  | ief | 
| 1420 |  |  |  |  |  |  | iff | 
| 1421 |  |  |  |  |  |  | ilbm | 
| 1422 |  |  |  |  |  |  | int(?:a|) | 
| 1423 |  |  |  |  |  |  | iw4 | 
| 1424 |  |  |  |  |  |  | jfif | 
| 1425 |  |  |  |  |  |  | kdc | 
| 1426 |  |  |  |  |  |  | lbm | 
| 1427 |  |  |  |  |  |  | mag | 
| 1428 |  |  |  |  |  |  | pc[dxt] | 
| 1429 |  |  |  |  |  |  | pic(?:t|) | 
| 1430 |  |  |  |  |  |  | pix | 
| 1431 |  |  |  |  |  |  | p[nbgp]m | 
| 1432 |  |  |  |  |  |  | pntg | 
| 1433 |  |  |  |  |  |  | ps[dp] | 
| 1434 |  |  |  |  |  |  | qtif | 
| 1435 |  |  |  |  |  |  | ras | 
| 1436 |  |  |  |  |  |  | rgb(?:a|) | 
| 1437 |  |  |  |  |  |  | rle | 
| 1438 |  |  |  |  |  |  | rsb | 
| 1439 |  |  |  |  |  |  | sgi | 
| 1440 |  |  |  |  |  |  | sid | 
| 1441 |  |  |  |  |  |  | svg | 
| 1442 |  |  |  |  |  |  | targa | 
| 1443 |  |  |  |  |  |  | tga | 
| 1444 |  |  |  |  |  |  | thm | 
| 1445 |  |  |  |  |  |  | tif(?:f|) | 
| 1446 |  |  |  |  |  |  | yuv | 
| 1447 |  |  |  |  |  |  | wbmp | 
| 1448 |  |  |  |  |  |  | wmf | 
| 1449 |  |  |  |  |  |  | x[bp]m | 
| 1450 |  |  |  |  |  |  | xif | 
| 1451 |  |  |  |  |  |  | xwd | 
| 1452 |  |  |  |  |  |  | DATA | 
| 1453 |  |  |  |  |  |  | } | 
| 1454 |  |  |  |  |  |  | sub getpattern_postfix_free { | 
| 1455 | 21 |  |  | 21 | 0 | 57 | return shift->parse_patterns(<<'DATA'); | 
| 1456 |  |  |  |  |  |  | htm(?:l|) | 
| 1457 |  |  |  |  |  |  | cgi | 
| 1458 |  |  |  |  |  |  | jsp | 
| 1459 |  |  |  |  |  |  | asp(?:x|) | 
| 1460 |  |  |  |  |  |  | php(?:\d|) | 
| 1461 |  |  |  |  |  |  | cfm | 
| 1462 |  |  |  |  |  |  | phtml | 
| 1463 |  |  |  |  |  |  | pl | 
| 1464 |  |  |  |  |  |  | nph | 
| 1465 |  |  |  |  |  |  | fcgi | 
| 1466 |  |  |  |  |  |  | ht[ac] | 
| 1467 |  |  |  |  |  |  | DATA | 
| 1468 |  |  |  |  |  |  | } | 
| 1469 |  |  |  |  |  |  | sub contenttype_init { | 
| 1470 | 21 |  |  | 21 | 0 | 5091 | shift->{Content_Type} = { | 
| 1471 |  |  |  |  |  |  | 'text/plain'					=> 'txt', | 
| 1472 |  |  |  |  |  |  | 'text/html'						=> 'txt', | 
| 1473 |  |  |  |  |  |  | 'image/jpeg'					=> 'jpg', | 
| 1474 |  |  |  |  |  |  | 'image/png'						=> 'png', | 
| 1475 |  |  |  |  |  |  | 'image/gif'						=> 'gif', | 
| 1476 |  |  |  |  |  |  | 'application/ami'				=> 'ami', | 
| 1477 |  |  |  |  |  |  | 'application/caa'				=> 'caa', | 
| 1478 |  |  |  |  |  |  | 'application/caj'				=> 'caj', | 
| 1479 |  |  |  |  |  |  | 'application/cas'				=> 'cas', | 
| 1480 |  |  |  |  |  |  | 'application/cdf'				=> 'cdf', | 
| 1481 |  |  |  |  |  |  | 'application/andrew-inset'		=> 'ez', | 
| 1482 |  |  |  |  |  |  | 'application/fractals'			=> 'fif', | 
| 1483 |  |  |  |  |  |  | 'application/futuresplash'		=> 'spl', | 
| 1484 |  |  |  |  |  |  | 'application/kdh'				=> 'kdh', | 
| 1485 |  |  |  |  |  |  | 'application/mac-binhex40'		=> 'hqx', | 
| 1486 |  |  |  |  |  |  | 'application/mac-compactpro'	=> 'cpt', | 
| 1487 |  |  |  |  |  |  | 'application/msaccess'			=> 'mdb', | 
| 1488 |  |  |  |  |  |  | 'application/msword'			=> 'doc', | 
| 1489 |  |  |  |  |  |  | 'application/nh'				=> 'nh', | 
| 1490 |  |  |  |  |  |  | 'application/octet-stream'		=> 'exe', | 
| 1491 |  |  |  |  |  |  | 'application/oda'				=> 'oda', | 
| 1492 |  |  |  |  |  |  | 'application/pdf'				=> 'pdf', | 
| 1493 |  |  |  |  |  |  | 'application/pkcs10'			=> 'p10', | 
| 1494 |  |  |  |  |  |  | 'application/pkcs7-mime'		=> 'p7m', | 
| 1495 |  |  |  |  |  |  | 'application/pkcs7-signature'	=> 'p7s', | 
| 1496 |  |  |  |  |  |  | 'application/pkix-cert'			=> 'cer', | 
| 1497 |  |  |  |  |  |  | 'application/pkix-crl'			=> 'crl', | 
| 1498 |  |  |  |  |  |  | 'application/postscript'		=> 'ps', | 
| 1499 |  |  |  |  |  |  | 'application/rat-file'			=> 'rat', | 
| 1500 |  |  |  |  |  |  | 'application/sdp'				=> 'sdp', | 
| 1501 |  |  |  |  |  |  | 'application/set-payment-initiation'		=> 'setpay', | 
| 1502 |  |  |  |  |  |  | 'application/set-registration-initiation'	=> 'setreg', | 
| 1503 |  |  |  |  |  |  | 'application/smil'				=> 'smil', | 
| 1504 |  |  |  |  |  |  | 'application/streamingmedia'	=> 'ssm', | 
| 1505 |  |  |  |  |  |  | 'application/vnd.adobe.xfdf'	=> 'xfdf', | 
| 1506 |  |  |  |  |  |  | 'application/vnd.fdf'			=> 'fdf', | 
| 1507 |  |  |  |  |  |  | 'application/vnd.mif'			=> 'mif', | 
| 1508 |  |  |  |  |  |  | 'application/vnd.ms-excel'		=> 'xls', | 
| 1509 |  |  |  |  |  |  | 'application/vnd.ms-mediapackage'			=> 'mpf', | 
| 1510 |  |  |  |  |  |  | 'application/vnd.ms-pki.certstore'			=> 'sst', | 
| 1511 |  |  |  |  |  |  | 'application/vnd.ms-pki.pko'	=> 'pko', | 
| 1512 |  |  |  |  |  |  | 'application/vnd.ms-pki.seccat'	=> 'cat', | 
| 1513 |  |  |  |  |  |  | 'application/vnd.ms-pki.stl'	=> 'stl', | 
| 1514 |  |  |  |  |  |  | 'application/vnd.ms-powerpoint'	=> 'ppt', | 
| 1515 |  |  |  |  |  |  | 'application/vnd.ms-project'	=> 'mpp', | 
| 1516 |  |  |  |  |  |  | 'application/vnd.ms-wpl'		=> 'wpl', | 
| 1517 |  |  |  |  |  |  | 'application/vnd.rn-realmedia'	=> 'rm', | 
| 1518 |  |  |  |  |  |  | 'application/vnd.rn-realmedia-vbr'			=> 'rmvb', | 
| 1519 |  |  |  |  |  |  | 'application/vnd.rn-realplayer'	=> 'rnx', | 
| 1520 |  |  |  |  |  |  | 'application/vnd.rn-realsystem-rjs'			=> 'rjs', | 
| 1521 |  |  |  |  |  |  | 'application/vnd.rn-realsystem-rmx'			=> 'rmx', | 
| 1522 |  |  |  |  |  |  | 'application/vnd.rn-rn_music_package'		=> 'rmp', | 
| 1523 |  |  |  |  |  |  | 'application/vnd.rn-rsml'		=> 'rsml', | 
| 1524 |  |  |  |  |  |  | 'application/vnd.visio'			=> 'vsd', | 
| 1525 |  |  |  |  |  |  | 'application/vnd.wap.wbxml'		=> 'wbxml', | 
| 1526 |  |  |  |  |  |  | 'application/vnd.wap.wmlc'		=> 'wmlc', | 
| 1527 |  |  |  |  |  |  | 'application/vnd.wap.wmlscriptc'			=> 'wmlsc', | 
| 1528 |  |  |  |  |  |  | 'application/xhtml+xml'			=> 'xhtml', | 
| 1529 |  |  |  |  |  |  | 'application/xml'				=> 'xml', | 
| 1530 |  |  |  |  |  |  | 'application/xml-dtd'			=> 'dtd', | 
| 1531 |  |  |  |  |  |  | 'application/x-ami'				=> 'ami', | 
| 1532 |  |  |  |  |  |  | 'application/x-bcpio'			=> 'bcpio', | 
| 1533 |  |  |  |  |  |  | 'application/x-ccf'				=> 'ccf', | 
| 1534 |  |  |  |  |  |  | 'application/x-cdf'				=> 'cdf', | 
| 1535 |  |  |  |  |  |  | 'application/x-cdlink'			=> 'vcd', | 
| 1536 |  |  |  |  |  |  | 'application/x-ceb'				=> 'ceb', | 
| 1537 |  |  |  |  |  |  | 'application/x-cef'				=> 'cef', | 
| 1538 |  |  |  |  |  |  | 'application/x-chess-pgn'		=> 'png', | 
| 1539 |  |  |  |  |  |  | 'application/x-compress'		=> 'z', | 
| 1540 |  |  |  |  |  |  | 'application/x-compressed'		=> 'tgz', | 
| 1541 |  |  |  |  |  |  | 'application/x-cpio'			=> 'cpio', | 
| 1542 |  |  |  |  |  |  | 'application/x-csh'				=> 'csh', | 
| 1543 |  |  |  |  |  |  | 'application/x-director'		=> 'dir', | 
| 1544 |  |  |  |  |  |  | 'application/x-dvi'				=> 'dvi', | 
| 1545 |  |  |  |  |  |  | 'application/x-futuresplash'	=> 'spl', | 
| 1546 |  |  |  |  |  |  | 'application/x-gtar'			=> 'gtar', | 
| 1547 |  |  |  |  |  |  | 'application/x-gzip'			=> 'gz', | 
| 1548 |  |  |  |  |  |  | 'application/x-hdf'				=> 'hdf', | 
| 1549 |  |  |  |  |  |  | 'application/x-internet-signup'	=> 'ins', | 
| 1550 |  |  |  |  |  |  | 'application/x-iphone'			=> 'iii', | 
| 1551 |  |  |  |  |  |  | 'application/x-javascript'		=> 'js', | 
| 1552 |  |  |  |  |  |  | 'application/x-java-jnlp-file'	=> 'jnlp', | 
| 1553 |  |  |  |  |  |  | 'application/x-koan'			=> 'skp', | 
| 1554 |  |  |  |  |  |  | 'application/x-latex'			=> 'latex', | 
| 1555 |  |  |  |  |  |  | 'application/x-netcdf'			=> 'cdf', | 
| 1556 |  |  |  |  |  |  | 'application/x-mix-transfer'	=> 'nix', | 
| 1557 |  |  |  |  |  |  | 'application/x-msdownload'		=> 'exe', | 
| 1558 |  |  |  |  |  |  | 'application/x-mplayer2'		=> 'asx', | 
| 1559 |  |  |  |  |  |  | 'application/x-msexcel'			=> 'xls', | 
| 1560 |  |  |  |  |  |  | 'application/x-mspowerpoint'	=> 'ppt', | 
| 1561 |  |  |  |  |  |  | 'application/x-ms-wmd'			=> 'wmd', | 
| 1562 |  |  |  |  |  |  | 'application/x-ms-wms'			=> 'wms', | 
| 1563 |  |  |  |  |  |  | 'application/x-ms-wmz'			=> 'wmz', | 
| 1564 |  |  |  |  |  |  | 'application/x-pkcs12'			=> 'p12', | 
| 1565 |  |  |  |  |  |  | 'application/x-pkcs7-certificates'			=> 'p7b', | 
| 1566 |  |  |  |  |  |  | 'application/x-pkcs7-certreqresp'			=> 'p7r', | 
| 1567 |  |  |  |  |  |  | 'application/x-quicktimeplayer'	=> 'qtl', | 
| 1568 |  |  |  |  |  |  | 'application/x-rtsp'			=> 'rtsp', | 
| 1569 |  |  |  |  |  |  | 'application/x-sdp'				=> 'sdp', | 
| 1570 |  |  |  |  |  |  | 'application/x-sh'				=> 'sh', | 
| 1571 |  |  |  |  |  |  | 'application/x-shar'			=> 'shar', | 
| 1572 |  |  |  |  |  |  | 'application/x-shockwave-flash'	=> 'swf', | 
| 1573 |  |  |  |  |  |  | 'application/x-stuffit'			=> 'sit', | 
| 1574 |  |  |  |  |  |  | 'application/x-sv4cpio'			=> 'sv4cpio', | 
| 1575 |  |  |  |  |  |  | 'application/x-sv4crc'			=> 'sv4crc', | 
| 1576 |  |  |  |  |  |  | 'application/x-tar'				=> 'tar', | 
| 1577 |  |  |  |  |  |  | 'application/x-tcl'				=> 'tcl', | 
| 1578 |  |  |  |  |  |  | 'application/x-tex'				=> 'tex', | 
| 1579 |  |  |  |  |  |  | 'application/x-texinfo'			=> 'texinfo', | 
| 1580 |  |  |  |  |  |  | 'application/x-troff'			=> 'tr', | 
| 1581 |  |  |  |  |  |  | 'application/x-troff-man'		=> 'man', | 
| 1582 |  |  |  |  |  |  | 'application/x-troff-me'		=> 'me', | 
| 1583 |  |  |  |  |  |  | 'application/x-troff-ms'		=> 'ms', | 
| 1584 |  |  |  |  |  |  | 'application/x-ustar'			=> 'ustar', | 
| 1585 |  |  |  |  |  |  | 'application/x-wais-source'		=> 'src', | 
| 1586 |  |  |  |  |  |  | 'application/x-x509-ca-cert'	=> 'cer', | 
| 1587 |  |  |  |  |  |  | 'application/x-zip-compressed'	=> 'zip', | 
| 1588 |  |  |  |  |  |  | 'application/zip'				=> 'zip', | 
| 1589 |  |  |  |  |  |  | 'audio/aiff'					=> 'aiff', | 
| 1590 |  |  |  |  |  |  | 'audio/basic'					=> 'au', | 
| 1591 |  |  |  |  |  |  | 'audio/mid'						=> 'mid', | 
| 1592 |  |  |  |  |  |  | 'audio/midi'					=> 'mid', | 
| 1593 |  |  |  |  |  |  | 'audio/mp3'						=> 'mp3', | 
| 1594 |  |  |  |  |  |  | 'audio/mp4'						=> 'mp4', | 
| 1595 |  |  |  |  |  |  | 'audio/mpeg'					=> 'mp3', | 
| 1596 |  |  |  |  |  |  | 'audio/mpegurl'					=> 'm3u', | 
| 1597 |  |  |  |  |  |  | 'audio/mpg'						=> 'mp3', | 
| 1598 |  |  |  |  |  |  | 'audio/vnd.qcelp'				=> 'qcp', | 
| 1599 |  |  |  |  |  |  | 'audio/vnd.rn-realaudio'		=> 'ra', | 
| 1600 |  |  |  |  |  |  | 'audio/wav'						=> 'wav', | 
| 1601 |  |  |  |  |  |  | 'audio/x-aiff'					=> 'aiff', | 
| 1602 |  |  |  |  |  |  | 'audio/x-gsm'					=> 'gsm', | 
| 1603 |  |  |  |  |  |  | 'audio/x-mid'					=> 'mid', | 
| 1604 |  |  |  |  |  |  | 'audio/x-midi'					=> 'mid', | 
| 1605 |  |  |  |  |  |  | 'audio/x-mp3'					=> 'mp3', | 
| 1606 |  |  |  |  |  |  | 'audio/x-mpeg'					=> 'mp3', | 
| 1607 |  |  |  |  |  |  | 'audio/x-mpegurl'				=> 'm3u', | 
| 1608 |  |  |  |  |  |  | 'audio/x-mpg'					=> 'mp3', | 
| 1609 |  |  |  |  |  |  | 'audio/x-ms-wax'				=> 'wax', | 
| 1610 |  |  |  |  |  |  | 'audio/x-ms-wma'				=> 'wma', | 
| 1611 |  |  |  |  |  |  | 'audio/x-pn-realaudio'			=> 'ram', | 
| 1612 |  |  |  |  |  |  | 'audio/x-realaudio'				=> 'ra', | 
| 1613 |  |  |  |  |  |  | 'audio/x-wav'					=> 'wav', | 
| 1614 |  |  |  |  |  |  | 'chemical/x-pdb'				=> 'pdb', | 
| 1615 |  |  |  |  |  |  | 'chemical/x-xyz'				=> 'xyz', | 
| 1616 |  |  |  |  |  |  | 'image/bmp'						=> 'bmp', | 
| 1617 |  |  |  |  |  |  | 'image/ief'						=> 'ief', | 
| 1618 |  |  |  |  |  |  | 'image/pict'					=> 'pict', | 
| 1619 |  |  |  |  |  |  | 'image/pjpeg'					=> 'jpg', | 
| 1620 |  |  |  |  |  |  | 'image/svg'						=> 'svg', | 
| 1621 |  |  |  |  |  |  | 'image/svg+xml'					=> 'svg', | 
| 1622 |  |  |  |  |  |  | 'image/svg-xml'					=> 'svg', | 
| 1623 |  |  |  |  |  |  | 'image/tiff'					=> 'tif', | 
| 1624 |  |  |  |  |  |  | 'image/vnd.djvu'				=> 'djvu', | 
| 1625 |  |  |  |  |  |  | 'image/vnd.dwg'					=> 'dwg', | 
| 1626 |  |  |  |  |  |  | 'image/vnd.dxf'					=> 'dxf', | 
| 1627 |  |  |  |  |  |  | 'image/vnd.rn-realflash'		=> 'rf', | 
| 1628 |  |  |  |  |  |  | 'image/vnd.rn-realpix'			=> 'rp', | 
| 1629 |  |  |  |  |  |  | 'image/vnd.wap.wbmp'			=> 'wbmp', | 
| 1630 |  |  |  |  |  |  | 'image/xbm'						=> 'xbm', | 
| 1631 |  |  |  |  |  |  | 'image/x-cmu-raster'			=> 'ras', | 
| 1632 |  |  |  |  |  |  | 'image/x-icon'					=> 'ico', | 
| 1633 |  |  |  |  |  |  | 'image/x-macpaint'				=> 'pntg', | 
| 1634 |  |  |  |  |  |  | 'image/x-pict'					=> 'pict', | 
| 1635 |  |  |  |  |  |  | 'image/x-png'					=> 'png', | 
| 1636 |  |  |  |  |  |  | 'image/x-portable-anymap'		=> 'pnm', | 
| 1637 |  |  |  |  |  |  | 'image/x-portable-bitmap'		=> 'pbm', | 
| 1638 |  |  |  |  |  |  | 'image/x-portable-graymap'		=> 'pgm', | 
| 1639 |  |  |  |  |  |  | 'image/x-portable-pixmap'		=> 'ppm', | 
| 1640 |  |  |  |  |  |  | 'image/x-quicktime'				=> 'qtif', | 
| 1641 |  |  |  |  |  |  | 'image/x-rgb'					=> 'rgb', | 
| 1642 |  |  |  |  |  |  | 'image/x-sgi'					=> 'sgi', | 
| 1643 |  |  |  |  |  |  | 'image/x-targa'					=> 'targa', | 
| 1644 |  |  |  |  |  |  | 'image/x-tiff'					=> 'tif', | 
| 1645 |  |  |  |  |  |  | 'image/x-xbitmap'				=> 'xbm', | 
| 1646 |  |  |  |  |  |  | 'image/x-xpixmap'				=> 'xpm', | 
| 1647 |  |  |  |  |  |  | 'image/x-xwindowdump'			=> 'xwd', | 
| 1648 |  |  |  |  |  |  | 'interface/x-winamp3-skin'		=> 'wal', | 
| 1649 |  |  |  |  |  |  | 'interface/x-winamp-skin'		=> 'wal', | 
| 1650 |  |  |  |  |  |  | 'midi/mid'						=> 'mid', | 
| 1651 |  |  |  |  |  |  | 'model/iges'					=> 'iges', | 
| 1652 |  |  |  |  |  |  | 'model/mesh'					=> 'mesh', | 
| 1653 |  |  |  |  |  |  | 'model/vnd.dwf'					=> 'dwf', | 
| 1654 |  |  |  |  |  |  | 'model/vrml'					=> 'vrml', | 
| 1655 |  |  |  |  |  |  | 'text/calendar'					=> 'ics', | 
| 1656 |  |  |  |  |  |  | 'text/css'						=> 'css', | 
| 1657 |  |  |  |  |  |  | 'text/h323'						=> '323', | 
| 1658 |  |  |  |  |  |  | 'text/iuls'						=> 'uls', | 
| 1659 |  |  |  |  |  |  | 'text/richtext'					=> 'rtx', | 
| 1660 |  |  |  |  |  |  | 'text/rtf'						=> 'rtf', | 
| 1661 |  |  |  |  |  |  | 'text/sgml'						=> 'sgml', | 
| 1662 |  |  |  |  |  |  | 'text/tab-separated-values'		=> 'tsv', | 
| 1663 |  |  |  |  |  |  | 'text/scriptlet'				=> 'wsc', | 
| 1664 |  |  |  |  |  |  | 'text/vnd.rn-realtext'			=> 'rt', | 
| 1665 |  |  |  |  |  |  | 'text/vnd.wap.wml'				=> 'wml', | 
| 1666 |  |  |  |  |  |  | 'text/vnd.wap.wmlscript'		=> 'wmls', | 
| 1667 |  |  |  |  |  |  | 'text/xml'						=> 'xml', | 
| 1668 |  |  |  |  |  |  | 'text/x-ms-iqy'					=> 'iqy', | 
| 1669 |  |  |  |  |  |  | 'text/x-ms-odc'					=> 'odc', | 
| 1670 |  |  |  |  |  |  | 'text/x-ms-rqy'					=> 'rqy', | 
| 1671 |  |  |  |  |  |  | 'text/x-setext'					=> 'etx', | 
| 1672 |  |  |  |  |  |  | 'text/x-vcard'					=> 'vcf', | 
| 1673 |  |  |  |  |  |  | 'video/avi'						=> 'avi', | 
| 1674 |  |  |  |  |  |  | 'video/flc'						=> 'flc', | 
| 1675 |  |  |  |  |  |  | 'video/mp4'						=> 'mp4', | 
| 1676 |  |  |  |  |  |  | 'video/mpeg'					=> 'mpg', | 
| 1677 |  |  |  |  |  |  | 'video/mpg'						=> 'mpg', | 
| 1678 |  |  |  |  |  |  | 'video/msvideo'					=> 'avi', | 
| 1679 |  |  |  |  |  |  | 'video/quicktime'				=> 'mov', | 
| 1680 |  |  |  |  |  |  | 'video/vnd.mpegurl'				=> 'mxu', | 
| 1681 |  |  |  |  |  |  | 'video/vnd.rn-realvideo'		=> 'rv', | 
| 1682 |  |  |  |  |  |  | 'video/x-ivf'					=> 'ivf', | 
| 1683 |  |  |  |  |  |  | 'video/x-mpeg'					=> 'mpg', | 
| 1684 |  |  |  |  |  |  | 'video/x-mpeg2a'				=> 'mpg', | 
| 1685 |  |  |  |  |  |  | 'video/x-ms-asf'				=> 'asf', | 
| 1686 |  |  |  |  |  |  | 'video/x-ms-asf-plugin'			=> 'asx', | 
| 1687 |  |  |  |  |  |  | 'video/x-msvideo'				=> 'avi', | 
| 1688 |  |  |  |  |  |  | 'video/x-ms-wm'					=> 'wm', | 
| 1689 |  |  |  |  |  |  | 'video/x-ms-wmp'				=> 'wmp', | 
| 1690 |  |  |  |  |  |  | 'video/x-ms-wmv'				=> 'wmv', | 
| 1691 |  |  |  |  |  |  | 'video/x-ms-wmx'				=> 'wmx', | 
| 1692 |  |  |  |  |  |  | 'video/x-ms-wvx'				=> 'wvx', | 
| 1693 |  |  |  |  |  |  | 'video/x-sgi-movie'				=> 'movie', | 
| 1694 |  |  |  |  |  |  | 'x-conference/x-cooltalk'		=> 'ice', | 
| 1695 |  |  |  |  |  |  | }; | 
| 1696 |  |  |  |  |  |  | } | 
| 1697 |  |  |  |  |  |  | %entity2char = (	# copied from HTML::Entities | 
| 1698 |  |  |  |  |  |  | # Some normal chars that have special meaning in SGML context | 
| 1699 |  |  |  |  |  |  | amp    => '&',  # ampersand | 
| 1700 |  |  |  |  |  |  | 'gt'    => '>',  # greater than | 
| 1701 |  |  |  |  |  |  | 'lt'    => '<',  # less than | 
| 1702 |  |  |  |  |  |  | quot   => '"',  # double quote | 
| 1703 |  |  |  |  |  |  | apos   => "'",  # single quote | 
| 1704 |  |  |  |  |  |  |  | 
| 1705 |  |  |  |  |  |  | # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML | 
| 1706 |  |  |  |  |  |  | AElig	=> 'Æ',  # capital AE diphthong (ligature) | 
| 1707 |  |  |  |  |  |  | Aacute	=> 'Á',  # capital A, acute accent | 
| 1708 |  |  |  |  |  |  | Acirc	=> 'Â',  # capital A, circumflex accent | 
| 1709 |  |  |  |  |  |  | Agrave	=> 'À',  # capital A, grave accent | 
| 1710 |  |  |  |  |  |  | Aring	=> 'Å',  # capital A, ring | 
| 1711 |  |  |  |  |  |  | Atilde	=> 'Ã',  # capital A, tilde | 
| 1712 |  |  |  |  |  |  | Auml	=> 'Ä',  # capital A, dieresis or umlaut mark | 
| 1713 |  |  |  |  |  |  | Ccedil	=> 'Ç',  # capital C, cedilla | 
| 1714 |  |  |  |  |  |  | ETH	=> 'Ð',  # capital Eth, Icelandic | 
| 1715 |  |  |  |  |  |  | Eacute	=> 'É',  # capital E, acute accent | 
| 1716 |  |  |  |  |  |  | Ecirc	=> 'Ê',  # capital E, circumflex accent | 
| 1717 |  |  |  |  |  |  | Egrave	=> 'È',  # capital E, grave accent | 
| 1718 |  |  |  |  |  |  | Euml	=> 'Ë',  # capital E, dieresis or umlaut mark | 
| 1719 |  |  |  |  |  |  | Iacute	=> 'Í',  # capital I, acute accent | 
| 1720 |  |  |  |  |  |  | Icirc	=> 'Î',  # capital I, circumflex accent | 
| 1721 |  |  |  |  |  |  | Igrave	=> 'Ì',  # capital I, grave accent | 
| 1722 |  |  |  |  |  |  | Iuml	=> 'Ï',  # capital I, dieresis or umlaut mark | 
| 1723 |  |  |  |  |  |  | Ntilde	=> 'Ñ',  # capital N, tilde | 
| 1724 |  |  |  |  |  |  | Oacute	=> 'Ó',  # capital O, acute accent | 
| 1725 |  |  |  |  |  |  | Ocirc	=> 'Ô',  # capital O, circumflex accent | 
| 1726 |  |  |  |  |  |  | Ograve	=> 'Ò',  # capital O, grave accent | 
| 1727 |  |  |  |  |  |  | Oslash	=> 'Ø',  # capital O, slash | 
| 1728 |  |  |  |  |  |  | Otilde	=> 'Õ',  # capital O, tilde | 
| 1729 |  |  |  |  |  |  | Ouml	=> 'Ö',  # capital O, dieresis or umlaut mark | 
| 1730 |  |  |  |  |  |  | THORN	=> 'Þ',  # capital THORN, Icelandic | 
| 1731 |  |  |  |  |  |  | Uacute	=> 'Ú',  # capital U, acute accent | 
| 1732 |  |  |  |  |  |  | Ucirc	=> 'Û',  # capital U, circumflex accent | 
| 1733 |  |  |  |  |  |  | Ugrave	=> 'Ù',  # capital U, grave accent | 
| 1734 |  |  |  |  |  |  | Uuml	=> 'Ü',  # capital U, dieresis or umlaut mark | 
| 1735 |  |  |  |  |  |  | Yacute	=> 'Ý',  # capital Y, acute accent | 
| 1736 |  |  |  |  |  |  | aacute	=> 'á',  # small a, acute accent | 
| 1737 |  |  |  |  |  |  | acirc	=> 'â',  # small a, circumflex accent | 
| 1738 |  |  |  |  |  |  | aelig	=> 'æ',  # small ae diphthong (ligature) | 
| 1739 |  |  |  |  |  |  | agrave	=> 'à',  # small a, grave accent | 
| 1740 |  |  |  |  |  |  | aring	=> 'å',  # small a, ring | 
| 1741 |  |  |  |  |  |  | atilde	=> 'ã',  # small a, tilde | 
| 1742 |  |  |  |  |  |  | auml	=> 'ä',  # small a, dieresis or umlaut mark | 
| 1743 |  |  |  |  |  |  | ccedil	=> 'ç',  # small c, cedilla | 
| 1744 |  |  |  |  |  |  | eacute	=> 'é',  # small e, acute accent | 
| 1745 |  |  |  |  |  |  | ecirc	=> 'ê',  # small e, circumflex accent | 
| 1746 |  |  |  |  |  |  | egrave	=> 'è',  # small e, grave accent | 
| 1747 |  |  |  |  |  |  | eth	=> 'ð',  # small eth, Icelandic | 
| 1748 |  |  |  |  |  |  | euml	=> 'ë',  # small e, dieresis or umlaut mark | 
| 1749 |  |  |  |  |  |  | iacute	=> 'í',  # small i, acute accent | 
| 1750 |  |  |  |  |  |  | icirc	=> 'î',  # small i, circumflex accent | 
| 1751 |  |  |  |  |  |  | igrave	=> 'ì',  # small i, grave accent | 
| 1752 |  |  |  |  |  |  | iuml	=> 'ï',  # small i, dieresis or umlaut mark | 
| 1753 |  |  |  |  |  |  | ntilde	=> 'ñ',  # small n, tilde | 
| 1754 |  |  |  |  |  |  | oacute	=> 'ó',  # small o, acute accent | 
| 1755 |  |  |  |  |  |  | ocirc	=> 'ô',  # small o, circumflex accent | 
| 1756 |  |  |  |  |  |  | ograve	=> 'ò',  # small o, grave accent | 
| 1757 |  |  |  |  |  |  | oslash	=> 'ø',  # small o, slash | 
| 1758 |  |  |  |  |  |  | otilde	=> 'õ',  # small o, tilde | 
| 1759 |  |  |  |  |  |  | ouml	=> 'ö',  # small o, dieresis or umlaut mark | 
| 1760 |  |  |  |  |  |  | szlig	=> 'ß',  # small sharp s, German (sz ligature) | 
| 1761 |  |  |  |  |  |  | thorn	=> 'þ',  # small thorn, Icelandic | 
| 1762 |  |  |  |  |  |  | uacute	=> 'ú',  # small u, acute accent | 
| 1763 |  |  |  |  |  |  | ucirc	=> 'û',  # small u, circumflex accent | 
| 1764 |  |  |  |  |  |  | ugrave	=> 'ù',  # small u, grave accent | 
| 1765 |  |  |  |  |  |  | uuml	=> 'ü',  # small u, dieresis or umlaut mark | 
| 1766 |  |  |  |  |  |  | yacute	=> 'ý',  # small y, acute accent | 
| 1767 |  |  |  |  |  |  | yuml	=> 'ÿ',  # small y, dieresis or umlaut mark | 
| 1768 |  |  |  |  |  |  |  | 
| 1769 |  |  |  |  |  |  | # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) | 
| 1770 |  |  |  |  |  |  | copy   => '©',  # copyright sign | 
| 1771 |  |  |  |  |  |  | reg    => '®',  # registered sign | 
| 1772 |  |  |  |  |  |  | nbsp   => " ", # non breaking space | 
| 1773 |  |  |  |  |  |  |  | 
| 1774 |  |  |  |  |  |  | # Additional ISO-8859/1 entities listed in rfc1866 (section 14) | 
| 1775 |  |  |  |  |  |  | iexcl  => '¡', | 
| 1776 |  |  |  |  |  |  | cent   => '¢', | 
| 1777 |  |  |  |  |  |  | pound  => '£', | 
| 1778 |  |  |  |  |  |  | curren => '¤', | 
| 1779 |  |  |  |  |  |  | yen    => '¥', | 
| 1780 |  |  |  |  |  |  | brvbar => '¦', | 
| 1781 |  |  |  |  |  |  | sect   => '§', | 
| 1782 |  |  |  |  |  |  | uml    => '¨', | 
| 1783 |  |  |  |  |  |  | ordf   => 'ª', | 
| 1784 |  |  |  |  |  |  | laquo  => '«', | 
| 1785 |  |  |  |  |  |  | 'not'   => '¬',    # not is a keyword in perl | 
| 1786 |  |  |  |  |  |  | shy    => '', | 
| 1787 |  |  |  |  |  |  | macr   => '¯', | 
| 1788 |  |  |  |  |  |  | deg    => '°', | 
| 1789 |  |  |  |  |  |  | plusmn => '±', | 
| 1790 |  |  |  |  |  |  | sup1   => '¹', | 
| 1791 |  |  |  |  |  |  | sup2   => '²', | 
| 1792 |  |  |  |  |  |  | sup3   => '³', | 
| 1793 |  |  |  |  |  |  | acute  => '´', | 
| 1794 |  |  |  |  |  |  | micro  => 'µ', | 
| 1795 |  |  |  |  |  |  | para   => '¶', | 
| 1796 |  |  |  |  |  |  | middot => '·', | 
| 1797 |  |  |  |  |  |  | cedil  => '¸', | 
| 1798 |  |  |  |  |  |  | ordm   => 'º', | 
| 1799 |  |  |  |  |  |  | raquo  => '»', | 
| 1800 |  |  |  |  |  |  | frac14 => '¼', | 
| 1801 |  |  |  |  |  |  | frac12 => '½', | 
| 1802 |  |  |  |  |  |  | frac34 => '¾', | 
| 1803 |  |  |  |  |  |  | iquest => '¿', | 
| 1804 |  |  |  |  |  |  | 'times' => '×',    # times is a keyword in perl | 
| 1805 |  |  |  |  |  |  | divide => '÷', | 
| 1806 |  |  |  |  |  |  | OElig    => chr(338), | 
| 1807 |  |  |  |  |  |  | oelig    => chr(339), | 
| 1808 |  |  |  |  |  |  | Scaron   => chr(352), | 
| 1809 |  |  |  |  |  |  | scaron   => chr(353), | 
| 1810 |  |  |  |  |  |  | Yuml     => chr(376), | 
| 1811 |  |  |  |  |  |  | fnof     => chr(402), | 
| 1812 |  |  |  |  |  |  | circ     => chr(710), | 
| 1813 |  |  |  |  |  |  | tilde    => chr(732), | 
| 1814 |  |  |  |  |  |  | Alpha    => chr(913), | 
| 1815 |  |  |  |  |  |  | Beta     => chr(914), | 
| 1816 |  |  |  |  |  |  | Gamma    => chr(915), | 
| 1817 |  |  |  |  |  |  | Delta    => chr(916), | 
| 1818 |  |  |  |  |  |  | Epsilon  => chr(917), | 
| 1819 |  |  |  |  |  |  | Zeta     => chr(918), | 
| 1820 |  |  |  |  |  |  | Eta      => chr(919), | 
| 1821 |  |  |  |  |  |  | Theta    => chr(920), | 
| 1822 |  |  |  |  |  |  | Iota     => chr(921), | 
| 1823 |  |  |  |  |  |  | Kappa    => chr(922), | 
| 1824 |  |  |  |  |  |  | Lambda   => chr(923), | 
| 1825 |  |  |  |  |  |  | Mu       => chr(924), | 
| 1826 |  |  |  |  |  |  | Nu       => chr(925), | 
| 1827 |  |  |  |  |  |  | Xi       => chr(926), | 
| 1828 |  |  |  |  |  |  | Omicron  => chr(927), | 
| 1829 |  |  |  |  |  |  | Pi       => chr(928), | 
| 1830 |  |  |  |  |  |  | Rho      => chr(929), | 
| 1831 |  |  |  |  |  |  | Sigma    => chr(931), | 
| 1832 |  |  |  |  |  |  | Tau      => chr(932), | 
| 1833 |  |  |  |  |  |  | Upsilon  => chr(933), | 
| 1834 |  |  |  |  |  |  | Phi      => chr(934), | 
| 1835 |  |  |  |  |  |  | Chi      => chr(935), | 
| 1836 |  |  |  |  |  |  | Psi      => chr(936), | 
| 1837 |  |  |  |  |  |  | Omega    => chr(937), | 
| 1838 |  |  |  |  |  |  | alpha    => chr(945), | 
| 1839 |  |  |  |  |  |  | beta     => chr(946), | 
| 1840 |  |  |  |  |  |  | gamma    => chr(947), | 
| 1841 |  |  |  |  |  |  | delta    => chr(948), | 
| 1842 |  |  |  |  |  |  | epsilon  => chr(949), | 
| 1843 |  |  |  |  |  |  | zeta     => chr(950), | 
| 1844 |  |  |  |  |  |  | eta      => chr(951), | 
| 1845 |  |  |  |  |  |  | theta    => chr(952), | 
| 1846 |  |  |  |  |  |  | iota     => chr(953), | 
| 1847 |  |  |  |  |  |  | kappa    => chr(954), | 
| 1848 |  |  |  |  |  |  | lambda   => chr(955), | 
| 1849 |  |  |  |  |  |  | mu       => chr(956), | 
| 1850 |  |  |  |  |  |  | nu       => chr(957), | 
| 1851 |  |  |  |  |  |  | xi       => chr(958), | 
| 1852 |  |  |  |  |  |  | omicron  => chr(959), | 
| 1853 |  |  |  |  |  |  | pi       => chr(960), | 
| 1854 |  |  |  |  |  |  | rho      => chr(961), | 
| 1855 |  |  |  |  |  |  | sigmaf   => chr(962), | 
| 1856 |  |  |  |  |  |  | sigma    => chr(963), | 
| 1857 |  |  |  |  |  |  | tau      => chr(964), | 
| 1858 |  |  |  |  |  |  | upsilon  => chr(965), | 
| 1859 |  |  |  |  |  |  | phi      => chr(966), | 
| 1860 |  |  |  |  |  |  | chi      => chr(967), | 
| 1861 |  |  |  |  |  |  | psi      => chr(968), | 
| 1862 |  |  |  |  |  |  | omega    => chr(969), | 
| 1863 |  |  |  |  |  |  | thetasym => chr(977), | 
| 1864 |  |  |  |  |  |  | upsih    => chr(978), | 
| 1865 |  |  |  |  |  |  | piv      => chr(982), | 
| 1866 |  |  |  |  |  |  | ensp     => chr(8194), | 
| 1867 |  |  |  |  |  |  | emsp     => chr(8195), | 
| 1868 |  |  |  |  |  |  | thinsp   => chr(8201), | 
| 1869 |  |  |  |  |  |  | zwnj     => chr(8204), | 
| 1870 |  |  |  |  |  |  | zwj      => chr(8205), | 
| 1871 |  |  |  |  |  |  | lrm      => chr(8206), | 
| 1872 |  |  |  |  |  |  | rlm      => chr(8207), | 
| 1873 |  |  |  |  |  |  | ndash    => chr(8211), | 
| 1874 |  |  |  |  |  |  | mdash    => chr(8212), | 
| 1875 |  |  |  |  |  |  | lsquo    => chr(8216), | 
| 1876 |  |  |  |  |  |  | rsquo    => chr(8217), | 
| 1877 |  |  |  |  |  |  | sbquo    => chr(8218), | 
| 1878 |  |  |  |  |  |  | ldquo    => chr(8220), | 
| 1879 |  |  |  |  |  |  | rdquo    => chr(8221), | 
| 1880 |  |  |  |  |  |  | bdquo    => chr(8222), | 
| 1881 |  |  |  |  |  |  | dagger   => chr(8224), | 
| 1882 |  |  |  |  |  |  | Dagger   => chr(8225), | 
| 1883 |  |  |  |  |  |  | bull     => chr(8226), | 
| 1884 |  |  |  |  |  |  | hellip   => chr(8230), | 
| 1885 |  |  |  |  |  |  | permil   => chr(8240), | 
| 1886 |  |  |  |  |  |  | prime    => chr(8242), | 
| 1887 |  |  |  |  |  |  | Prime    => chr(8243), | 
| 1888 |  |  |  |  |  |  | lsaquo   => chr(8249), | 
| 1889 |  |  |  |  |  |  | rsaquo   => chr(8250), | 
| 1890 |  |  |  |  |  |  | oline    => chr(8254), | 
| 1891 |  |  |  |  |  |  | frasl    => chr(8260), | 
| 1892 |  |  |  |  |  |  | euro     => chr(8364), | 
| 1893 |  |  |  |  |  |  | image    => chr(8465), | 
| 1894 |  |  |  |  |  |  | weierp   => chr(8472), | 
| 1895 |  |  |  |  |  |  | real     => chr(8476), | 
| 1896 |  |  |  |  |  |  | trade    => chr(8482), | 
| 1897 |  |  |  |  |  |  | alefsym  => chr(8501), | 
| 1898 |  |  |  |  |  |  | larr     => chr(8592), | 
| 1899 |  |  |  |  |  |  | uarr     => chr(8593), | 
| 1900 |  |  |  |  |  |  | rarr     => chr(8594), | 
| 1901 |  |  |  |  |  |  | darr     => chr(8595), | 
| 1902 |  |  |  |  |  |  | harr     => chr(8596), | 
| 1903 |  |  |  |  |  |  | crarr    => chr(8629), | 
| 1904 |  |  |  |  |  |  | lArr     => chr(8656), | 
| 1905 |  |  |  |  |  |  | uArr     => chr(8657), | 
| 1906 |  |  |  |  |  |  | rArr     => chr(8658), | 
| 1907 |  |  |  |  |  |  | dArr     => chr(8659), | 
| 1908 |  |  |  |  |  |  | hArr     => chr(8660), | 
| 1909 |  |  |  |  |  |  | forall   => chr(8704), | 
| 1910 |  |  |  |  |  |  | part     => chr(8706), | 
| 1911 |  |  |  |  |  |  | exist    => chr(8707), | 
| 1912 |  |  |  |  |  |  | empty    => chr(8709), | 
| 1913 |  |  |  |  |  |  | nabla    => chr(8711), | 
| 1914 |  |  |  |  |  |  | isin     => chr(8712), | 
| 1915 |  |  |  |  |  |  | notin    => chr(8713), | 
| 1916 |  |  |  |  |  |  | ni       => chr(8715), | 
| 1917 |  |  |  |  |  |  | prod     => chr(8719), | 
| 1918 |  |  |  |  |  |  | sum      => chr(8721), | 
| 1919 |  |  |  |  |  |  | minus    => chr(8722), | 
| 1920 |  |  |  |  |  |  | lowast   => chr(8727), | 
| 1921 |  |  |  |  |  |  | radic    => chr(8730), | 
| 1922 |  |  |  |  |  |  | prop     => chr(8733), | 
| 1923 |  |  |  |  |  |  | infin    => chr(8734), | 
| 1924 |  |  |  |  |  |  | ang      => chr(8736), | 
| 1925 |  |  |  |  |  |  | 'and'     => chr(8743), | 
| 1926 |  |  |  |  |  |  | 'or'      => chr(8744), | 
| 1927 |  |  |  |  |  |  | cap      => chr(8745), | 
| 1928 |  |  |  |  |  |  | cup      => chr(8746), | 
| 1929 |  |  |  |  |  |  | 'int'     => chr(8747), | 
| 1930 |  |  |  |  |  |  | there4   => chr(8756), | 
| 1931 |  |  |  |  |  |  | sim      => chr(8764), | 
| 1932 |  |  |  |  |  |  | cong     => chr(8773), | 
| 1933 |  |  |  |  |  |  | asymp    => chr(8776), | 
| 1934 |  |  |  |  |  |  | 'ne'      => chr(8800), | 
| 1935 |  |  |  |  |  |  | equiv    => chr(8801), | 
| 1936 |  |  |  |  |  |  | 'le'      => chr(8804), | 
| 1937 |  |  |  |  |  |  | 'ge'      => chr(8805), | 
| 1938 |  |  |  |  |  |  | 'sub'     => chr(8834), | 
| 1939 |  |  |  |  |  |  | sup      => chr(8835), | 
| 1940 |  |  |  |  |  |  | nsub     => chr(8836), | 
| 1941 |  |  |  |  |  |  | sube     => chr(8838), | 
| 1942 |  |  |  |  |  |  | supe     => chr(8839), | 
| 1943 |  |  |  |  |  |  | oplus    => chr(8853), | 
| 1944 |  |  |  |  |  |  | otimes   => chr(8855), | 
| 1945 |  |  |  |  |  |  | perp     => chr(8869), | 
| 1946 |  |  |  |  |  |  | sdot     => chr(8901), | 
| 1947 |  |  |  |  |  |  | lceil    => chr(8968), | 
| 1948 |  |  |  |  |  |  | rceil    => chr(8969), | 
| 1949 |  |  |  |  |  |  | lfloor   => chr(8970), | 
| 1950 |  |  |  |  |  |  | rfloor   => chr(8971), | 
| 1951 |  |  |  |  |  |  | lang     => chr(9001), | 
| 1952 |  |  |  |  |  |  | rang     => chr(9002), | 
| 1953 |  |  |  |  |  |  | loz      => chr(9674), | 
| 1954 |  |  |  |  |  |  | spades   => chr(9824), | 
| 1955 |  |  |  |  |  |  | clubs    => chr(9827), | 
| 1956 |  |  |  |  |  |  | hearts   => chr(9829), | 
| 1957 |  |  |  |  |  |  | diams    => chr(9830), | 
| 1958 |  |  |  |  |  |  | ); | 
| 1959 |  |  |  |  |  |  |  | 
| 1960 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 1961 |  |  |  |  |  |  | # callback functions | 
| 1962 |  |  |  |  |  |  | #	$bot->get_alias()									=> $alias | 
| 1963 |  |  |  |  |  |  | #	$bot->argv_default()								=> @argv_args | 
| 1964 |  |  |  |  |  |  | #		qw(cat1=i cat2=i pageno=i desc=s) | 
| 1965 |  |  |  |  |  |  | #	$bot->argv_process(\%args)							=> N/A | 
| 1966 |  |  |  |  |  |  | #	$bot->argv_process_all(\%args)						=> N/A | 
| 1967 |  |  |  |  |  |  | #	$bot->get_url_verify($url_in_out)					=> N/A | 
| 1968 |  |  |  |  |  |  | #		a callback to verify or change $url before real get | 
| 1969 |  |  |  |  |  |  | #	$bot->go_login()									=> N/A | 
| 1970 |  |  |  |  |  |  | #	$bot->getpattern_catalog_head_data()				=> N/A | 
| 1971 |  |  |  |  |  |  | #	$bot->getpattern_catalog_end_data()					=> N/A | 
| 1972 |  |  |  |  |  |  | #	$bot->getpattern_catalog_get_bookargs_data()		=> $raw_pattern | 
| 1973 |  |  |  |  |  |  | #	$bot->catalog_get_bookargs(\%args)					=> 'OK' / 'Skip' | 
| 1974 |  |  |  |  |  |  | #		called after match | 
| 1975 |  |  |  |  |  |  | #	$bot->getpattern_chapters_head_data()				=> N/A | 
| 1976 |  |  |  |  |  |  | #	$bot->getpattern_chapters_end_data()				=> N/A | 
| 1977 |  |  |  |  |  |  | #	$bot->getpattern_chapters_get_chapterargs_data()	=> $raw_pattern | 
| 1978 |  |  |  |  |  |  | #	$bot->chapters_get_chapterargs(\%args)				=> 'OK' / 'Skip' | 
| 1979 |  |  |  |  |  |  | #		called after match | 
| 1980 |  |  |  |  |  |  | #	$bot->getpattern_TOC_exists_data()					=> $raw_pattern | 
| 1981 |  |  |  |  |  |  | #		'' means TOC is always exists | 
| 1982 |  |  |  |  |  |  | #	$bot->getpattern_TOC_head_data()					=> N/A | 
| 1983 |  |  |  |  |  |  | #	$bot->getpattern_TOC_end_data()						=> N/A | 
| 1984 |  |  |  |  |  |  | #	$bot->getpattern_chapter_head_data()				=> N/A | 
| 1985 |  |  |  |  |  |  | #	$bot->getpattern_chapter_end_data()					=> N/A | 
| 1986 |  |  |  |  |  |  | #	$bot->parse_paragraph_begin($content_dein_deout)	=> N/A | 
| 1987 |  |  |  |  |  |  | #	$bot->parse_paragraph_end($content_dein_deout)		=> N/A | 
| 1988 |  |  |  |  |  |  | #	$bot->book_finish(\%args)							=> N/A | 
| 1989 |  |  |  |  |  |  | #	$bot->result_filestem(\%args)						=> filestem | 
| 1990 |  |  |  |  |  |  | #	$bot->result_time(\%args)						=> time / undef | 
| 1991 |  |  |  |  |  |  | #		undef forbiden bot to set file time | 
| 1992 |  |  |  |  |  |  | #------------------------------------------------------------- | 
| 1993 |  |  |  |  |  |  | sub get_alias { | 
| 1994 | 21 |  |  | 21 | 0 | 130 | 'unknown'; | 
| 1995 |  |  |  |  |  |  | } | 
| 1996 |  |  |  |  |  |  | sub argv_default { | 
| 1997 |  |  |  |  |  |  | # default argv list in Getopt::Long format, to pass back to argv_process | 
| 1998 | 0 |  |  | 0 | 0 | 0 | qw(); | 
| 1999 |  |  |  |  |  |  | } | 
| 2000 | 0 |  |  | 0 | 0 | 0 | sub argv_process { | 
| 2001 |  |  |  |  |  |  | } | 
| 2002 | 0 |  |  | 0 | 0 | 0 | sub argv_process_all { | 
| 2003 |  |  |  |  |  |  | } | 
| 2004 | 0 |  |  | 0 | 0 | 0 | sub get_url_verify { | 
| 2005 |  |  |  |  |  |  | # a call back to verify or change $_[1] before real get | 
| 2006 |  |  |  |  |  |  | } | 
| 2007 | 21 |  |  | 21 | 0 | 43 | sub go_login { | 
| 2008 |  |  |  |  |  |  | # login after initialize | 
| 2009 |  |  |  |  |  |  | } | 
| 2010 |  |  |  |  |  |  | sub getpattern_catalog_head_data { | 
| 2011 | 21 |  |  | 21 | 0 | 57 | <<'DATA'; | 
| 2012 |  |  |  |  |  |  | (?= | 
| 2013 |  |  |  |  |  |  | DATA | 
| 2014 |  |  |  |  |  |  | } | 
| 2015 |  |  |  |  |  |  | sub getpattern_catalog_end_data { | 
| 2016 | 21 |  |  | 21 | 0 | 57 | <<'DATA'; | 
| 2017 |  |  |  |  |  |  |  | 
| 2018 |  |  |  |  |  |  | DATA | 
| 2019 |  |  |  |  |  |  | } | 
| 2020 |  |  |  |  |  |  | sub getpattern_catalog_get_bookargs_data { | 
| 2021 | 21 |  |  | 21 | 0 | 58 | <<'DATA'; | 
| 2022 |  |  |  |  |  |  | ]*href=['"]{0,1}(.*?)(?:['" ][^<>]*>|>)(.*?) | 
| 2023 |  |  |  |  |  |  | DATA | 
| 2024 |  |  |  |  |  |  | } | 
| 2025 |  |  |  |  |  |  | sub catalog_get_bookargs { | 
| 2026 | 3 |  |  | 3 | 0 | 8 | my $self = shift; | 
| 2027 | 3 |  |  |  |  | 23 | my @a=@_; | 
| 2028 | 3 |  |  |  |  | 7 | my $pargs=$a[0]; | 
| 2029 | 3 |  |  |  |  | 11 | $pargs->{url}=$a[1]; | 
| 2030 | 3 |  |  |  |  | 19 | $pargs->{title}=$self->parse_titleen($a[2]); | 
| 2031 | 3 |  |  |  |  | 27 | 'OK'; | 
| 2032 |  |  |  |  |  |  | } | 
| 2033 |  |  |  |  |  |  | sub getpattern_catalog_get_next_data { | 
| 2034 | 21 |  |  | 21 | 0 | 63 | '';		#'' means don't know how to stop | 
| 2035 |  |  |  |  |  |  | } | 
| 2036 |  |  |  |  |  |  | sub getpattern_chapters_head_data { | 
| 2037 | 21 |  |  | 21 | 0 | 84 | $_[0]->getpattern_TOC_head_data; | 
| 2038 |  |  |  |  |  |  | } | 
| 2039 |  |  |  |  |  |  | sub getpattern_chapters_end_data { | 
| 2040 | 21 |  |  | 21 | 0 | 82 | $_[0]->getpattern_TOC_end_data; | 
| 2041 |  |  |  |  |  |  | } | 
| 2042 |  |  |  |  |  |  | sub getpattern_chapters_get_chapterargs_data { | 
| 2043 | 21 |  |  | 21 | 0 | 59 | <<'DATA'; | 
| 2044 |  |  |  |  |  |  | ]*href=['"]{0,1}(.*?)(?:['" ][^<>]*>|>)(.*?) | 
| 2045 |  |  |  |  |  |  | DATA | 
| 2046 |  |  |  |  |  |  | } | 
| 2047 |  |  |  |  |  |  | sub chapters_get_chapterargs { | 
| 2048 | 3 |  |  | 3 | 0 | 5 | my $self = shift; | 
| 2049 | 3 |  |  |  |  | 20 | my @a=@_; | 
| 2050 | 3 |  |  |  |  | 7 | my $pargs=$a[0]; | 
| 2051 | 3 |  |  |  |  | 11 | $pargs->{url}=$a[1]; | 
| 2052 | 3 |  |  |  |  | 14 | $pargs->{title}=$self->parse_titleen($a[2]); | 
| 2053 | 3 |  |  |  |  | 20 | 'OK'; | 
| 2054 |  |  |  |  |  |  | } | 
| 2055 |  |  |  |  |  |  | sub getpattern_TOC_exists_data { | 
| 2056 | 21 |  |  | 21 | 0 | 57 | ''; | 
| 2057 |  |  |  |  |  |  | } | 
| 2058 |  |  |  |  |  |  | sub getpattern_TOC_head_data { | 
| 2059 | 42 |  |  | 42 | 0 | 106 | <<'DATA'; | 
| 2060 |  |  |  |  |  |  | (?= | 
| 2061 |  |  |  |  |  |  | DATA | 
| 2062 |  |  |  |  |  |  | } | 
| 2063 |  |  |  |  |  |  | sub getpattern_TOC_end_data { | 
| 2064 | 42 |  |  | 42 | 0 | 109 | <<'DATA'; | 
| 2065 |  |  |  |  |  |  |  | 
| 2066 |  |  |  |  |  |  | DATA | 
| 2067 |  |  |  |  |  |  | } | 
| 2068 |  |  |  |  |  |  | sub getpattern_chapter_head_data { | 
| 2069 | 21 |  |  | 21 | 0 | 59 | <<'DATA'; | 
| 2070 |  |  |  |  |  |  | (?= | 
| 2071 |  |  |  |  |  |  | DATA | 
| 2072 |  |  |  |  |  |  | } | 
| 2073 |  |  |  |  |  |  | sub getpattern_chapter_end_data { | 
| 2074 | 21 |  |  | 21 | 0 | 58 | <<'DATA'; | 
| 2075 |  |  |  |  |  |  |  | 
| 2076 |  |  |  |  |  |  | DATA | 
| 2077 |  |  |  |  |  |  | } | 
| 2078 | 9 |  |  | 9 | 0 | 17 | sub parse_paragraph_begin { | 
| 2079 |  |  |  |  |  |  | } | 
| 2080 | 9 |  |  | 9 | 0 | 19 | sub parse_paragraph_end { | 
| 2081 |  |  |  |  |  |  | } | 
| 2082 | 3 |  |  | 3 | 0 | 7 | sub book_finish { | 
| 2083 |  |  |  |  |  |  | } | 
| 2084 |  |  |  |  |  |  | sub result_filestem { | 
| 2085 | 18 |  |  | 18 | 0 | 36 | my ($self, $pargs) = @_; | 
| 2086 | 18 |  |  |  |  | 198 | return $pargs->{prefix}. | 
| 2087 |  |  |  |  |  |  | sprintf("%04d",$self->{book_get_num}). | 
| 2088 |  |  |  |  |  |  | $pargs->{title}. | 
| 2089 |  |  |  |  |  |  | $pargs->{postfix}; | 
| 2090 |  |  |  |  |  |  | } | 
| 2091 |  |  |  |  |  |  | sub result_time { | 
| 2092 | 9 |  |  | 9 | 0 | 21 | my ($self, $pargs) = @_; | 
| 2093 | 9 |  |  |  |  | 26 | return $pargs->{last_modified}; | 
| 2094 |  |  |  |  |  |  | } | 
| 2095 |  |  |  |  |  |  |  | 
| 2096 |  |  |  |  |  |  | 1; | 
| 2097 |  |  |  |  |  |  | __END__ |