blib/lib/Novel/Robot/Parser.pm

| Criterion | Covered | Total | % |
|---|---|---|---|
| statement | 162 | 386 | 41.9 |
| branch | 34 | 138 | 24.6 |
| condition | 18 | 121 | 14.8 |
| subroutine | 28 | 59 | 47.4 |
| pod | 3 | 34 | 8.8 |
| total | 245 | 738 | 33.2 |
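
The per-line listing that follows covers the dispatch constructor (lines 58-68) and the scraping helpers of Novel::Robot::Parser. As a reading aid, here is a minimal construction sketch in Perl based only on the options visible in the covered code; the jjwxc URL is hypothetical and stands in for any index page of a supported site:

```perl
use strict;
use warnings;
use Novel::Robot::Parser;

# new() resolves the 'site' option via detect_site(): domains listed in
# %SITE_DOM_NAME map to a named subclass, any other http(s) URL falls back
# to 'default', and the object is blessed into Novel::Robot::Parser::<site>
# with a Novel::Robot::Browser held inside.
my $parser = Novel::Robot::Parser->new(
    site => 'https://www.jjwxc.net/onebook.php?novelid=12345',  # hypothetical URL
);
```
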
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | # ABSTRACT: get novel / bbs content from website | ||||||
| 2 | package Novel::Robot::Parser; | ||||||
| 3 | |||||||
| 4 | 3 | 3 | 587 | use strict; | |||
| 3 | 6 | ||||||
| 3 | 98 | ||||||
| 5 | 3 | 3 | 15 | use warnings; | |||
| 3 | 6 | ||||||
| 3 | 112 | ||||||
| 6 | 3 | 3 | 697 | use utf8; | |||
| 3 | 18 | ||||||
| 3 | 15 | ||||||
| 7 | |||||||
| 8 | 3 | 3 | 1443 | use Novel::Robot::Browser; | |||
| 3 | 33 | ||||||
| 3 | 126 | ||||||
| 9 | 3 | 3 | 20 | use URI; | |||
| 3 | 9 | ||||||
| 3 | 60 | ||||||
| 10 | 3 | 3 | 17 | use Encode; | |||
| 3 | 6 | ||||||
| 3 | 278 | ||||||
| 11 | 3 | 3 | 1601 | use Web::Scraper; | |||
| 3 | 230690 | ||||||
| 3 | 24 | ||||||
| 12 | 3 | 3 | 240 | use HTML::TreeBuilder; | |||
| 3 | 6 | ||||||
| 3 | 33 | ||||||
| 13 | 3 | 3 | 83 | use Data::Dumper; | |||
| 3 | 9 | ||||||
| 3 | 4941 | ||||||
| 14 | |||||||
| 15 | ### {{{ data | ||||||
| 16 | |||||||
| 17 | #our $VERSION = 0.32; | ||||||
| 18 | |||||||
| 19 | our %SITE_DOM_NAME = ( | ||||||
| 20 | 'bbs.jjwxc.net' => 'hjj', | ||||||
| 21 | 'www.jjwxc.net' => 'jjwxc', | ||||||
| 22 | 'tieba.baidu.com' => 'tieba', | ||||||
| 23 | |||||||
| 24 | 'www.ddshu.net' => 'ddshu', | ||||||
| 25 | 'www.kanunu8.com' => 'kanunu8', | ||||||
| 26 | ); | ||||||
| 27 | |||||||
| 28 | our %NULL_INDEX = ( | ||||||
| 29 | url => '', | ||||||
| 30 | book => '', | ||||||
| 31 | writer => '', | ||||||
| 32 | writer_url => '', | ||||||
| 33 | item_list => [], | ||||||
| 34 | |||||||
| 35 | intro => '', | ||||||
| 36 | series => '', | ||||||
| 37 | progress => '', | ||||||
| 38 | word_num => '', | ||||||
| 39 | ); | ||||||
| 40 | |||||||
| 41 | our %NULL_CHAPTER = ( | ||||||
| 42 | content => '', | ||||||
| 43 | id => 0, | ||||||
| 44 | pid => 0, | ||||||
| 45 | time => '', | ||||||
| 46 | title => '', | ||||||
| 47 | url => '', | ||||||
| 48 | writer => '', | ||||||
| 49 | writer_say => '', | ||||||
| 50 | abstract => '', | ||||||
| 51 | word_num => '', | ||||||
| 52 | type => '', | ||||||
| 53 | ); | ||||||
| 54 | |||||||
| 55 | ### }}} | ||||||
| 56 | |||||||
| 57 | ### init {{{ | ||||||
| 58 | sub new { | ||||||
| 59 | 2 | 2 | 1 | 1396 | my ( $self, %opt ) = @_; | ||
| 60 | |||||||
| 61 | 2 | 18 | $opt{site} = $self->detect_site( $opt{site} ); | ||||
| 62 | |||||||
| 63 | 2 | 8 | my $module = "Novel::Robot::Parser::$opt{site}"; | ||||
| 64 | 2 | 128 | eval "require $module;"; | ||||
| 65 | |||||||
| 66 | 2 | 30 | my $browser = Novel::Robot::Browser->new( %opt ); | ||||
| 67 | 2 | 21 | bless { browser => $browser, %opt }, $module; | ||||
| 68 | } | ||||||
| 69 | |||||||
| 70 | 0 | 0 | sub domain { } | ||||
| 71 | |||||||
| 72 | sub detect_domain { | ||||||
| 73 | 0 | 0 | 0 | 0 | my ( $self, $url ) = @_; | ||
| 74 | 0 | 0 | 0 | return ( $url, $url ) unless ( $url =~ /^https?:/ ); | |||
| 75 | |||||||
| 76 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)(?:/|$)#; | ||||
| 77 | |||||||
| 78 | 0 | 0 | my $base_dom = $dom; | ||||
| 79 | 0 | 0 | $base_dom =~ s/^[^.]+\.//; | ||||
| 80 | 0 | 0 | 0 | $base_dom = $base_dom =~ /\./ ? $base_dom : $dom; | |||
| 81 | 0 | 0 | return ( $dom, $base_dom ); | ||||
| 82 | } | ||||||
| 83 | |||||||
| 84 | sub detect_site { | ||||||
| 85 | 2 | 2 | 0 | 6 | my ( $self, $url ) = @_; | ||
| 86 | |||||||
| 87 | 2 | 50 | 33 | 19 | if ( $url and $url =~ /^https?:/ ) { | ||
| 88 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)/#; | ||||
| 89 | 0 | 0 | 0 | return $SITE_DOM_NAME{$dom} if ( exists $SITE_DOM_NAME{$dom} ); | |||
| 90 | 0 | 0 | return 'default'; | ||||
| 91 | } | ||||||
| 92 | |||||||
| 93 | 2 | 50 | 9 | return $url // 'default'; | |||
| 94 | } | ||||||
| 95 | ### }}} | ||||||
| 96 | |||||||
| 97 | ### {{{ common | ||||||
| 98 | 2 | 2 | 0 | 8 | sub site_type { 'novel' } | ||
| 99 | 0 | 0 | 0 | 0 | sub charset { 'cp936' } | ||
| 100 | 0 | 0 | sub base_url { } | ||||
| 101 | |||||||
| 102 | sub generate_novel_url { | ||||||
| 103 | 0 | 0 | 0 | 0 | my ( $self, $index_url, @args ) = @_; | ||
| 104 | 0 | 0 | return ( $index_url, @args ); | ||||
| 105 | } | ||||||
| 106 | |||||||
| 107 | ### }}} | ||||||
| 108 | |||||||
| 109 | ### {{{ novel | ||||||
| 110 | |||||||
| 111 | sub get_novel_info { | ||||||
| 112 | 1 | 1 | 1 | 12 | my ( $self, $url ) = @_; | ||
| 113 | 1 | 4 | my ( $i_url, $post_data ) = $self->generate_novel_url( $url ); | ||||
| 114 | 1 | 8 | my $c = $self->{browser}->request_url( $i_url, $post_data ); | ||||
| 115 | |||||||
| 116 | 1 | 25 | my $r = $self->extract_elements( | ||||
| 117 | \$c, | ||||||
| 118 | path => $self->scrape_novel(), | ||||||
| 119 | sub => $self->can( "parse_novel" ), | ||||||
| 120 | ); | ||||||
| 121 | 1 | 11 | $r->{item_list} = $self->parse_item_list( \$c, $r ); | ||||
| 122 | 1 | 6 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $url ); | ||||
| 123 | 1 | 6 | return $r; | ||||
| 124 | } | ||||||
| 125 | |||||||
| 126 | sub get_novel_ref { | ||||||
| 127 | 2 | 2 | 1 | 904 | my ( $self, $index_url, %o ) = @_; | ||
| 128 | |||||||
| 129 | 2 | 50 | 8 | return $self->get_tiezi_ref( $index_url, %o ) if ( $self->site_type() eq 'tiezi' ); | |||
| 130 | |||||||
| 131 | 2 | 5 | my ( $r, $item_list, $max_item_num ); | ||||
| 132 | 2 | 50 | 8 | if ( $index_url !~ /^https?:/ ) { | |||
| 133 | 2 | 10 | $r = $self->parse_novel( $index_url, %o ); | ||||
| 134 | } else { | ||||||
| 135 | 0 | 0 | my ( $i_url, $post_data ) = $self->generate_novel_url( $index_url ); | ||||
| 136 | |||||||
| 137 | ( $r, $item_list ) = $self->{browser}->request_url_whole( | ||||||
| 138 | $i_url, | ||||||
| 139 | post_data => $post_data, | ||||||
| 140 | info_sub => sub { | ||||||
| 141 | 0 | 0 | 0 | $self->extract_elements( | |||
| 142 | @_, | ||||||
| 143 | path => $self->scrape_novel(), | ||||||
| 144 | sub => $self->can( "parse_novel" ), | ||||||
| 145 | ); | ||||||
| 146 | }, | ||||||
| 147 | 0 | 0 | 0 | item_list_sub => sub { $self->can( "parse_item_list" )->( $self, @_ ) }, | |||
| 148 | item_sub => sub { | ||||||
| 149 | 0 | 0 | 0 | $self->extract_elements( | |||
| 150 | @_, | ||||||
| 151 | path => $self->scrape_novel_item(), | ||||||
| 152 | sub => $self->can( "parse_novel_item" ), | ||||||
| 153 | ); | ||||||
| 154 | }, | ||||||
| 155 | 0 | 0 | %o, | ||||
| 156 | ); | ||||||
| 157 | |||||||
| 158 | 0 | 0 | $r->{url} = $index_url; | ||||
| 159 | 0 | 0 | 0 | $r->{item_list} = $item_list || []; | |||
| 160 | |||||||
| 161 | #$r->{item_num} = $max_item_num || undef; | ||||||
| 162 | } ## end else [ if ( $index_url !~ /^https?:/)] | ||||||
| 163 | |||||||
| 164 | 2 | 10 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $index_url ); | ||||
| 165 | 2 | 12 | $self->filter_item_list( $r, %o ); | ||||
| 166 | 2 | 13 | $r->{writer_url} = $self->format_abs_url( $r->{writer_url}, $index_url ); | ||||
| 167 | |||||||
| 168 | 2 | 6 | for my $k ( qw/writer book/ ) { | ||||
| 169 | 4 | 50 | 12 | $r->{$k} = $o{$k} if ( exists $o{$k} ); | |||
| 170 | } | ||||||
| 171 | 2 | 66 | 35 | $r->{$_} ||= $NULL_INDEX{$_} for keys( %NULL_INDEX ); | |||
| 172 | 2 | 15 | $r->{$_} = $self->tidy_string( $r->{$_} ) for qw/writer book/; | ||||
| 173 | |||||||
| 174 | 2 | 8 | return $r; | ||||
| 175 | } ## end sub get_novel_ref | ||||||
| 176 | |||||||
| 177 | sub scrape_novel { | ||||||
| 178 | 1 | 1 | 0 | 5 | my ( $self ) = @_; | ||
| 179 | 1 | 5 | my $r = {}; | ||||
| 180 | 1 | 50 | 7 | push @{$r->{book}}, { path => $self->{book_path} } if ( exists $self->{book_path} ); | |||
| 0 | 0 | ||||||
| 181 | 1 | 50 | 5 | push @{$r->{book}}, { regex => $self->{book_regex} } if ( exists $self->{book_regex} ); | |||
| 0 | 0 | ||||||
| 182 | 1 | 50 | 5 | push @{$r->{writer}}, { path => $self->{writer_path} } if ( exists $self->{writer_path} ); | |||
| 0 | 0 | ||||||
| 183 | 1 | 50 | 5 | push @{$r->{writer}}, { regex => $self->{writer_regex} } if ( exists $self->{writer_regex} ); | |||
| 0 | 0 | ||||||
| 184 | 1 | 44 | return $r; | ||||
| 185 | } | ||||||
| 186 | |||||||
| 187 | sub parse_novel { | ||||||
| 188 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
| 189 | |||||||
| 190 | 0 | 0 | 0 | $r->{book} ||= $self->scrape_element_try( | |||
| 191 | $h, | ||||||
| 192 | [ { path => '//meta[@name="og:novel:book_name"]', extract => '@content' }, | ||||||
| 193 | { path => '//meta[@property="og:novel:book_name"]', extract => '@content' }, | ||||||
| 194 | { path => '//meta[@property="og:title"]', extract => '@content' }, | ||||||
| 195 | { path => '//div[@id="title"]', }, | ||||||
| 196 | { path => '//div[@class="title"]', }, | ||||||
| 197 | { regex => qr# |
| 198 | { regex => qr# |
| 199 | { regex => qr# |
| 200 | { path => '//h1', }, | ||||||
| 201 | { path => '//h2', }, | ||||||
| 202 | ], | ||||||
| 203 | sub => $self->can( "tidy_writer_book" ), | ||||||
| 204 | ); | ||||||
| 205 | |||||||
| 206 | 0 | 0 | 0 | $r->{writer} ||= $self->scrape_element_try( | |||
| 207 | $h, | ||||||
| 208 | [ { path => '//meta[@name="author"]', extract => '@content' }, | ||||||
| 209 | |||||||
| 210 | { path => '//meta[@name="og:novel:author"]', extract => '@content' }, | ||||||
| 211 | { path => '//meta[@property="og:novel:author"]', extract => '@content' }, | ||||||
| 212 | { path => '//*[@class="author"]', }, | ||||||
| 213 | { path => '//*[@class="writer"]', }, | ||||||
| 214 | 3 | 3 | 68 | { regex => qr#作者:([^<]+)#si, }, | |||
| 3 | 7 | ||||||
| 3 | 53 | ||||||
| 215 | { regex => qr#作者:([^<]+)#si, }, | ||||||
| 216 | { regex => qr#<(?:em|i|h3|h2|span)>作者:([^<]+)(?:em|i|h3|h2|span)>#si, }, | ||||||
| 217 | { regex => qr#作者:(?:)?]*>([^<]+)#si, }, | ||||||
| 218 | { regex => qr# 作(?: |\s)*者:([^<]+) #si, }, |
| 219 | { regex => qr#作者:([^<]+?) 发布时间:#s, }, | ||||||
| 220 | { regex => qr#content="([^"]+?)最新著作#s, }, | ||||||
| 221 | { regex => qr# |
| 222 | { regex => qr# |
| 223 | { regex => qr#content="[^"]+?,([^",]+?)作品#s, }, | ||||||
| 224 | ], | ||||||
| 225 | sub => $self->can( "tidy_writer_book" ), | ||||||
| 226 | ); | ||||||
| 227 | |||||||
| 228 | 0 | 0 | $r->{$_} = $self->tidy_writer_book( $r->{$_} ) for qw/writer book title/; | ||||
| 229 | |||||||
| 230 | 0 | 0 | return $r; | ||||
| 231 | } ## end sub parse_novel | ||||||
| 232 | |||||||
| 233 | sub scrape_item_list { | ||||||
| 234 | 0 | 0 | 0 | 0 | my ( $self ) = @_; | ||
| 235 | 0 | 0 | my $r = {}; | ||||
| 236 | 0 | 0 | 0 | $r->{path} = $self->{item_list_path} if ( exists $self->{item_list_path} ); | |||
| 237 | 0 | 0 | return $r; | ||||
| 238 | } | ||||||
| 239 | |||||||
| 240 | sub parse_item_list { | ||||||
| 241 | 1 | 1 | 0 | 5 | my ( $self, $h, $r ) = @_; | ||
| 242 | |||||||
| 243 | 1 | 50 | 10 | return $r->{item_list} if ( exists $r->{item_list} ); | |||
| 244 | |||||||
| 245 | 0 | 0 | my $path_r = $self->scrape_item_list(); | ||||
| 246 | |||||||
| 247 | 0 | 0 | 0 | return $self->guess_item_list( $h ) unless ( exists $path_r->{path} ); | |||
| 248 | |||||||
| 249 | my $parse_novel = scraper { | ||||||
| 250 | process $path_r->{path}, | ||||||
| 251 | 0 | 0 | 0 | 'item_list[]' => { | |||
| 252 | 'title' => 'TEXT', | ||||||
| 253 | 'url' => '@href' | ||||||
| 254 | }; | ||||||
| 255 | 0 | 0 | }; | ||||
| 256 | 0 | 0 | my $ref = $parse_novel->scrape( $h ); | ||||
| 257 | |||||||
| 258 | 0 | 0 | 0 | my @chap = grep { exists $_->{url} and $_->{url} } @{ $ref->{item_list} }; | |||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 259 | |||||||
| 260 | 0 | 0 | 0 | if ( $path_r->{sort} ) { | |||
| 261 | 0 | 0 | @chap = sort { $a->{url} cmp $b->{url} } @chap; | ||||
| 0 | 0 | ||||||
| 262 | } | ||||||
| 263 | |||||||
| 264 | 0 | 0 | return \@chap; | ||||
| 265 | } ## end sub parse_item_list | ||||||
| 266 | |||||||
| 267 | sub guess_item_list { | ||||||
| 268 | 0 | 0 | 0 | 0 | my ( $self, $h, %opt ) = @_; | ||
| 269 | |||||||
| 270 | 0 | 0 | my $new_h = $$h; | ||||
| 271 | 0 | 0 | $new_h=~s# |
| 272 | |||||||
| 273 | 0 | 0 | my $tree = HTML::TreeBuilder->new(); | ||||
| 274 | 0 | 0 | $tree->parse( $new_h ); | ||||
| 275 | |||||||
| 276 | 0 | 0 | my @links = $tree->look_down( '_tag', 'a' ); | ||||
| 277 | 0 | 0 | @links = grep { $_->attr( 'href' ) } @links; | ||||
| 0 | 0 | ||||||
| 278 | 0 | 0 | for my $x ( @links ) { | ||||
| 279 | 0 | 0 | my $up_url = $x->attr( 'href' ); | ||||
| 280 | 0 | 0 | $up_url =~ s#/[^/]+/?$#/#; | ||||
| 281 | 0 | 0 | 0 | $up_url = '.' if ( $up_url !~ m#/# ); | |||
| 282 | |||||||
| 283 | 0 | 0 | $x = { parent => $up_url, depth => $x->depth(), url => $x->attr( 'href' ), title => $x->as_text() }; | ||||
| 284 | } | ||||||
| 285 | |||||||
| 286 | 0 | 0 | my @out_links; | ||||
| 287 | 0 | 0 | my @temp_arr = ( $links[0] ); | ||||
| 288 | 0 | 0 | my $parent = $links[0]{parent}; | ||||
| 289 | 0 | 0 | my $depth = $links[0]{depth}; | ||||
| 290 | 0 | 0 | for ( my $i = 1 ; $i <= $#links ; $i++ ) { | ||||
| 291 | 0 | 0 | 0 | 0 | if ( $depth == $links[$i]{depth} and $parent eq $links[$i]{parent} ) { | ||
| 292 | 0 | 0 | push @temp_arr, $links[$i]; | ||||
| 293 | } else { | ||||||
| 294 | 0 | 0 | push @out_links, [@temp_arr]; | ||||
| 295 | 0 | 0 | @temp_arr = ( $links[$i] ); | ||||
| 296 | 0 | 0 | $depth = $links[$i]{depth}; | ||||
| 297 | 0 | 0 | $parent = $links[$i]{parent}; | ||||
| 298 | } | ||||||
| 299 | } | ||||||
| 300 | |||||||
| 301 | 0 | 0 | 0 | push @out_links, \@temp_arr if ( @temp_arr ); | |||
| 302 | |||||||
| 303 | 0 | 0 | @out_links = sort { scalar( @$b ) <=> scalar( @$a ) } @out_links; | ||||
| 0 | 0 | ||||||
| 304 | |||||||
| 305 | 0 | 0 | my $res_arr; | ||||
| 306 | 0 | 0 | my $title_regex = | ||||
| 307 | qr/引子|楔子|内容简介|正文|序言|文案|第\s*[0123456789零○〇一二三四五六七八九十百千\d]+\s*(章|节)|(^[0-9]+)/; | ||||||
| 308 | 0 | 0 | my $chap_num_regex = qr/(^|\/)\d+(\.html)?$/; | ||||
| 309 | 0 | 0 | for my $arr ( @out_links ) { | ||||
| 310 | 0 | 0 | my $x = $arr->[0]; | ||||
| 311 | 0 | 0 | my $y = $arr->[1]; | ||||
| 312 | 0 | 0 | my $z = $arr->[-1]; | ||||
| 313 | |||||||
| 314 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( $opt{chapter_url_regex} and $x->{url} =~ /$opt{chapter_url_regex}/ ); | ||
| 315 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( $opt{chapter_title_regex} and $x->{title} =~ /$opt{chapter_title_regex}/ ); | ||
| 316 | $res_arr = $arr | ||||||
| 317 | 0 | 0 | 0 | 0 | if ( ($x and $x->{title} =~ /$title_regex/) or ( $y and $y->{title} =~ /$title_regex/ ) or ( $z and $z->{title} =~ /$title_regex/ ) ); | ||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 318 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( ( $x->{url} =~ /$chap_num_regex/ or $z->{url} =~ /$chap_num_regex/ ) and scalar( @$arr ) > 50 ); | ||
| 0 | |||||||
| 319 | |||||||
| 320 | #$res_arr= $arr if( ($x->{url}=~/\/?\d+$/ or $z->{url}=~/\/?\d+$/) and scalar(@$arr)>50); | ||||||
| 321 | 0 | 0 | 0 | last if ( $res_arr ); | |||
| 322 | } | ||||||
| 323 | |||||||
| 324 | #remove non-chapter urls | ||||||
| 325 | 0 | 0 | while ( 1 ) { | ||||
| 326 | 0 | 0 | my $x = $res_arr->[0]; | ||||
| 327 | 0 | 0 | my $y = $res_arr->[ int( $#$res_arr / 2 ) ]; | ||||
| 328 | 0 | 0 | 0 | 0 | if ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /\.html$/ and $x->{url} !~ /\.html$/ ) { | ||
| 0 | 0 | ||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 329 | 0 | 0 | shift( @$res_arr ); | ||||
| 330 | } elsif ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /$chap_num_regex/ and $x->{url} !~ /$chap_num_regex/ ) { | ||||||
| 331 | 0 | 0 | shift( @$res_arr ); | ||||
| 332 | } else { | ||||||
| 333 | 0 | 0 | last; | ||||
| 334 | } | ||||||
| 335 | } | ||||||
| 336 | |||||||
| 337 | #sort chapter url | ||||||
| 338 | 0 | 0 | 0 | 0 | if ( $res_arr and defined $res_arr->[0]{url} and $res_arr->[0]{url} =~ /$chap_num_regex/ ) { | ||
| 0 | |||||||
| 339 | 0 | 0 | 0 | my $trim_sub = sub { my $s = $_[0]; $s =~ s/^.+\///; $s =~ s/\.html$//; return $s }; | |||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 340 | 0 | 0 | my @sort_arr; | ||||
| 341 | 0 | 0 | 0 | if($opt{sort_chapter_url}){ | |||
| 342 | 0 | 0 | @sort_arr = sort { $trim_sub->( $a->{url} ) <=> $trim_sub->( $b->{url} ) } grep { $_->{url} =~ /$chap_num_regex/ } @$res_arr; | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 343 | }else{ | ||||||
| 344 | 0 | 0 | @sort_arr = @$res_arr; | ||||
| 345 | } | ||||||
| 346 | 0 | 0 | my @s = map { $trim_sub->( $_->{url} ) } @sort_arr; | ||||
| 0 | 0 | ||||||
| 347 | 0 | 0 | my $random_sort = 0; | ||||
| 348 | 0 | 0 | for my $i ( 0 .. $#s - 1 ) { | ||||
| 349 | 0 | 0 | 0 | $random_sort = 1 if ( $s[$i] > $s[ $i + 1 ] ); | |||
| 350 | 0 | 0 | 0 | last if ( $random_sort ); | |||
| 351 | } | ||||||
| 352 | 0 | 0 | 0 | return \@sort_arr if ( $random_sort == 0 ); | |||
| 353 | } | ||||||
| 354 | |||||||
| 355 | 0 | 0 | 0 | return $res_arr || []; | |||
| 356 | } ## end sub guess_item_list | ||||||
| 357 | |||||||
| 358 | sub scrape_novel_item { | ||||||
| 359 | 1 | 1 | 0 | 32 | my ( $self ) = @_; | ||
| 360 | 1 | 4 | my $r = {}; | ||||
| 361 | 1 | 50 | 8 | push @{$r->{content}}, { path => $self->{content_path}, extract => 'HTML' } if( exists $self->{content_path} ); | |||
| 0 | 0 | ||||||
| 362 | 1 | 50 | 6 | push @{$r->{content}}, { regex => $self->{content_regex} } if( exists $self->{content_regex} ); | |||
| 0 | 0 | ||||||
| 363 | 1 | 4 | push @{$r->{content}}, ( | ||||
| 1 | 8 | ||||||
| 364 | { path => '//div[@class="novel_content"]' }, | ||||||
| 365 | { path => '//div[@id="content"]' }, | ||||||
| 366 | ); | ||||||
| 367 | 1 | 13 | return $r; | ||||
| 368 | } | ||||||
| 369 | |||||||
| 370 | sub parse_novel_item { | ||||||
| 371 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
| 372 | |||||||
| 373 | 0 | 0 | 0 | $r = $self->guess_novel_item( $h ) unless ( $r->{content} ); | |||
| 374 | 0 | 0 | 0 | $r->{$_} ||= $NULL_CHAPTER{$_} for keys( %NULL_CHAPTER ); | |||
| 375 | 0 | 0 | $r->{content} = $self->tidy_content( $r->{content} ); | ||||
| 376 | |||||||
| 377 | 0 | 0 | my $next_url = $self->scrape_element_try($h, [ | ||||
| 378 | { path => '//a[@id="next_url"]', extract =>'@href' }, | ||||||
| 379 | { path => '//a[contains(text(),"下一页")]', extract => '@href' }, | ||||||
| 380 | ]); | ||||||
| 381 | 0 | 0 | 0 | if($next_url){ | |||
| 382 | 0 | 0 | $r->{next_url} = $next_url; | ||||
| 383 | } | ||||||
| 384 | |||||||
| 385 | 0 | 0 | return $r; | ||||
| 386 | } | ||||||
| 387 | |||||||
| 388 | sub guess_novel_item { | ||||||
| 389 | 0 | 0 | 0 | 0 | my ( $self, $h, %opt ) = @_; | ||
| 390 | |||||||
| 391 | 0 | 0 | $$h =~ s###sg; | ||||
| 392 | 0 | 0 | $$h =~ s###sg; | ||||
| 393 | |||||||
| 394 | 0 | 0 | my $tree = HTML::TreeBuilder->new(); | ||||
| 395 | 0 | 0 | $tree->parse( $$h ); | ||||
| 396 | |||||||
| 397 | 0 | 0 | my @links = $tree->look_down( 'text', undef ); | ||||
| 398 | 0 | 0 | for my $x ( @links ) { | ||||
| 399 | 0 | 0 | $x = { content => $x->as_HTML( '<>&' ) }; | ||||
| 400 | 0 | 0 | $self->calc_content_wordnum( $x ); | ||||
| 401 | } | ||||||
| 402 | 0 | 0 | my @out_links = sort { $b->{word_num} <=> $a->{word_num} } @links; | ||||
| 0 | 0 | ||||||
| 403 | |||||||
| 404 | 0 | 0 | my $no_next_r; | ||||
| 405 | 0 | 0 | for my $r ( @out_links ) { | ||||
| 406 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(style|head|body|html)>#s ); | |||
| 407 | 0 | 0 | 0 | next if ( $r->{content} =~ m#^\s* | |||
| 408 | 0 | 0 | 0 | next if ( $r->{content} =~ /(上|下)一(章|页|篇)/s ); | |||
| 409 | 0 | 0 | 0 | next if ( $r->{content} =~ m##s ); | |||
| 410 | 0 | 0 | 0 | next if ( $r->{content} =~ m#All rights reserved#s ); | |||
| 411 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(.+?){5,}#s ); | |||
| 412 | |||||||
| 413 | 0 | 0 | $no_next_r = $r; | ||||
| 414 | 0 | 0 | last; | ||||
| 415 | } | ||||||
| 416 | |||||||
| 417 | #my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)\w{0,20}$/s and $_->{word_num} > 50 } @out_links; | ||||||
| 418 | 0 | 0 | my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)/s | ||||
| 419 | 0 | 0 | 0 | and $_->{word_num} > 50 | |||
| 420 | } @out_links; | ||||||
| 421 | |||||||
| 422 | 0 | 0 | my $cc = $no_next_r->{content}; | ||||
| 423 | 0 | 0 | my $cc_n = $cc =~ s/(\n| ]*>| |
| 424 | 0 | 0 | 0 | 0 | return $no_next_r if ( ( $cc_n > 5 and $no_next_r->{word_num} > 50) or !@grep_next_r ); | ||
| 0 | |||||||
| 425 | |||||||
| 426 | 0 | 0 | 0 | return $grep_next_r[-1] || {}; | |||
| 427 | } ## end sub guess_novel_item | ||||||
| 428 | |||||||
| 429 | ### }}} | ||||||
| 430 | |||||||
| 431 | ### {{{ tiezi | ||||||
| 432 | |||||||
| 433 | sub get_tiezi_ref { | ||||||
| 434 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
| 435 | |||||||
| 436 | 0 | 0 | my $class = 'novel'; | ||||
| 437 | my ( $topic, $item_list ) = $self->{browser}->request_url_whole( | ||||||
| 438 | $url, | ||||||
| 439 | |||||||
| 440 | #post_data => $o{post_data}, | ||||||
| 441 | info_sub => sub { | ||||||
| 442 | 0 | 0 | 0 | $self->extract_elements( | |||
| 443 | @_, | ||||||
| 444 | path => $self->can( "scrape_$class" )->( $self ), | ||||||
| 445 | sub => $self->can( "parse_$class" ), | ||||||
| 446 | ); | ||||||
| 447 | }, | ||||||
| 448 | 0 | 0 | 0 | item_list_sub => sub { $self->can( "parse_${class}_item" )->( $self, @_ ) }, | |||
| 449 | 0 | 0 | 0 | page_list_sub => sub { $self->can( "parse_${class}_list" )->( $self, @_ ) }, | |||
| 450 | |||||||
| 451 | #min_page_num => $o{"min_page_num"}, | ||||||
| 452 | #max_page_num => $o{"max_page_num"}, | ||||||
| 453 | stop_sub => sub { | ||||||
| 454 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
| 455 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
| 456 | }, | ||||||
| 457 | 0 | 0 | %o, | ||||
| 458 | ); | ||||||
| 459 | |||||||
| 460 | 0 | 0 | $item_list = $self->update_item_list( $item_list, $url ); | ||||
| 461 | |||||||
| 462 | 0 | 0 | 0 | unshift @$item_list, $topic if ( $topic->{content} ); | |||
| 463 | my %r = ( | ||||||
| 464 | %$topic, | ||||||
| 465 | writer => $o{writer} || $topic->{writer}, | ||||||
| 466 | book => $o{book} || $topic->{book} || $topic->{title}, | ||||||
| 467 | 0 | 0 | 0 | url => $url, | |||
| 0 | |||||||
| 468 | item_list => $item_list, | ||||||
| 469 | ); | ||||||
| 470 | 0 | 0 | $self->filter_item_list( \%r, %o ); | ||||
| 471 | |||||||
| 472 | 0 | 0 | return \%r; | ||||
| 473 | } ## end sub get_tiezi_ref | ||||||
| 474 | |||||||
| 475 | ### }}} | ||||||
| 476 | |||||||
| 477 | ### {{{ iterate_ref | ||||||
| 478 | sub get_iterate_ref { | ||||||
| 479 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
| 480 | my ( $info, $item_list ) = $self->{browser}->request_url_whole( | ||||||
| 481 | $url, | ||||||
| 482 | post_data => $o{post_data}, | ||||||
| 483 | info_sub => sub { | ||||||
| 484 | $self->extract_elements( | ||||||
| 485 | @_, | ||||||
| 486 | path => {}, | ||||||
| 487 | 0 | 0 | sub => sub { my ( $self, $html_ref, $r ) = @_; return $r; }, | ||||
| 0 | 0 | ||||||
| 488 | 0 | 0 | 0 | ); | |||
| 489 | }, | ||||||
| 490 | 0 | 0 | 0 | item_sub => sub { my ( $self, $html_ref ) = @_; return {}; }, | |||
| 0 | 0 | ||||||
| 491 | 0 | 0 | 0 | item_list_sub => sub { my ( $self, $html_ref ) = @_; return []; }, | |||
| 0 | 0 | ||||||
| 492 | |||||||
| 493 | #min_page_num => $o{"min_page_num"}, | ||||||
| 494 | #max_page_num => $o{"max_page_num"}, | ||||||
| 495 | stop_sub => sub { | ||||||
| 496 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
| 497 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
| 498 | }, | ||||||
| 499 | 0 | 0 | %o, | ||||
| 500 | ); | ||||||
| 501 | |||||||
| 502 | 0 | 0 | $info->{item_list} = $self->update_item_list( $item_list, $url ); | ||||
| 503 | |||||||
| 504 | 0 | 0 | return $info; | ||||
| 505 | } ## end sub get_iterate_ref | ||||||
| 506 | |||||||
| 507 | ### }}} | ||||||
| 508 | |||||||
| 509 | ### {{{ base | ||||||
| 510 | |||||||
| 511 | sub update_item_list { | ||||||
| 512 | 5 | 5 | 0 | 14 | my ( $self, $arr, $base_url ) = @_; | ||
| 513 | |||||||
| 514 | 5 | 9 | my %rem; | ||||
| 515 | 5 | 11 | for my $chap (@$arr){ | ||||
| 516 | 22 | 50 | 0 | 55 | $chap = { url => $chap || '' } if ( ref( $chap ) ne 'HASH' ); | ||
| 517 | 22 | 100 | 48 | if ( $chap->{url} ) { | |||
| 518 | 10 | 24 | $chap->{url} = $self->format_abs_url( $chap->{url}, $base_url ); | ||||
| 519 | 10 | 9283 | $rem{ $chap->{url} }++; | ||||
| 520 | } | ||||||
| 521 | } | ||||||
| 522 | |||||||
| 523 | 5 | 10 | my $i = 0; | ||||
| 524 | 5 | 12 | my @res; | ||||
| 525 | 5 | 10 | for my $chap ( @$arr ) { | ||||
| 526 | 22 | 50 | 66 | 63 | if($chap->{url} and $rem{ $chap->{url} }>1){ | ||
| 527 | 0 | 0 | $rem{$chap->{url}}--; | ||||
| 528 | }else{ | ||||||
| 529 | 22 | 33 | ++$i; | ||||
| 530 | 22 | 66 | 67 | $chap->{pid} //= $i; #page id | |||
| 531 | 22 | 66 | 51 | $chap->{id} //= $i; #item id | |||
| 532 | 22 | 100 | 54 | $chap->{content} //= ''; | |||
| 533 | 22 | 50 | 55 | push @res, $chap unless($chap->{content}=~m#正在手打中#s); | |||
| 534 | } | ||||||
| 535 | } | ||||||
| 536 | |||||||
| 537 | 5 | 33 | 29 | while(@res and $res[-1]{content}=~m#正在手打中#s ){ | |||
| 538 | 0 | 0 | pop @res; | ||||
| 539 | } | ||||||
| 540 | |||||||
| 541 | #$i = $arr->[-1]{id} if ( $#$arr >= 0 and exists $arr->[-1]{id} and $arr->[-1]{id} > $i ); | ||||||
| 542 | 5 | 100 | 31 | return wantarray ? ( \@res, $i ) : \@res; | |||
| 543 | } ## end sub update_item_list | ||||||
| 544 | |||||||
| 545 | sub format_abs_url { | ||||||
| 546 | 12 | 12 | 0 | 28 | my ( $self, $url, $base_url ) = @_; | ||
| 547 | 12 | 50 | 24 | return $url unless ( $base_url ); | |||
| 548 | 12 | 100 | 43 | return $url unless ( $base_url =~ /^https?:/ ); | |||
| 549 | 10 | 36 | my $abs_url = URI->new_abs( $url, $base_url )->as_string; | ||||
| 550 | } | ||||||
| 551 | |||||||
| 552 | sub extract_elements { | ||||||
| 553 | 2 | 2 | 0 | 12 | my ( $self, $h, %o ) = @_; | ||
| 554 | 2 | 50 | 11 | $o{path} ||= {}; | |||
| 555 | |||||||
| 556 | 2 | 6 | my $r = {}; | ||||
| 557 | 2 | 6 | while ( my ( $xk, $xr ) = each %{ $o{path} } ) { | ||||
| 3 | 24 | ||||||
| 558 | 1 | 11 | $r->{$xk} = $self->scrape_element_try( $h, $xr ); | ||||
| 559 | } | ||||||
| 560 | 2 | 50 | 18 | $r = $o{sub}->( $self, $h, $r ) if ( $o{sub} ); | |||
| 561 | 2 | 13 | return $r; | ||||
| 562 | } | ||||||
| 563 | |||||||
| 564 | sub scrape_element_try { | ||||||
| 565 | 1 | 1 | 0 | 4 | my ( $self, $h, $r_list, %o ) = @_; | ||
| 566 | 1 | 3 | my $c; | ||||
| 567 | 1 | 5 | for my $path_or_regex ( @$r_list ) { | ||||
| 568 | 2 | 13 | $c = $self->scrape_element( $h, $path_or_regex ); | ||||
| 569 | 2 | 50 | 13 | next unless ( $c ); | |||
| 570 | 0 | 0 | 0 | $c = $o{sub}->( $self, $c ) if ( exists $o{sub} ); | |||
| 571 | 0 | 0 | 0 | next unless ( $c ); | |||
| 572 | 0 | 0 | return $c; | ||||
| 573 | } | ||||||
| 574 | 1 | 8 | return; | ||||
| 575 | } | ||||||
| 576 | |||||||
| 577 | sub scrape_element { | ||||||
| 578 | 2 | 2 | 0 | 8 | my ( $self, $h, $o ) = @_; | ||
| 579 | 2 | 50 | 10 | return $self->extract_regex_element( $h, $o->{regex} ) if ( $o->{regex} ); | |||
| 580 | 2 | 50 | 11 | return $o->{sub}->( $h ) unless ( $o->{path} ); | |||
| 581 | |||||||
| 582 | 2 | 50 | 15 | $o->{extract} ||= 'TEXT'; | |||
| 583 | |||||||
| 584 | my $parse = $o->{is_list} | ||||||
| 585 | 0 | 0 | 0 | ? scraper { process $o->{path}, 'data[]' => $o->{extract}; } | |||
| 586 | 2 | 50 | 2 | 22 | : scraper { process_first $o->{path}, 'data' => $o->{extract}; }; | ||
| 2 | 123653 | ||||||
| 587 | 2 | 31 | my $r = $parse->scrape( $h ); | ||||
| 588 | 2 | 50 | 115359 | return unless ( defined $r->{data} ); | |||
| 589 | |||||||
| 590 | 0 | 0 | 0 | return $r->{data} unless ( $o->{sub} ); | |||
| 591 | 0 | 0 | return $o->{sub}->( $r->{data} ); | ||||
| 592 | } | ||||||
| 593 | |||||||
| 594 | sub extract_regex_element { | ||||||
| 595 | 0 | 0 | 0 | 0 | my ( $self, $h, $reg ) = @_; | ||
| 596 | 0 | 0 | my ( $d ) = $$h =~ m#$reg#s; | ||||
| 597 | 0 | 0 | return $d; | ||||
| 598 | } | ||||||
| 599 | |||||||
| 600 | sub filter_item_list { | ||||||
| 601 | 2 | 2 | 0 | 5 | my ( $self, $r, %o ) = @_; | ||
| 602 | |||||||
| 603 | 2 | 5 | my $flist = $r->{item_list}; | ||||
| 604 | |||||||
| 605 | #$r->{item_num} //= $flist->[-1]{id} // scalar( @$flist ); | ||||||
| 606 | |||||||
| 607 | 2 | 16 | $flist->[$_]{content} = $self->tidy_content( $flist->[$_]{content} ) for ( 0 .. $#$flist ); | ||||
| 608 | |||||||
| 609 | 2 | 7 | $flist = [ grep { $self->{browser}->is_item_in_range( $_->{id}, $o{min_item_num}, $o{max_item_num} ) } @$flist ]; | ||||
| 6 | 27 | ||||||
| 610 | |||||||
| 611 | 2 | 11 | $self->calc_content_wordnum( $_ ) for @$flist; | ||||
| 612 | |||||||
| 613 | 0 | 0 | $flist = [ grep { $_->{word_num} >= $o{min_content_word_num} } @$flist ] | ||||
| 614 | 2 | 50 | 6 | if ( $o{min_content_word_num} ); | |||
| 615 | |||||||
| 616 | 0 | 0 | $flist = [ grep { $_->{writer} eq $r->{writer} } @$flist ] | ||||
| 617 | 2 | 50 | 5 | if ( $o{only_poster} ); | |||
| 618 | |||||||
| 619 | 0 | 0 | $flist = [ grep { $_->{content} =~ /$o{grep_content}/s } @$flist ] | ||||
| 620 | 2 | 50 | 5 | if ( $o{grep_content} ); | |||
| 621 | |||||||
| 622 | 0 | 0 | $flist = [ grep { $_->{content} !~ /$o{filter_content}/s } @$flist ] | ||||
| 623 | 2 | 50 | 6 | if ( $o{filter_content} ); | |||
| 624 | |||||||
| 625 | 2 | 50 | 5 | $flist = [ grep { defined $_->{content} and $_->{content} =~ /\S/s } @$flist ]; | |||
| 6 | 42 | ||||||
| 626 | |||||||
| 627 | 2 | 50 | 12 | $r->{item_list} = $flist || []; | |||
| 628 | |||||||
| 629 | 2 | 5 | return $self; | ||||
| 630 | } ## end sub filter_item_list | ||||||
| 631 | |||||||
| 632 | sub calc_content_wordnum { | ||||||
| 633 | 6 | 6 | 0 | 12 | my ( $self, $f ) = @_; | ||
| 634 | 6 | 50 | 15 | return if ( $f->{word_num} ); | |||
| 635 | 6 | 50 | 12 | my $wd = $f->{content} || ''; | |||
| 636 | 6 | 46 | $wd =~ s/<[^>]+>//gs; | ||||
| 637 | 6 | 25 | $wd =~ s/\s+//sg; | ||||
| 638 | 6 | 54 | $f->{word_num} = $wd =~ s/\S//gs; | ||||
| 639 | 6 | 19 | return $f; | ||||
| 640 | } | ||||||
| 641 | |||||||
| 642 | sub tidy_writer_book { | ||||||
| 643 | 0 | 0 | 0 | 0 | my ( $self, $c ) = @_; | ||
| 644 | 0 | 0 | 0 | return unless ( defined $c ); | |||
| 645 | 0 | 0 | for ( $c ) { | ||||
| 646 | 0 | 0 | s/作\s*者://; | ||||
| 647 | 0 | 0 | s/^\s*作者-\s*//; | ||||
| 648 | 0 | 0 | s/小说全集//; | ||||
| 649 | 0 | 0 | s/作品全集//; | ||||
| 650 | 0 | 0 | s/专栏//; | ||||
| 651 | 0 | 0 | s/^.*版权属于作者([^,]+)$/$1/; | ||||
| 652 | 0 | 0 | s/\s*最新章节\s*$//; | ||||
| 653 | 0 | 0 | s/全文阅读//; | ||||
| 654 | 0 | 0 | s/在线阅读//; | ||||
| 655 | 0 | 0 | s/全集下载//; | ||||
| 656 | 0 | 0 | s/章节目录//; | ||||
| 657 | 0 | 0 | s/^\s*《(.*)》\s*$/$1/; | ||||
| 658 | 0 | 0 | s/^\s+|\s+$//g; | ||||
| 659 | 0 | 0 | s/\s+//g; | ||||
| 660 | } | ||||||
| 661 | 0 | 0 | return $c; | ||||
| 662 | } ## end sub tidy_writer_book | ||||||
| 663 | |||||||
| 664 | sub tidy_content { | ||||||
| 665 | 6 | 6 | 0 | 13 | my ( $self, $c ) = @_; | ||
| 666 | 6 | 12 | for ( $c ) { | ||||
| 667 | 6 | 50 | 12 | last unless ( $c ); | |||
| 668 | 6 | 14 | s# ##sg; | ||||
| 669 | 6 | 10 | s# #\n#sg; | ||||
| 670 | 6 | 18 | s#\s{5,}#\n#sg; | ||||
| 671 | 6 | 12 | s###sg; | ||||
| 672 | 6 | 54 | s#\s*\<[^>]+?\>#\n#sg; | ||||
| 673 | 6 | 32 | s{\n+}{\n}sg; | ||||
| 674 | 6 | 54 | s{\s*(\S.*?)\s*\n}{\n $1 }sg; |
| 675 | 6 | 18 | s#\s+上一章\s+.+?下一章.+$##s; | ||||
| 676 | 6 | 11 | s#[^\n]+加入书签[^\n]+##s; | ||||
| 677 | } | ||||||
| 678 | 6 | 23 | return $c; | ||||
| 679 | } | ||||||
| 680 | |||||||
| 681 | sub tidy_string { | ||||||
| 682 | 4 | 4 | 0 | 10 | my ( $self, $c ) = @_; | ||
| 683 | 4 | 50 | 9 | $c ||= ''; | |||
| 684 | 4 | 8 | for ( $c ) { | ||||
| 685 | 4 | 14 | s/^\s+|\s+$//gs; | ||||
| 686 | 4 | 8 | s/[\*\/\\\[\(\)]+//g; | ||||
| 687 | 4 | 16 | s/[[:punct:]]//sg; | ||||
| 688 | 4 | 9 | s/[\]\s+\/\\]/-/g; | ||||
| 689 | } | ||||||
| 690 | |||||||
| 691 | 4 | 14 | return $c; | ||||
| 692 | } | ||||||
| 693 | |||||||
| 694 | sub get_inner_html { | ||||||
| 695 | 0 | 0 | 0 | my ( $self, $h ) = @_; | |||
| 696 | |||||||
| 697 | 0 | 0 | return '' unless ( $h ); | ||||
| 698 | |||||||
| 699 | 0 | my $head_i = index( $h, '>' ); | |||||
| 700 | 0 | substr( $h, 0, $head_i + 1 ) = ''; | |||||
| 701 | |||||||
| 702 | 0 | my $tail_i = rindex( $h, '<' ); | |||||
| 703 | 0 | substr( $h, $tail_i ) = ''; | |||||
| 704 | |||||||
| 705 | 0 | return $h; | |||||
| 706 | } | ||||||
| 707 | |||||||
| 708 | sub unescape_js { | ||||||
| 709 | 0 | 0 | 0 | my ( $self, $s ) = @_; | |||
| 710 | 0 | $s =~ s/%u([0-9a-f]{4})/chr(hex($1))/eigs; | |||||
| 0 | |||||||
| 711 | 0 | $s =~ s/%([0-9a-f]{2})/chr(hex($1))/eigs; | |||||
| 0 | |||||||
| 712 | 0 | return $s; | |||||
| 713 | } | ||||||
| 714 | |||||||
| 715 | sub encode_cjk_for_url { | ||||||
| 716 | 0 | 0 | 0 | my ( $self, $key ) = @_; | |||
| 717 | 0 | my $b = uc( unpack( "H*", encode( $self->charset(), $key ) ) ); | |||||
| 718 | 0 | $b =~ s/(..)/%$1/g; | |||||
| 719 | 0 | return $b; | |||||
| 720 | } | ||||||
| 721 | |||||||
| 722 | ### }}} | ||||||
| 723 | |||||||
| 724 | 1; | ||||||
| 725 |
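
Reusing the $parser constructed in the sketch near the top of this report, here is a companion sketch of the index-fetching path exercised above (get_novel_ref at lines 126-175, filtered by filter_item_list at lines 600-630). The option names come from the covered code, while the URL and the output format are illustrative:

```perl
# get_novel_ref() downloads the index page, scrapes writer/book/item_list,
# then update_item_list() and filter_item_list() normalize, deduplicate and
# trim the chapter list before the hashref is returned.
my $r = $parser->get_novel_ref(
    'https://www.jjwxc.net/onebook.php?novelid=12345',  # hypothetical index URL
    min_item_num => 1,     # keep only chapters whose id lies in this range
    max_item_num => 10,    # (checked via the browser's is_item_in_range)
);

printf "%s - %s: %d chapters\n",
    $r->{writer}, $r->{book}, scalar @{ $r->{item_list} };
```
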