| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::BookBot::Chinese; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
755
|
use 5.008; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
38
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
35
|
|
|
6
|
1
|
|
|
1
|
|
5
|
no warnings qw(uninitialized); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
48
|
|
|
7
|
1
|
|
|
1
|
|
5
|
use base qw(WWW::BookBot); |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
808
|
|
|
8
|
1
|
|
|
1
|
|
13
|
use vars qw($VERSION); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
764
|
|
|
9
|
|
|
|
|
|
|
$VERSION = '0.12'; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
12
|
|
|
|
|
|
|
# Default settings |
|
13
|
|
|
|
|
|
|
# $class->default_settings => \%settings |
|
14
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
15
|
|
|
|
|
|
|
# Build the default settings for the Chinese bot.
# Starts from the parent class defaults and overrides the language
# selection plus the decode/encode charset names.
# Returns the settings hash reference.
sub default_settings {
    my $class    = shift;
    my $settings = $class->SUPER::default_settings;

    @{$settings}{qw(get_language language_decode language_encode)}
        = ('zh-cn', 'gbk', 'gbk');

    return $settings;
}
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
24
|
|
|
|
|
|
|
# Redefined functions |
|
25
|
|
|
|
|
|
|
# $bot->decode_entity($content_dein_deout) => N/A |
|
26
|
|
|
|
|
|
|
# $bot->trandict_init => $bot->{translate_dict} |
|
27
|
|
|
|
|
|
|
# $bot->msg_init => $bot->{messages} |
|
28
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
29
|
|
|
|
|
|
|
# Decode HTML character references in place.
#   $bot->decode_entity($content)
# The text is modified through $_[1] (the caller's variable); nothing is
# copied. The return value is whatever the final substitution yields.
#
# Passes, in order:
#   1. decimal references   &#NNNN;
#   2. hex references       &#xHHHH;
#   3. named references     &name;  (looked up in %WWW::BookBot::entity2char;
#                                    unknown names are left untouched)
#   4. U+2022 (bullet) is normalised to U+00B7 (middle dot)
#
# Chinese novels sometimes add \x{FF1B} (full-width semicolon) after an
# unknown unicode reference, so an optional one is swallowed too.
#
# References longer than five digits (e.g. &#131072; for U+20000, or a
# zero-padded &#x004E2D;) are only accepted when a terminator follows, so
# unterminated input such as "&#20013123" still decodes its first five
# digits exactly as before. Values above U+10FFFF are left as written.
sub decode_entity {
    $_[1] =~ s{
        ( &\#
          ( \d{6,7} (?=[;\x{FF1B}]) | \d{1,5} )
          ;? \x{FF1B}?
        )
    }{ $2 <= 0x10FFFF ? chr($2) : $1 }egsx;

    $_[1] =~ s{
        ( &\#[xX]
          ( [0-9a-fA-F]{6} (?=[;\x{FF1B}]) | [0-9a-fA-F]{1,5} )
          ;? \x{FF1B}?
        )
    }{ hex($2) <= 0x10FFFF ? chr(hex($2)) : $1 }egsx;

    # Named references: fall back to the matched text when the name is unknown.
    $_[1] =~ s/(&([0-9a-zA-Z]{1,9});?)/$WWW::BookBot::entity2char{$2} or $1/esg;

    # Normalize middle dot.
    $_[1] =~ s/\x{2022}/\x{00B7}/sg;
}
|
37
|
|
|
|
|
|
|
# Install the Chinese labels for the file-type keys on the bot.
#   $bot->trandict_init => $bot->{translate_dict}
# The message templates set up by msg_init refer to this table through
# $self->{translate_dict}->{...}.
sub trandict_init {
    my $bot = shift;

    my %label_for = (
        'log'    => "日志",
        'result' => "结果",
        'DB'     => "数据",
        'debug'  => "调试",
    );

    return $bot->{translate_dict} = \%label_for;
}
|
45
|
|
|
|
|
|
|
# Install the Chinese message templates on the bot.
#   $bot->msg_init => $bot->{messages}
# Every template is built from single-quoted pieces, so the
# $pargs->{...} and $self->{...} placeholders are stored literally here.
# NOTE(review): presumably they are expanded later by the base class when a
# message is emitted - confirm in WWW::BookBot.
sub msg_init {
    my $bot = shift;

    # Common tail for the "skipped" messages: indent level plus the URL.
    my $skip_info = "\n" . '$pargs->{levelspace} url=$pargs->{url}' . "\n";

    # Header written at the top of a generated data file.
    my $db_head = <<'DATA';
#!$pargs->{perlcmd}
##======================================
## 自动生成的数据文件,用于$pargs->{classname}
## 生成时间: $pargs->{createtime}
##======================================

use $pargs->{classname};
my \$bot = new $pargs->{classname};

DATA

    my $fail_404 = <<'DATA';
[$pargs->{code},失败] 找不到文件
$pargs->{url_real}
DATA

    my $fail_404_detail = <<'DATA';
[$pargs->{code},失败] 找不到文件
>>>>请求
$pargs->{req_content}<<<<响应
$pargs->{status_line}

DATA

    my $fail_retries = <<'DATA';
[$pargs->{code},失败] 重试太多,放弃
$pargs->{url_real}
DATA

    my $fail_retries_detail = <<'DATA';
[$pargs->{code},失败] 重试太多,放弃
>>>>请求
$pargs->{req_content}<<<<响应
$pargs->{status_line}
$pargs->{res_content}

DATA

    my %messages = (
        TestMsg              => '测试: $pargs->{TestInfo} $pargs->{TestNum}',
        BookStart            => '$pargs->{levelspace} [$pargs->{bpos_limit}/$pargs->{book_num}] $pargs->{title_limit} ',
        BookBinaryOK         => '$pargs->{data_len_KB} $pargs->{write_file}' . "\n",
        BookChapterErr       => ' - 无法分析' . $skip_info,
        BookChapterMany      => '[$pargs->{chapter_num_limit}章]',
        BookChapterOne       => '[单章节]',
        BookChapterOK        => '$pargs->{data_len_KB}' . "\n",
        BookTOCFinish        => '$pargs->{TOC_len_KB}' . "\n",
        CatalogInfo          => '取书目: ',
        CatalogResultErr     => ' 0套书' . "\n",
        CatalogResultOK      => ' $pargs->{book_num}套书' . "\n",
        CatalogURL           => '$pargs->{url}',
        CatalogURLEmpty      => '[失败] 索引的URL为空' . "\n",
        DBBookErr            => "\t" . ' \$bot->go_book({$pargs->{allargs}});' . "\t#错误\n",
        DBBookOK             => "\t" . '#\$bot->go_book({$pargs->{allargs}});' . "\n",
        DBCatalogErr         => ' \$bot->go_catalog({$pargs->{allargs}});' . "\t#错误\n",
        DBCatalogOK          => '#\$bot->go_catalog({$pargs->{allargs}});' . "\n",
        DBHead               => $db_head,
        FailClearDB          => '无法清除数据文件$pargs->{filename}: $pargs->{errmsg}',
        FailClose            => '无法关闭$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',
        FailMkDir            => '建目录$pargs->{dir}失败: $pargs->{errmsg}',
        FailOpen             => '无法打开$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',
        FailWrite            => '无法写入$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',
        GetFail404           => $fail_404,
        GetFail404Detail     => $fail_404_detail,
        GetFailRetries       => $fail_retries,
        GetFailRetriesDetail => $fail_retries_detail,
        GetURLSuccess        => '$pargs->{len_KB} ',
        GetURLRetry          => '[$pargs->{code},重试] ',
        GetWait              => '等待..',
        SkipMaxLevel         => '[跳过]层数>$self->{book_max_levels}' . $skip_info,
        SkipMedia            => '[跳过]媒体文件' . $skip_info,
        SkipTitleEmpty       => '[跳过]标题为空' . $skip_info,
        SkipUrlEmpty         => '[跳过]地址为空' . "\n",
        SkipVisited          => '[跳过]已访问过' . "\n",
        SkipZip              => '[跳过]压缩文件' . $skip_info,
    );

    return $bot->{messages} = \%messages;
}
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
117
|
|
|
|
|
|
|
# patterns |
|
118
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
119
|
|
|
|
|
|
|
# Pattern source for the "space2" pattern: a one-line character class
# followed by a newline.
# NOTE(review): only an ASCII space is visible inside the class; confirm
# against the upstream module whether a full-width space (U+3000) belongs
# here as well.
sub getpattern_space2_data {
    return <<'END_PATTERN';
[ ]
END_PATTERN
}
|
124
|
|
|
|
|
|
|
# Pattern source for the line head: a literal string with no trailing newline.
# NOTE(review): a single ASCII space is what is visible here; confirm
# against the upstream module whether full-width spaces (U+3000) were
# intended.
sub getpattern_line_head_data {
    return ' ';
}
|
127
|
|
|
|
|
|
|
# Pattern source for bracket/quote pairs: the parent class data with the
# pairs below appended, one "open close" pair per line.
# NOTE(review): the separator between the two marks shows as a single space
# and several pairs are plain ASCII; confirm against the upstream module
# whether a tab separator and full-width forms were intended.
sub getpattern_parentheses_data {
    my $self      = shift;
    my $inherited = $self->SUPER::getpattern_parentheses_data();

    return $inherited . <<'END_PAIRS';
〃 〃
‘ ’
“ ”
〔 〕
〈 〉
《 》
「 」
『 』
〖 〗
【 】
′ ′
″ ″
" "
' '
( )
< >
[ ]
` `
` '
{ }
︵ ︶
︹ ︺
︿ ﹀
︽ ︾
﹁ ﹂
﹃ ﹄
︻ ︼
︷ ︸
ˋ ˊ
‵ ‵
〝 〞
﹙ ﹚
﹛ ﹜
﹝ ﹞
﹤ ﹥
END_PAIRS
}
|
166
|
|
|
|
|
|
|
# Pattern source for "dash"-style marks: a one-line regex character class
# followed by a newline.
#
# The hyphen between '+' and '=' in the second ASCII run is escaped. Left
# bare it formed the range '+'..'=', which silently pulled digits and
# ", . / : ; <" into the class. The ranges "#-&", "─-♂", "︱-︴" and "﹉-﹏"
# are kept as written.
# NOTE(review): the ASCII marks "#%&*+-=@_" appear twice in the class; the
# second run may originally have been the full-width forms - confirm
# against the upstream module.
sub getpattern_mark_dash_data {
    return <<'DATA';
[#-&\*\+\-=@_~ˉ—~‖…×÷∷⊙≡≈∽∞$¤¢‰§#%&*+\-=@_|–―‥∣¦‐ー─-♂〇〓※︱-︴﹉-﹏﹡﹢﹣﹦﹩﹪﹫]
DATA
}
|
171
|
|
|
|
|
|
|
# Pattern source for word-splitting punctuation: a one-line regex character
# class followed by a newline.
# NOTE(review): the ASCII marks ". , ? ! : ;" appear twice (escaped, then
# bare); the bare run may originally have been the full-width forms -
# confirm against the upstream module.
sub getpattern_mark_wordsplit_data {
    return <<'END_PATTERN';
[\.\,\?\!\:\;∶、。·!,.:;?︰﹐﹑﹒﹔﹕﹖﹗]
END_PATTERN
}
|
176
|
|
|
|
|
|
|
# Pattern source for the "finished" marker: an optional 全文/全书 prefix
# followed by 完 or 终, on one line followed by a newline.
sub getpattern_word_finish_data {
    return <<'END_PATTERN';
(?:全[文书]|)[完终]
END_PATTERN
}
|
181
|
|
|
|
|
|
|
# Pattern source for line endings that mark a line for removal: one entry
# per line, covering publisher/transcriber credits, site names and site
# addresses.
# NOTE(review): the leading "(case)" line looks like a directive for the
# code that consumes this data rather than a pattern - confirm in
# WWW::BookBot.
# NOTE(review): classes such as [OoOo] and [\.。.·﹒] repeat ASCII members;
# the repeats may originally have been full-width forms - confirm against
# the upstream module.
sub getpattern_remove_line_by_end_data {
    return <<'END_PATTERNS';
(case)
[报网社讯]
[连重排整出提推扫校较编书世视文科在讨小工转][学幻论作]?(?:[载贴排版理品供出入校较描正对者屋库城路界苑线区组室]|海洋|望远镜|桃花源|-K12)(?:完成|)
请(?:申请授权|保留站台信息)[。.﹒\.!﹗]?
制作
[OoOo][CcCc][RrRr]
采编中心
亦凡公益图书馆
龙的天空
失落的星辰
书香门第
旧雨楼
一剑小天下
竹露荷风
扬剑轩居士
幻想时代
冒险者天堂
信息中心
cnread[\.。.·﹒]net
ezla[\.。.·﹒]com?[\.。.·﹒]tw
thebook[\.。.·﹒]yeah[\.。.·﹒]net
y(?:esho[\.。.·﹒]com/wenxue|uzispy[\.。.·﹒]yeah[\.。.·﹒]net)
www[\.。.·﹒](?:v-war|oldrain)[\.。.·﹒](?:net|com)
END_PATTERNS
}
|
208
|
|
|
|
|
|
|
# Pattern source for the "special" line-ending removal list: a single
# entry followed by a newline.
sub getpattern_remove_line_by_end_special_data {
    return <<'END_PATTERN';
报网社讯
END_PATTERN
}
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
1; |
|
215
|
|
|
|
|
|
|
__END__ |