File Coverage

blib/lib/Novel/Robot/Parser/txt.pm

Criterion	Covered	Total	%
statement	91	91	100.0
branch	8	12	66.6
condition	4	10	40.0
subroutine	14	14	100.0
pod	1	5	20.0
total	118	132	89.3

line	stmt	bran	cond	sub	pod	time	code
1							# ABSTRACT: txt parser
2							=pod
3
4							=encoding utf8
5
6							=head1 FUNCTION
7
8							=head2 parse_novel
9
10							解析txt
11
12							my $txt_content_ref = $self->parse_novel(
13							[ '/somedir/', '/someotherdir/somefile.txt' ],
14							writer => 'some_writer',
15							book => 'some_book',
16							chapter_regex => qr/(第\d+章)/,
17							);
18
19
20							=cut
21							package Novel::Robot::Parser::txt;
22	1			1		6	use strict;
	1					1
	1					27
23	1			1		4	use warnings;
	1					2
	1					22
24	1			1		5	use base 'Novel::Robot::Parser';
	1					1
	1					85
25
26	1			1		449	use File::Find::Rule;
	1					7277
	1					6
27	1			1		49	use Encode;
	1					2
	1					80
28	1			1		5	use Encode::Locale;
	1					2
	1					51
29	1			1		7	use Encode::Detect::CJK qw/detect/;
	1					2
	1					43
30	1			1		5	use utf8;
	1					2
	1					4
31
32
33							sub parse_novel {
34	2			2	1	4	my ($self, $path, %opt) = @_;
35	2		33			13	$opt{chapter_regex} \|\|= get_default_chapter_regex();
36
37	2					2	my %data;
38	2		50			10	$data{writer} = $opt{writer} \|\| 'unknown';
39	2		50			8	$data{book} = $opt{book} \|\| 'unknown';
40
41	2	50				7	my $p_ref = ref($path) eq 'ARRAY' ? $path : [ $path ];
42	2					4	for my $p (@$p_ref){
43	2					53	my @txts = sort File::Find::Rule->file()->in($p);
44	2					1131	for my $txt (@txts){
45	2					8	my $txt_data_ref = $self->read_single_txt($txt, %opt);
46	2					16	my $txt_file = decode(locale => $txt);
47	2					126	for my $t (@$txt_data_ref){
48							#$t->{url} = $txt_file;
49	6					8	push @{$data{item_list}}, $t;
	6					14
50							}
51							}
52							}
53
54	2					14	$self->update_item_list($data{item_list});
55
56							#$data{url} = '';
57
58	2					10	return \%data;
59							}
60
61							sub get_default_chapter_regex {
62							#指定分割章节的正则表达式
63
64							#序号
65	2			2	0	8	my $r_num =
66							qr/[０１２３４５６７８９零○〇一二三四五六七八九十百千\d]+/;
67	2					6	my $r_split = qr/[上中下]/;
68	2					5	my $r_not_chap_head = qr/引子\|楔子\|尾声\|内容简介\|正文\|番外\|终章\|序言\|后记\|文案/;
69
70							#第x章，卷x，第x章(大结局)，尾声x
71	2					57	my $r_head = qr/(卷\|第\|$r_not_chap_head)?/;
72	2					5	my $r_tail = qr/(章\|卷\|回\|部\|折)?/;
73	2					4	my $r_post = qr/([.\s\-\(\/（]+.{0,35})?/;
74	2					99	my $regex_a = qr/(【?$r_head\s$r_num\s$r_tail$r_post】?)/;
75
76							#(1)，(1)xxx
77							#xxx(1)，xxx(1)yyy
78							#(1-上\|中\|下)
79	2					27	my $regex_b_index = qr/[(（]$r_num[）)]/;
80	2					24	my $regex_b_tail = qr/$regex_b_index\s*\S+/;
81	2					23	my $regex_b_head = qr/\S+\s*$regex_b_index.{0,10}/;
82	2					40	my $regex_b_split = qr/[(（]$r_num[-－]$r_split[）)]/;
83	2					88	my $regex_b = qr/$regex_b_head\|$regex_b_tail\|$regex_b_index\|$regex_b_split/;
84
85							#1、xxx，一、xxx
86	2					38	my $regex_c = qr/$r_num[、．. ].{0,10}/;
87
88							#第x卷 xxx 第x章 xxx
89							#第x卷/第x章 xxx
90	2					101	my $regex_d = qr/($regex_a(\s+.{0,10})?){2}/;
91
92							#后记 xxx
93	2					68	my $regex_e = qr/(【?$r_not_chap_head\s*$r_post】?)/;
94
95							#总体
96	2					352	my $chap_r = qr/^\s($regex_a\|$regex_b\|$regex_c\|$regex_d\|$regex_e)\s$/m;
97
98	2					24	return $chap_r;
99							}
100
101
102
103							sub read_single_txt {
104
105							#读入单个txt文件
106	2			2	0	6	my ($self, $txt, %opt) = @_;
107
108	2					5	my $charset = $self->detect_file_charset($txt);
109	1			1		7	open my $sh, "<:encoding($charset)", $txt;
	1					2
	1					8
	2					2878
110
111	2					5902	my @data;
112	2					8	my ( $single_toc, $single_content ) = ( '', '' );
113
114							#第一章
115	2					65	while (<$sh>) {
116	2	50				24	next unless /\S/;
117	2	50				27	$single_toc = /$opt{chapter_regex}/ ? $1 : $_;
118	2					4	last;
119							} ## end while (<$sh>)
120
121							#后续章节
122	2					8	while (<$sh>) {
123	26	100				65	next unless /\S/;
124	16	100				125	if ( my ($new_single_toc) = /$opt{chapter_regex}/ ) {
125	4	50	33			19	if ( $single_toc =~ /\S/ and $single_content =~ /\S/s ) {
126	4					14	push @data, { title => $single_toc, content => $single_content };
127	4					7	$single_toc = '';
128							} ## end if ( $single_toc =~ /\S/...)
129	4					9	$single_toc .= $new_single_toc . "\n";
130	4					13	$single_content = '';
131							}
132							else {
133	12					53	$single_content .= $_;
134							} ## end else [ if ( my ($new_single_toc...))]
135							} ## end while (<$sh>)
136
137	2					11	push @data, { title => $single_toc, content => $single_content };
138	2					9	$self->format_chapter_content($_) for @data;
139
140	2					35	return \@data;
141							} ## end sub read_single_txt
142
143							sub format_chapter_content {
144	6			6	0	12	my ($self, $r) = @_;
145	6					9	for ($r->{content}) {
146	6					12	s##\n#gi;
147	6					41	s#\s(.\S)\s*# $1 \n#gm;
148	6					25	s# \s* ##g;
149							} ## end for ($chap_c)
150
151	6					12	return $self;
152							}
153
154							sub detect_file_charset {
155	2			2	0	4	my ($self, $file) = @_;
156	2					69	open my $fh, '<', $file;
157	2					59	read $fh, my $text, 360;
158	2					11	return detect($text);
159							} ## end sub detect_file_charset
160
161							1;