| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package NewsExtractor::TextUtil; |
|
2
|
6
|
|
|
6
|
|
954046
|
use strict; |
|
|
6
|
|
|
|
|
43
|
|
|
|
6
|
|
|
|
|
192
|
|
|
3
|
6
|
|
|
6
|
|
32
|
use warnings; |
|
|
6
|
|
|
|
|
12
|
|
|
|
6
|
|
|
|
|
188
|
|
|
4
|
6
|
|
|
6
|
|
623
|
use Encode qw(is_utf8 decode_utf8); |
|
|
6
|
|
|
|
|
10523
|
|
|
|
6
|
|
|
|
|
393
|
|
|
5
|
6
|
|
|
6
|
|
2860
|
use Mojo::DOM; |
|
|
6
|
|
|
|
|
973047
|
|
|
|
6
|
|
|
|
|
1539
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Public function names offered by this package.
# NOTE(review): no exporter module is loaded in the visible code; presumably
# this list is read by an external importer -- confirm against callers.
our @EXPORT = qw(
    u
    normalize_whitespace
    html2text
    is_empty
    parse_dateline_ymdhms
    reformat_dateline
    remove_control_characters
);
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Return the argument as a character string.
#
# An undefined argument yields undef. Any other value is stringified; when
# Encode::is_utf8 reports that the string already has Perl's UTF-8 flag set
# it is returned as is, otherwise the result of decode_utf8 is returned.
sub u($) {
    my ($input) = @_;
    return undef unless defined $input;

    # Force a plain string copy so the checks below never see a reference
    # or a number.
    my $string = "$input";

    return $string if is_utf8($string);
    return decode_utf8($string);
}
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# True when the first argument is undef or the empty string, false otherwise.
# Values such as 0, '0' or ' ' are defined and non-empty, so they are false.
sub is_empty {
    my ($value) = @_;

    return 1 unless defined $value;
    return $value eq '';
}
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Return a copy of the first argument with whitespace tidied up:
#   - every run of horizontal whitespace (\h) becomes a single space,
#   - CRLF pairs become a bare LF,
#   - leading and trailing whitespace (including newlines) is removed.
# The caller's value is not modified.
sub normalize_whitespace {
    my ($text) = @_;

    $text =~ s/\h+/ /g;
    $text =~ s/\r\n/\n/g;

    # Trim both ends; \s also covers newlines left after the steps above.
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;

    return $text;
}
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Return a copy of the first argument with every character matching
# \p{PosixCntrl} deleted. Note that this class includes tab, CR and LF, so
# line breaks are removed too. The caller's value is not modified.
sub remove_control_characters {
    my ($text) = @_;

    $text =~ s/\p{PosixCntrl}//g;

    return $text;
}
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Convert an HTML fragment to plain text.
#
# The fragment is wrapped in <body>...</body> and parsed with Mojo::DOM.
# Each <br> is replaced by a newline and each <div>/<p> gets a blank line
# appended after it. The resulting text is split on blank lines; every piece
# goes through normalize_whitespace() and then remove_control_characters(),
# pieces that end up empty are dropped, and the rest are joined with "\n\n".
sub html2text {
    my ($html) = @_;

    my $dom = Mojo::DOM->new('<body>' . $html . '</body>');
    $dom->find('br')->map(replace => "\n");
    $dom->find('div,p')->map(append => "\n\n");

    my @paragraphs;
    for my $chunk (split /\n\n+/, $dom->all_text) {
        my $paragraph = remove_control_characters(normalize_whitespace($chunk));
        push @paragraphs, $paragraph if $paragraph ne '';
    }

    return join "\n\n", @paragraphs;
}
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Rebuild a date or date-time string from the digit groups found in $text.
#
# Arguments:
#   $text   - string to scan; its runs of ASCII digits are used positionally
#             (presumably year, month, day, hour, minute, second -- the
#             code only relies on their order).
#   $offset - optional suffix appended verbatim to the result; defaults
#             to ''.
#
# Returns:
#   undef                           when $text is undef or has fewer than
#                                   three digit groups;
#   'NNNN-NN-NN' . $offset          when there are exactly three groups;
#   'NNNN-NN-NNTNN:NN:NN' . $offset otherwise, with a missing fifth or
#                                   sixth group treated as 0.
# Groups beyond the sixth are ignored. The string is passed through u()
# before being returned.
sub reformat_dateline {
    my ($text, $offset) = @_;
    $offset //= '';

    # Nothing to scan; bail out before the match would warn on undef.
    return undef unless defined $text;

    my @t = $text =~ m/([0-9]+)/g;
    return undef unless 3 <= @t;

    my $format_date = '%04d-%02d-%02d';
    my $format_time = '%02d:%02d:%02d';

    if (@t == 3) {
        my $format = $format_date . '%s';
        return u(sprintf($format, $t[0], $t[1], $t[2], $offset));
    }

    # Four or more groups: $t[3] is present, but $t[4] and $t[5] may not be.
    # Default exactly the slots sprintf reads below so that a short input
    # such as "2020-01-02 13" does not raise an "uninitialized" warning.
    $t[4] //= 0;
    $t[5] //= 0;

    my $format = $format_date . 'T' . $format_time . '%s';
    return u(sprintf($format, $t[0], $t[1], $t[2], $t[3], $t[4], $t[5], $offset));
}
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Build a 'YYYY-MM-DDTHH:MM:SS' . $offset string from the first six runs of
# ASCII digits found in $text, taken in that order.
#
# Arguments:
#   $text   - string to scan for digit groups.
#   $offset - optional suffix appended verbatim; defaults to ''.
#
# A missing hour, minute or second defaults to 23, 59 and 59 respectively.
# NOTE(review): there is no check for fewer than three digit groups; missing
# date parts reach sprintf as undef -- confirm callers always supply a date.
# The string is passed through u() before being returned.
sub parse_dateline_ymdhms {
    my ($text, $offset) = @_;
    $offset = '' unless defined $offset;

    my ($year, $month, $day, $hour, $minute, $second) = $text =~ m/([0-9]+)/g;

    $hour   = 23 unless defined $hour;
    $minute = 59 unless defined $minute;
    $second = 59 unless defined $second;

    my $formatted = sprintf(
        '%04d-%02d-%02dT%02d:%02d:%02d%s',
        $year, $month, $day, $hour, $minute, $second, $offset
    );

    return u($formatted);
}
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
1; |