line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::TextUtil; |
2
|
6
|
|
|
6
|
|
933351
|
use strict; |
|
6
|
|
|
|
|
72
|
|
|
6
|
|
|
|
|
201
|
|
3
|
6
|
|
|
6
|
|
33
|
use warnings; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
262
|
|
4
|
6
|
|
|
6
|
|
673
|
use Encode qw(is_utf8 decode_utf8); |
|
6
|
|
|
|
|
11414
|
|
|
6
|
|
|
|
|
386
|
|
5
|
6
|
|
|
6
|
|
2963
|
use Mojo::DOM; |
|
6
|
|
|
|
|
1020258
|
|
|
6
|
|
|
|
|
1721
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Names of the subroutines this module offers for export.
our @EXPORT = qw(
    u
    normalize_whitespace
    html2text
    is_empty
    parse_dateline_ymdhms
    reformat_dateline
    remove_control_characters
);
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Coerce a value into a Perl character string.
#
# Returns undef when the argument is undef. Otherwise the argument is
# stringified; a string that already carries Perl's UTF-8 flag is returned
# unchanged, anything else is treated as UTF-8 octets and decoded.
#
# The ($) prototype is kept deliberately: it affects how call sites parse.
sub u($) {
    my ($value) = @_;

    # Explicit undef (not a bare return) so list-context callers still
    # receive exactly one element.
    return undef unless defined $value;

    my $string = '' . $value;
    return $string if is_utf8($string);
    return decode_utf8($string);
}
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# True when the first argument is undef or the empty string; false for
# everything else (including '0' and whitespace-only strings).
sub is_empty {
    my ($value) = @_;
    return !!1 unless defined $value;
    return $value eq '';
}
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Return a whitespace-normalized copy of the given string.
#
# Steps, in order:
#   1. every run of horizontal whitespace becomes a single space,
#   2. CRLF line endings become LF,
#   3. leading and trailing whitespace (of any kind) is stripped.
# The caller's string is not modified.
sub normalize_whitespace {
    my ($text) = @_;

    $text =~ s/\h+/ /g;
    $text =~ s/\r\n/\n/g;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;

    return $text;
}
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Return a copy of the given string with every character matching
# \p{PosixCntrl} deleted. Note that this class includes tab and newline,
# so those are removed as well. The caller's string is not modified.
sub remove_control_characters {
    my ($text) = @_;
    $text =~ s/\p{PosixCntrl}//g;
    return $text;
}
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Convert an HTML fragment into plain text.
#
# The fragment is wrapped in a <body> element and parsed with Mojo::DOM.
# Every <br> is replaced by a newline and a blank line is appended after
# every <div> and <p>. The document text is then split on blank lines;
# each piece goes through normalize_whitespace() and then
# remove_control_characters(), and pieces that end up empty are dropped.
#
# Returns the surviving pieces joined by a blank line ("\n\n").
sub html2text {
    my ($html) = @_;

    my $dom = Mojo::DOM->new('<body>' . $html . '</body>');

    # Turn the markup that implies line/paragraph breaks into real newlines
    # before the text is extracted.
    $dom->find('br')->each(sub { $_->replace("\n") });
    $dom->find('div,p')->each(sub { $_->append("\n\n") });

    my @paragraphs;
    for my $chunk (split /\n\n+/, $dom->all_text) {
        my $cleaned = remove_control_characters(normalize_whitespace($chunk));
        push @paragraphs, $cleaned if $cleaned ne '';
    }

    return join "\n\n", @paragraphs;
}
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Rebuild a dateline from the digit groups found in $text.
#
# Parameters:
#   $text   - free-form text containing a date, optionally followed by a
#             time; only its runs of ASCII digits are used, in order.
#   $offset - optional string appended verbatim after the formatted value
#             (defaults to the empty string).
#
# Returns (always passed through u()):
#   - undef when $text is undef or has fewer than three digit groups;
#   - 'YYYY-MM-DD' . $offset when exactly three groups are found;
#   - 'YYYY-MM-DDTHH:MM:SS' . $offset otherwise, with missing minutes
#     and/or seconds filled in as 0. Digit groups beyond the sixth are
#     ignored.
sub reformat_dateline {
    my ($text, $offset) = @_;
    $offset //= '';

    # Explicit undef (not a bare return) so list-context callers still
    # receive exactly one element.
    return undef unless defined $text;

    my @t = $text =~ m/([0-9]+)/g;
    return undef unless 3 <= @t;

    my $format_date = '%04d-%02d-%02d';
    my $format_time = '%02d:%02d:%02d';

    if (@t == 3) {
        my $format = $format_date . '%s';
        return u(sprintf($format, $t[0], $t[1], $t[2], $offset));
    }

    # At least the hour ($t[3]) is present here. Default the minute and
    # second slots, which are the ones sprintf reads below.
    $t[4] //= 0;
    $t[5] //= 0;

    my $format = $format_date . 'T' . $format_time . '%s';
    return u(sprintf($format, $t[0], $t[1], $t[2], $t[3], $t[4], $t[5], $offset));
}
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Build a 'YYYY-MM-DDTHH:MM:SS' timestamp from the digit groups in $text.
#
# Parameters:
#   $text   - text whose runs of ASCII digits are read, in order, as
#             year, month, day, hour, minute, second.
#   $offset - optional string appended verbatim after the timestamp
#             (defaults to the empty string).
#
# Missing time fields default to the end of the day: hour 23, minute 59,
# second 59. Digit groups beyond the sixth are ignored. The result is
# passed through u().
sub parse_dateline_ymdhms {
    my ($text, $offset) = @_;

    my ($year, $month, $day, $hour, $minute, $second)
        = $text =~ m/([0-9]+)/g;

    $hour   //= 23;
    $minute //= 59;
    $second //= 59;
    $offset //= '';

    my $timestamp = sprintf(
        '%04d-%02d-%02dT%02d:%02d:%02d%s',
        $year, $month, $day, $hour, $minute, $second, $offset
    );

    return u($timestamp);
}
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
1; |