line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package URL::Search; |
2
|
1
|
|
|
1
|
|
228795
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
28
|
|
3
|
1
|
|
|
1
|
|
19
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
23
|
|
4
|
1
|
|
|
1
|
|
26
|
use v5.10.0; # recursive regex subgroups |
|
1
|
|
|
|
|
4
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
5
|
use Exporter 5.57 qw(import); |
|
1
|
|
|
|
|
11
|
|
|
1
|
|
|
|
|
95
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
11
|
|
|
|
|
|
|
$URL_SEARCH_RE |
12
|
|
|
|
|
|
|
extract_urls |
13
|
|
|
|
|
|
|
partition_urls |
14
|
|
|
|
|
|
|
); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $URL_SEARCH_RE = do { |
17
|
|
|
|
|
|
|
my $general_unicode = qr{ |
18
|
|
|
|
|
|
|
[^\p{ASCII}\p{Control}\p{Space}\p{Punct}] |
19
|
|
|
|
|
|
|
| |
20
|
|
|
|
|
|
|
[\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}] |
21
|
|
|
|
|
|
|
# HYPHEN, NON-BREAKING HYPHEN, |
22
|
|
|
|
|
|
|
# FIGURE DASH, EN DASH, EM DASH, HORIZONTAL BAR |
23
|
|
|
|
|
|
|
}xms; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $protocol = qr{ |
26
|
|
|
|
|
|
|
[Hh][Tt][Tt][Pp] [Ss]? |
27
|
|
|
|
|
|
|
}xms; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my $unreserved_subdelims_colon = qr{ |
30
|
|
|
|
|
|
|
[a-zA-Z0-9\-._~!\$&'()*+,;=:] |
31
|
|
|
|
|
|
|
}xms; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $pct_enc = qr{ % [[:xdigit:]]{2} }xms; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $userinfo = qr{ |
36
|
|
|
|
|
|
|
$unreserved_subdelims_colon* |
37
|
|
|
|
|
|
|
(?: $pct_enc $unreserved_subdelims_colon* )* |
38
|
|
|
|
|
|
|
}xms; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
my $host = do { |
41
|
|
|
|
|
|
|
my $dec_octet = qr{ |
42
|
|
|
|
|
|
|
25[0-5] |
43
|
|
|
|
|
|
|
| |
44
|
|
|
|
|
|
|
2[0-4][0-9] |
45
|
|
|
|
|
|
|
| |
46
|
|
|
|
|
|
|
1[0-9][0-9] |
47
|
|
|
|
|
|
|
| |
48
|
|
|
|
|
|
|
[1-9][0-9] |
49
|
|
|
|
|
|
|
| |
50
|
|
|
|
|
|
|
[0-9] |
51
|
|
|
|
|
|
|
}xms; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $ipv4_addr = qr{ |
54
|
|
|
|
|
|
|
$dec_octet (?: \. $dec_octet ){3} |
55
|
|
|
|
|
|
|
}xms; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
my $h16 = qr{ [[:xdigit:]]{1,4} }xms; |
58
|
|
|
|
|
|
|
my $ls32 = qr{ $h16 : $h16 | $ipv4_addr }xms; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
my $ipv6_addr = qr{ |
61
|
|
|
|
|
|
|
(?: $h16 : ){6} $ls32 |
62
|
|
|
|
|
|
|
| |
63
|
|
|
|
|
|
|
:: (?: $h16 : ){5} $ls32 |
64
|
|
|
|
|
|
|
| |
65
|
|
|
|
|
|
|
(?: $h16 )? :: (?: $h16 : ){4} $ls32 |
66
|
|
|
|
|
|
|
| |
67
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,1} )? :: (?: $h16 : ){3} $ls32 |
68
|
|
|
|
|
|
|
| |
69
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,2} )? :: (?: $h16 : ){2} $ls32 |
70
|
|
|
|
|
|
|
| |
71
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,3} )? :: $h16 : $ls32 |
72
|
|
|
|
|
|
|
| |
73
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,4} )? :: $ls32 |
74
|
|
|
|
|
|
|
| |
75
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,5} )? :: $h16 |
76
|
|
|
|
|
|
|
| |
77
|
|
|
|
|
|
|
(?: $h16 (?: : $h16 ){0,6} )? :: |
78
|
|
|
|
|
|
|
}xms; |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
my $ipvfuture = qr{ |
81
|
|
|
|
|
|
|
v [[:xdigit:]]+ \. $unreserved_subdelims_colon+ |
82
|
|
|
|
|
|
|
}xms; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
my $ip_literal = qr{ |
85
|
|
|
|
|
|
|
\[ (?: $ipv6_addr | $ipvfuture ) \] |
86
|
|
|
|
|
|
|
}xms; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my $hostname = do { |
89
|
|
|
|
|
|
|
my $alnum = qr{ |
90
|
|
|
|
|
|
|
[a-zA-Z0-9] |
91
|
|
|
|
|
|
|
| |
92
|
|
|
|
|
|
|
$general_unicode |
93
|
|
|
|
|
|
|
}xms; |
94
|
|
|
|
|
|
|
my $label = qr { |
95
|
|
|
|
|
|
|
$alnum+ (?: -+ $alnum+ )* |
96
|
|
|
|
|
|
|
}xms; |
97
|
|
|
|
|
|
|
qr{ |
98
|
|
|
|
|
|
|
$label (?: \. $label )* \.? |
99
|
|
|
|
|
|
|
}xms |
100
|
|
|
|
|
|
|
}; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
qr{ |
103
|
|
|
|
|
|
|
$hostname |
104
|
|
|
|
|
|
|
| |
105
|
|
|
|
|
|
|
$ip_literal |
106
|
|
|
|
|
|
|
}xms |
107
|
|
|
|
|
|
|
}; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
my $path = qr{ |
110
|
|
|
|
|
|
|
/ |
111
|
|
|
|
|
|
|
( |
112
|
|
|
|
|
|
|
(?: |
113
|
|
|
|
|
|
|
[a-zA-Z0-9\-._~!\$&'*+,;=:\@/] |
114
|
|
|
|
|
|
|
| |
115
|
|
|
|
|
|
|
$pct_enc |
116
|
|
|
|
|
|
|
| |
117
|
|
|
|
|
|
|
\( (?-1) \) |
118
|
|
|
|
|
|
|
| |
119
|
|
|
|
|
|
|
$general_unicode |
120
|
|
|
|
|
|
|
)* |
121
|
|
|
|
|
|
|
) |
122
|
|
|
|
|
|
|
}xms; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
my $query = qr{ |
125
|
|
|
|
|
|
|
( |
126
|
|
|
|
|
|
|
(?: |
127
|
|
|
|
|
|
|
[a-zA-Z0-9\-._~!\$&'*+,;=:\@/?\\{}] |
128
|
|
|
|
|
|
|
| |
129
|
|
|
|
|
|
|
$pct_enc |
130
|
|
|
|
|
|
|
| |
131
|
|
|
|
|
|
|
\( (?-1) \) |
132
|
|
|
|
|
|
|
| |
133
|
|
|
|
|
|
|
\[ (?-1) \] |
134
|
|
|
|
|
|
|
| |
135
|
|
|
|
|
|
|
$general_unicode |
136
|
|
|
|
|
|
|
)* |
137
|
|
|
|
|
|
|
) |
138
|
|
|
|
|
|
|
}xms; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my $fragment = $query; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
qr{ |
143
|
|
|
|
|
|
|
$protocol :// |
144
|
|
|
|
|
|
|
(?: $userinfo \@ )? |
145
|
|
|
|
|
|
|
$host (?: : [0-9]+ )? |
146
|
|
|
|
|
|
|
$path? |
147
|
|
|
|
|
|
|
(?: \? $query )? |
148
|
|
|
|
|
|
|
(?: \# $fragment )? |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
(?
|
151
|
|
|
|
|
|
|
}xms |
152
|
|
|
|
|
|
|
}; |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub extract_urls { |
155
|
1
|
|
|
1
|
1
|
4
|
my ($text) = @_; |
156
|
1
|
|
|
|
|
10
|
my @urls; |
157
|
1
|
|
|
|
|
254
|
push @urls, $1 while $text =~ /($URL_SEARCH_RE)/g; |
158
|
|
|
|
|
|
|
@urls |
159
|
1
|
|
|
|
|
3339
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
sub partition_urls { |
162
|
1
|
|
|
1
|
1
|
127
|
my ($text) = @_; |
163
|
1
|
|
|
|
|
2
|
my @parts; |
164
|
1
|
|
|
|
|
2
|
my $pos_prev = 0; |
165
|
1
|
|
|
|
|
310
|
while ($text =~ /($URL_SEARCH_RE)/g) { |
166
|
15
|
50
|
|
|
|
3321
|
push @parts, [TEXT => substr $text, $pos_prev, $-[0] - $pos_prev] |
167
|
|
|
|
|
|
|
if $pos_prev < $-[0]; |
168
|
15
|
|
|
|
|
50
|
push @parts, [URL => $1]; |
169
|
15
|
|
|
|
|
331
|
$pos_prev = $+[0]; |
170
|
|
|
|
|
|
|
} |
171
|
1
|
50
|
|
|
|
7
|
push @parts, [TEXT => substr $text, $pos_prev] |
172
|
|
|
|
|
|
|
if $pos_prev < length $text; |
173
|
|
|
|
|
|
|
@parts |
174
|
1
|
|
|
|
|
30
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
'ok' |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
__END__ |