line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package |
2
|
|
|
|
|
|
|
Twitter::Text::Regexp; # hide from PAUSE |
3
|
4
|
|
|
4
|
|
29
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
133
|
|
4
|
4
|
|
|
4
|
|
25
|
use warnings; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
133
|
|
5
|
4
|
|
|
4
|
|
27
|
use utf8; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
32
|
|
6
|
4
|
|
|
4
|
|
151
|
no if $^V lt v5.13.9, 'warnings', 'utf8'; |
|
4
|
|
|
|
|
17
|
|
|
4
|
|
|
|
|
77
|
|
7
|
|
|
|
|
|
|
|
8
|
4
|
|
|
4
|
|
628
|
use Twitter::Text::Util qw(load_yaml); |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
3887
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# internal use only, do not use this module directly. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Build a fragment for use inside a regex character class.
#
#   regex_range($from)       - the single character with code point $from
#   regex_range($from, $to)  - "<from>-<to>", i.e. a character-class range
#
# Both arguments are numeric code points; the result is a character string.
sub regex_range {
    my ($from, $to) = @_;

    my $first = pack('U', $from);

    # Only one code point given: emit the lone character.
    return $first unless defined $to;

    return join('-', $first, pack('U', $to));
}
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our $TLDS = load_yaml("tld_lib.yml")->[0]; |
23
|
|
|
|
|
|
|
# ASCII punctuation, pre-escaped where needed so the string can be dropped
# straight into a regex character class ([ ] and ` carry a backslash).
our $PUNCTUATION_CHARS = q{!"#$%&'()*+,-./:;<=>?@\[\]^_\`{|}~};

# ASCII whitespace: space, tab, newline, vertical tab, form feed, carriage return.
our $SPACE_CHARS = join '', ' ', "\t", "\n", "\x0B", "\f", "\r";

# Control characters as a character-class fragment: the range NUL..US plus DEL.
our $CTRL_CHARS = "\x00" . '-' . "\x1F" . "\x7F";
26
|
|
|
|
|
|
|
# Code points that are never valid in text, packed into one character string
# for interpolation into character classes.
our $INVALID_CHARACTERS = pack 'U*', (
    0xFFFE, 0xFEFF,    # BOM
    0xFFFF,            # Special
);
30
|
|
|
|
|
|
|
# The whitespace code points listed below, packed into one character string
# for interpolation into character classes.
our $UNICODE_SPACES = pack 'U*', (
    0x0009 .. 0x000D,    # White_Space # Cc   [5] ..
    0x0020,              # White_Space # Zs       SPACE
    0x0085,              # White_Space # Cc
    0x00A0,              # White_Space # Zs       NO-BREAK SPACE
    0x1680,              # White_Space # Zs       OGHAM SPACE MARK
    0x180E,              # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
    0x2000 .. 0x200A,    # White_Space # Zs  [11] EN QUAD..HAIR SPACE
    0x2028,              # White_Space # Zl       LINE SEPARATOR
    0x2029,              # White_Space # Zp       PARAGRAPH SEPARATOR
    0x202F,              # White_Space # Zs       NARROW NO-BREAK SPACE
    0x205F,              # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
    0x3000,              # White_Space # Zs       IDEOGRAPHIC SPACE
);
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# The bidirectional marks, embeddings, overrides and isolates listed below,
# packed into one character string for interpolation into character classes.
our $DIRECTIONAL_CHARACTERS = pack 'U*', (
    0x061C,    # ARABIC LETTER MARK (ALM)
    0x200E,    # LEFT-TO-RIGHT MARK (LRM)
    0x200F,    # RIGHT-TO-LEFT MARK (RLM)
    0x202A,    # LEFT-TO-RIGHT EMBEDDING (LRE)
    0x202B,    # RIGHT-TO-LEFT EMBEDDING (RLE)
    0x202C,    # POP DIRECTIONAL FORMATTING (PDF)
    0x202D,    # LEFT-TO-RIGHT OVERRIDE (LRO)
    0x202E,    # RIGHT-TO-LEFT OVERRIDE (RLO)
    0x2066,    # LEFT-TO-RIGHT ISOLATE (LRI)
    0x2067,    # RIGHT-TO-LEFT ISOLATE (RLI)
    0x2068,    # FIRST STRONG ISOLATE (FSI)
    0x2069,    # POP DIRECTIONAL ISOLATE (PDI)
);
59
|
|
|
|
|
|
|
our $DOMAIN_VALID_CHARS = "[^$DIRECTIONAL_CHARACTERS$PUNCTUATION_CHARS$SPACE_CHARS$CTRL_CHARS$INVALID_CHARACTERS$UNICODE_SPACES]"; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Character-class fragment built from the code points below. Each entry is
# either [from, to] (a range) or [single]; regex_range() turns it into the
# matching class fragment and the pieces are concatenated in order.
our $LATIN_ACCENTS = join '', map { regex_range(@{$_}) } (
    [0xc0,   0xd6],
    [0xd8,   0xf6],
    [0xf8,   0xff],
    [0x0100, 0x024f],
    [0x0253, 0x0254],
    [0x0256, 0x0257],
    [0x0259],
    [0x025b],
    [0x0263],
    [0x0268],
    [0x026f],
    [0x0272],
    [0x0289],
    [0x028b],
    [0x02bb],
    [0x0300, 0x036f],
    [0x1e00, 0x1eff],
);
80
|
|
|
|
|
|
|
our $latin_accents = qr/[$LATIN_ACCENTS]+/o; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
our $HASHTAG_LETTERS_AND_MARKS = '\p{L}\p{M}' . |
83
|
|
|
|
|
|
|
"\N{U+037f}\N{U+0528}-\N{U+052f}\N{U+08a0}-\N{U+08b2}\N{U+08e4}-\N{U+08ff}\N{U+0978}\N{U+0980}\N{U+0c00}\N{U+0c34}\N{U+0c81}\N{U+0d01}\N{U+0ede}\N{U+0edf}" . |
84
|
|
|
|
|
|
|
"\N{U+10c7}\N{U+10cd}\N{U+10fd}-\N{U+10ff}\N{U+16f1}-\N{U+16f8}\N{U+17b4}\N{U+17b5}\N{U+191d}\N{U+191e}\N{U+1ab0}-\N{U+1abe}\N{U+1bab}-\N{U+1bad}\N{U+1bba}-" . |
85
|
|
|
|
|
|
|
"\N{U+1bbf}\N{U+1cf3}-\N{U+1cf6}\N{U+1cf8}\N{U+1cf9}\N{U+1de7}-\N{U+1df5}\N{U+2cf2}\N{U+2cf3}\N{U+2d27}\N{U+2d2d}\N{U+2d66}\N{U+2d67}\N{U+9fcc}\N{U+a674}-" . |
86
|
|
|
|
|
|
|
"\N{U+a67b}\N{U+a698}-\N{U+a69d}\N{U+a69f}\N{U+a792}-\N{U+a79f}\N{U+a7aa}-\N{U+a7ad}\N{U+a7b0}\N{U+a7b1}\N{U+a7f7}-\N{U+a7f9}\N{U+a9e0}-\N{U+a9ef}\N{U+a9fa}-" . |
87
|
|
|
|
|
|
|
"\N{U+a9fe}\N{U+aa7c}-\N{U+aa7f}\N{U+aae0}-\N{U+aaef}\N{U+aaf2}-\N{U+aaf6}\N{U+ab30}-\N{U+ab5a}\N{U+ab5c}-\N{U+ab5f}\N{U+ab64}\N{U+ab65}\N{U+f870}-\N{U+f87f}" . |
88
|
|
|
|
|
|
|
"\N{U+f882}\N{U+f884}-\N{U+f89f}\N{U+f8b8}\N{U+f8c1}-\N{U+f8d6}\N{U+fa2e}\N{U+fa2f}\N{U+fe27}-\N{U+fe2d}\N{U+102e0}\N{U+1031f}\N{U+10350}-\N{U+1037a}" . |
89
|
|
|
|
|
|
|
"\N{U+10500}-\N{U+10527}\N{U+10530}-\N{U+10563}\N{U+10600}-\N{U+10736}\N{U+10740}-\N{U+10755}\N{U+10760}-\N{U+10767}" . |
90
|
|
|
|
|
|
|
"\N{U+10860}-\N{U+10876}\N{U+10880}-\N{U+1089e}\N{U+10980}-\N{U+109b7}\N{U+109be}\N{U+109bf}\N{U+10a80}-\N{U+10a9c}" . |
91
|
|
|
|
|
|
|
"\N{U+10ac0}-\N{U+10ac7}\N{U+10ac9}-\N{U+10ae6}\N{U+10b80}-\N{U+10b91}\N{U+1107f}\N{U+110d0}-\N{U+110e8}\N{U+11100}-" . |
92
|
|
|
|
|
|
|
"\N{U+11134}\N{U+11150}-\N{U+11173}\N{U+11176}\N{U+11180}-\N{U+111c4}\N{U+111da}\N{U+11200}-\N{U+11211}\N{U+11213}-" . |
93
|
|
|
|
|
|
|
"\N{U+11237}\N{U+112b0}-\N{U+112ea}\N{U+11301}-\N{U+11303}\N{U+11305}-\N{U+1130c}\N{U+1130f}\N{U+11310}\N{U+11313}-" . |
94
|
|
|
|
|
|
|
"\N{U+11328}\N{U+1132a}-\N{U+11330}\N{U+11332}\N{U+11333}\N{U+11335}-\N{U+11339}\N{U+1133c}-\N{U+11344}\N{U+11347}" . |
95
|
|
|
|
|
|
|
"\N{U+11348}\N{U+1134b}-\N{U+1134d}\N{U+11357}\N{U+1135d}-\N{U+11363}\N{U+11366}-\N{U+1136c}\N{U+11370}-\N{U+11374}" . |
96
|
|
|
|
|
|
|
"\N{U+11480}-\N{U+114c5}\N{U+114c7}\N{U+11580}-\N{U+115b5}\N{U+115b8}-\N{U+115c0}\N{U+11600}-\N{U+11640}\N{U+11644}" . |
97
|
|
|
|
|
|
|
"\N{U+11680}-\N{U+116b7}\N{U+118a0}-\N{U+118df}\N{U+118ff}\N{U+11ac0}-\N{U+11af8}\N{U+1236f}-\N{U+12398}\N{U+16a40}-" . |
98
|
|
|
|
|
|
|
"\N{U+16a5e}\N{U+16ad0}-\N{U+16aed}\N{U+16af0}-\N{U+16af4}\N{U+16b00}-\N{U+16b36}\N{U+16b40}-\N{U+16b43}\N{U+16b63}-" . |
99
|
|
|
|
|
|
|
"\N{U+16b77}\N{U+16b7d}-\N{U+16b8f}\N{U+16f00}-\N{U+16f44}\N{U+16f50}-\N{U+16f7e}\N{U+16f8f}-\N{U+16f9f}\N{U+1bc00}-" . |
100
|
|
|
|
|
|
|
"\N{U+1bc6a}\N{U+1bc70}-\N{U+1bc7c}\N{U+1bc80}-\N{U+1bc88}\N{U+1bc90}-\N{U+1bc99}\N{U+1bc9d}\N{U+1bc9e}\N{U+1e800}-" . |
101
|
|
|
|
|
|
|
"\N{U+1e8c4}\N{U+1e8d0}-\N{U+1e8d6}\N{U+1ee00}-\N{U+1ee03}\N{U+1ee05}-\N{U+1ee1f}\N{U+1ee21}\N{U+1ee22}\N{U+1ee24}" . |
102
|
|
|
|
|
|
|
"\N{U+1ee27}\N{U+1ee29}-\N{U+1ee32}\N{U+1ee34}-\N{U+1ee37}\N{U+1ee39}\N{U+1ee3b}\N{U+1ee42}\N{U+1ee47}\N{U+1ee49}" . |
103
|
|
|
|
|
|
|
"\N{U+1ee4b}\N{U+1ee4d}-\N{U+1ee4f}\N{U+1ee51}\N{U+1ee52}\N{U+1ee54}\N{U+1ee57}\N{U+1ee59}\N{U+1ee5b}\N{U+1ee5d}\N{U+1ee5f}" . |
104
|
|
|
|
|
|
|
"\N{U+1ee61}\N{U+1ee62}\N{U+1ee64}\N{U+1ee67}-\N{U+1ee6a}\N{U+1ee6c}-\N{U+1ee72}\N{U+1ee74}-\N{U+1ee77}\N{U+1ee79}-" . |
105
|
|
|
|
|
|
|
"\N{U+1ee7c}\N{U+1ee7e}\N{U+1ee80}-\N{U+1ee89}\N{U+1ee8b}-\N{U+1ee9b}\N{U+1eea1}-\N{U+1eea3}\N{U+1eea5}-\N{U+1eea9}" . |
106
|
|
|
|
|
|
|
"\N{U+1eeab}-\N{U+1eebb}"; |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
our $HASHTAG_NUMERALS = "\\p{Nd}" . |
109
|
|
|
|
|
|
|
"\N{U+0de6}-\N{U+0def}\N{U+a9f0}-\N{U+a9f9}\N{U+110f0}-\N{U+110f9}\N{U+11136}-\N{U+1113f}\N{U+111d0}-\N{U+111d9}\N{U+112f0}-" . |
110
|
|
|
|
|
|
|
"\N{U+112f9}\N{U+114d0}-\N{U+114d9}\N{U+11650}-\N{U+11659}\N{U+116c0}-\N{U+116c9}\N{U+118e0}-\N{U+118e9}\N{U+16a60}-" . |
111
|
|
|
|
|
|
|
"\N{U+16a69}\N{U+16b50}-\N{U+16b59}"; |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
our $HASHTAG_SPECIAL_CHARS = "_\N{U+200c}\N{U+200d}\N{U+a67e}\N{U+05be}\N{U+05f3}\N{U+05f4}\N{U+ff5e}\N{U+301c}\N{U+309b}\N{U+309c}\N{U+30a0}\N{U+30fb}\N{U+3003}\N{U+0f0b}\N{U+0f0c}\N{U+00b7}"; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
our $HASHTAG_LETTERS_NUMERALS = "$HASHTAG_LETTERS_AND_MARKS$HASHTAG_NUMERALS$HASHTAG_SPECIAL_CHARS"; |
116
|
|
|
|
|
|
|
our $HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]"; |
117
|
|
|
|
|
|
|
our $HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]"; |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# Hashtag matcher.
#   $1: what precedes the hash sign (start of text, a variation selector, or
#       any character that is neither '&' nor a hashtag letter/numeral)
#   $2: the hash sign, either ASCII '#' or FULLWIDTH NUMBER SIGN (U+FF03)
#   $3: the hashtag text, which must contain at least one letter/mark
# The hash sign must not be followed by U+FE0F or U+20E3 (keycap emoji).
# The fullwidth sign is written as an escape: the second alternative had been
# reduced to a plain '#', which made "(#|#)" a no-op duplicate.
our $HASHTAG = qr/(\A|\N{U+fe0e}|\N{U+fe0f}|[^&$HASHTAG_LETTERS_NUMERALS])(#|\x{ff03})(?!\N{U+fe0f}|\N{U+20e3})($HASHTAG_LETTERS_NUMERALS_SET*$HASHTAG_LETTERS_SET$HASHTAG_LETTERS_NUMERALS_SET*)/i;
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
our $valid_hashtag = qr/$HASHTAG/i; |
122
|
|
|
|
|
|
|
# Matches text that begins with a hash sign (ASCII '#' or FULLWIDTH NUMBER
# SIGN, U+FF03) or with "://".
# The fullwidth sign is written as an escape; the class previously held '#'
# twice, so the fullwidth form was not recognised.
our $end_hashtag_match = qr/\A(?:[#\x{ff03}]|:\/\/)/;
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# What may appear immediately before an @mention: any character outside the
# excluded set (ASCII alphanumerics, '_', '!', '#', '$', '%', '&', '*', and
# both at signs), the start of the text, or an "RT"/"RT:" prefix.
# The fullwidth at sign (U+FF20) is written as an escape; the class previously
# held '@' twice, so U+FF20 was wrongly accepted as a preceding character.
our $valid_mention_preceding_chars = qr/(?:[^a-z0-9_!#\$%&*\@\x{ff20}]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/i;
125
|
|
|
|
|
|
|
# An at sign: ASCII '@' or FULLWIDTH COMMERCIAL AT (U+FF20).
# The fullwidth sign is written as an escape; the class previously held '@'
# twice, so fullwidth mentions were not recognised.
our $at_signs = qr/[\@\x{ff20}]/;
126
|
|
|
|
|
|
|
our $valid_mention_or_list = qr/ |
127
|
|
|
|
|
|
|
($valid_mention_preceding_chars) # $1: Preceeding character |
128
|
|
|
|
|
|
|
($at_signs) # $2: At mark |
129
|
|
|
|
|
|
|
([a-z0-9_]{1,20}) # $3: Screen name |
130
|
|
|
|
|
|
|
(\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional) |
131
|
|
|
|
|
|
|
/ix; |
132
|
|
|
|
|
|
|
our $valid_reply = qr/^(?:[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])*$at_signs([a-z0-9_]{1,20})/i; |
133
|
|
|
|
|
|
|
# Used in Extractor for final filtering |
134
|
|
|
|
|
|
|
our $end_mention_match = qr/\A(?:$at_signs|$latin_accents|:\/\/)/i; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
our $valid_subdomain = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[_-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i; |
137
|
|
|
|
|
|
|
our $valid_domain_name = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i; |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
our $GENERIC_TLDS = join '|', @{$TLDS->{generic}}; |
140
|
|
|
|
|
|
|
our $CC_TLDS = join '|', @{$TLDS->{country}}; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
our $valid_gTLD = qr{ |
143
|
|
|
|
|
|
|
(?: |
144
|
|
|
|
|
|
|
(?:$GENERIC_TLDS) |
145
|
|
|
|
|
|
|
(?=[^0-9a-z@+-]|$) |
146
|
|
|
|
|
|
|
) |
147
|
|
|
|
|
|
|
}ix; |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
our $valid_ccTLD = qr{ |
150
|
|
|
|
|
|
|
(?: |
151
|
|
|
|
|
|
|
(?:$CC_TLDS) |
152
|
|
|
|
|
|
|
(?=[^0-9a-z@+-]|$) |
153
|
|
|
|
|
|
|
) |
154
|
|
|
|
|
|
|
}ix; |
155
|
|
|
|
|
|
|
our $valid_punycode = qr/(?:xn--[0-9a-z]+)/i; |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
our $valid_domain = qr/(?: |
158
|
|
|
|
|
|
|
$valid_subdomain*$valid_domain_name |
159
|
|
|
|
|
|
|
(?:$valid_gTLD|$valid_ccTLD|$valid_punycode) |
160
|
|
|
|
|
|
|
)/ix; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# This is used in Extractor |
163
|
|
|
|
|
|
|
our $valid_ascii_domain = qr/ |
164
|
|
|
|
|
|
|
(?:(?:[a-z0-9\-_]|$latin_accents)+\.)+ |
165
|
|
|
|
|
|
|
(?:$valid_gTLD|$valid_ccTLD|$valid_punycode) |
166
|
|
|
|
|
|
|
/ix; |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# This is used in Extractor for stricter t.co URL extraction |
169
|
|
|
|
|
|
|
our $valid_tco_url = qr/^https?:\/\/t\.co\/([a-z0-9]+)/i; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
our $valid_port_number = qr/[0-9]+/; |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# What may appear immediately before a URL: any character outside the excluded
# set (ASCII alphanumerics, '@', '$', '#', their fullwidth forms U+FF20 and
# U+FF03, and $INVALID_CHARACTERS), a directional formatting character, or
# the start of the text.
# The fullwidth signs are written as escapes; the class previously held '@'
# and '#' twice each, so the fullwidth forms were not excluded.
our $valid_url_preceding_chars = qr/(?:[^A-Z0-9\@\x{ff20}\$#\x{ff03}$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)/i;
174
|
|
|
|
|
|
|
our $invalid_url_without_protocol_preceding_chars = qr/[-_.\/]$/; |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
our $valid_general_url_path_chars = qr/[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|$LATIN_ACCENTS]/i; |
177
|
|
|
|
|
|
|
# Allow URL paths to contain up to two nested levels of balanced parens |
178
|
|
|
|
|
|
|
# 1. Used in Wikipedia URLs like /Primer_(film) |
179
|
|
|
|
|
|
|
# 2. Used in IIS sessions like /S(dfd346)/ |
180
|
|
|
|
|
|
|
# 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ |
181
|
|
|
|
|
|
|
our $valid_url_balanced_parens = qr/ |
182
|
|
|
|
|
|
|
\( |
183
|
|
|
|
|
|
|
(?: |
184
|
|
|
|
|
|
|
$valid_general_url_path_chars+ |
185
|
|
|
|
|
|
|
| |
186
|
|
|
|
|
|
|
# allow one nested level of balanced parentheses |
187
|
|
|
|
|
|
|
(?: |
188
|
|
|
|
|
|
|
$valid_general_url_path_chars* |
189
|
|
|
|
|
|
|
\( |
190
|
|
|
|
|
|
|
$valid_general_url_path_chars+ |
191
|
|
|
|
|
|
|
\) |
192
|
|
|
|
|
|
|
$valid_general_url_path_chars* |
193
|
|
|
|
|
|
|
) |
194
|
|
|
|
|
|
|
) |
195
|
|
|
|
|
|
|
\) |
196
|
|
|
|
|
|
|
/ix; |
197
|
|
|
|
|
|
|
# Valid end-of-path characters (so /foo. does not gobble the period). |
198
|
|
|
|
|
|
|
# 1. Allow = for empty URL parameters and other URL-join artifacts |
199
|
|
|
|
|
|
|
our $valid_url_path_ending_chars = qr/[a-z\p{Cyrillic}0-9=_#\/\+\-$LATIN_ACCENTS]|(?:$valid_url_balanced_parens)/i; |
200
|
|
|
|
|
|
|
our $valid_url_path = qr/(?: |
201
|
|
|
|
|
|
|
(?: |
202
|
|
|
|
|
|
|
$valid_general_url_path_chars* |
203
|
|
|
|
|
|
|
(?:$valid_url_balanced_parens $valid_general_url_path_chars*)* |
204
|
|
|
|
|
|
|
$valid_url_path_ending_chars |
205
|
|
|
|
|
|
|
)|(?:$valid_general_url_path_chars+\/) |
206
|
|
|
|
|
|
|
)/ix; |
207
|
|
|
|
|
|
|
our $valid_url_query_chars = qr/[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i; |
208
|
|
|
|
|
|
|
our $valid_url_query_ending_chars = qr/[a-z0-9_&=#\/\-]/i; |
209
|
|
|
|
|
|
|
our $valid_url = qr{ |
210
|
|
|
|
|
|
|
( # $1 total match |
211
|
|
|
|
|
|
|
($valid_url_preceding_chars) # $2 Preceeding chracter |
212
|
|
|
|
|
|
|
( # $3 URL |
213
|
|
|
|
|
|
|
(https?:\/\/)? # $4 Protocol (optional) |
214
|
|
|
|
|
|
|
($valid_domain) # $5 Domain(s) |
215
|
|
|
|
|
|
|
(?::($valid_port_number))? # $6 Port number (optional) |
216
|
|
|
|
|
|
|
(/$valid_url_path*)? # $7 URL Path and anchor |
217
|
|
|
|
|
|
|
(\?$valid_url_query_chars*$valid_url_query_ending_chars)? # $8 Query String |
218
|
|
|
|
|
|
|
) |
219
|
|
|
|
|
|
|
)}ix; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
our $cashtag = qr/[a-z]{1,6}(?:[._][a-z]{1,2})?/i; |
222
|
|
|
|
|
|
|
our $valid_cashtag = qr/(^|[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])(\$)($cashtag)(?=$|\s|[$PUNCTUATION_CHARS])/i; |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# These URL validation pattern strings are based on the ABNF from RFC 3986 |
225
|
|
|
|
|
|
|
our $validate_url_unreserved = qr/[a-z\p{Cyrillic}0-9\p{Pd}._~]/i; |
226
|
|
|
|
|
|
|
our $validate_url_pct_encoded = qr/(?:%[0-9a-f]{2})/i; |
227
|
|
|
|
|
|
|
our $validate_url_sub_delims = qr/[!\$&'()*+,;=]/i; |
228
|
|
|
|
|
|
|
our $validate_url_pchar = qr/(?: |
229
|
|
|
|
|
|
|
$validate_url_unreserved| |
230
|
|
|
|
|
|
|
$validate_url_pct_encoded| |
231
|
|
|
|
|
|
|
$validate_url_sub_delims| |
232
|
|
|
|
|
|
|
[:\|@] |
233
|
|
|
|
|
|
|
)/ix; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
our $validate_url_scheme = qr/(?:[a-z][a-z0-9+\-.]*)/i; |
236
|
|
|
|
|
|
|
our $validate_url_userinfo = qr/(?: |
237
|
|
|
|
|
|
|
$validate_url_unreserved| |
238
|
|
|
|
|
|
|
$validate_url_pct_encoded| |
239
|
|
|
|
|
|
|
$validate_url_sub_delims| |
240
|
|
|
|
|
|
|
: |
241
|
|
|
|
|
|
|
)*/ix; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
our $validate_url_dec_octet = qr/(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i; |
244
|
|
|
|
|
|
|
our $validate_url_ipv4 = |
245
|
|
|
|
|
|
|
qr/(?:$validate_url_dec_octet(?:\.$validate_url_dec_octet){3})/ix; |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# Punting on real IPv6 validation for now |
248
|
|
|
|
|
|
|
our $validate_url_ipv6 = qr/(?:\[[a-f0-9:\.]+\])/i; |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Also punting on IPvFuture for now |
251
|
|
|
|
|
|
|
our $validate_url_ip = qr/(?: |
252
|
|
|
|
|
|
|
$validate_url_ipv4| |
253
|
|
|
|
|
|
|
$validate_url_ipv6 |
254
|
|
|
|
|
|
|
)/ix; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# This is more strict than the rfc specifies |
257
|
|
|
|
|
|
|
our $validate_url_subdomain_segment = qr/(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i; |
258
|
|
|
|
|
|
|
our $validate_url_domain_segment = qr/(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i; |
259
|
|
|
|
|
|
|
our $validate_url_domain_tld = qr/(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i; |
260
|
|
|
|
|
|
|
our $validate_url_domain = qr/(?:(?:$validate_url_subdomain_segment\.)* |
261
|
|
|
|
|
|
|
(?:$validate_url_domain_segment\.) |
262
|
|
|
|
|
|
|
$validate_url_domain_tld)/ix; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
our $validate_url_host = qr/(?: |
265
|
|
|
|
|
|
|
$validate_url_ip| |
266
|
|
|
|
|
|
|
$validate_url_domain |
267
|
|
|
|
|
|
|
)/ix; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences |
270
|
|
|
|
|
|
|
our $validate_url_unicode_subdomain_segment = |
271
|
|
|
|
|
|
|
qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix; |
272
|
|
|
|
|
|
|
our $validate_url_unicode_domain_segment = |
273
|
|
|
|
|
|
|
qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix; |
274
|
|
|
|
|
|
|
our $validate_url_unicode_domain_tld = |
275
|
|
|
|
|
|
|
qr/(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix; |
276
|
|
|
|
|
|
|
our $validate_url_unicode_domain = qr/(?:(?:$validate_url_unicode_subdomain_segment\.)* |
277
|
|
|
|
|
|
|
(?:$validate_url_unicode_domain_segment\.) |
278
|
|
|
|
|
|
|
$validate_url_unicode_domain_tld)/ix; |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
our $validate_url_unicode_host = qr/(?: |
281
|
|
|
|
|
|
|
$validate_url_ip| |
282
|
|
|
|
|
|
|
$validate_url_unicode_domain |
283
|
|
|
|
|
|
|
)/ix; |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
our $validate_url_port = qr/[0-9]{1,5}/; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
our $validate_url_unicode_authority = qr{ |
288
|
|
|
|
|
|
|
(?:($validate_url_userinfo)@)? # $1 userinfo |
289
|
|
|
|
|
|
|
($validate_url_unicode_host) # $2 host |
290
|
|
|
|
|
|
|
(?::($validate_url_port))? # $3 port |
291
|
|
|
|
|
|
|
}ix; |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
our $validate_url_authority = qr{ |
294
|
|
|
|
|
|
|
(?:($validate_url_userinfo)@)? # $1 userinfo |
295
|
|
|
|
|
|
|
($validate_url_host) # $2 host |
296
|
|
|
|
|
|
|
(?::($validate_url_port))? # $3 port |
297
|
|
|
|
|
|
|
}ix; |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
our $validate_url_path = qr{(/$validate_url_pchar*)*}i; |
300
|
|
|
|
|
|
|
our $validate_url_query = qr{($validate_url_pchar|/|\?)*}i; |
301
|
|
|
|
|
|
|
our $validate_url_fragment = qr{($validate_url_pchar|/|\?)*}i; |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Modified version of RFC 3986 Appendix B |
304
|
|
|
|
|
|
|
our $validate_url_unencoded = qr{ |
305
|
|
|
|
|
|
|
\A # Full URL |
306
|
|
|
|
|
|
|
(?: |
307
|
|
|
|
|
|
|
([^:/?#]+):// # $1 Scheme |
308
|
|
|
|
|
|
|
)? |
309
|
|
|
|
|
|
|
([^/?#]*) # $2 Authority |
310
|
|
|
|
|
|
|
([^?#]*) # $3 Path |
311
|
|
|
|
|
|
|
(?: |
312
|
|
|
|
|
|
|
\?([^#]*) # $4 Query |
313
|
|
|
|
|
|
|
)? |
314
|
|
|
|
|
|
|
(?: |
315
|
|
|
|
|
|
|
\#(.*) # $5 Fragment |
316
|
|
|
|
|
|
|
)?\z |
317
|
|
|
|
|
|
|
}ix; |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
1; |