File Coverage

blib/lib/Twitter/Text/Regexp.pm
Criterion Covered Total %
statement 19 19 100.0
branch 2 2 100.0
condition n/a
subroutine 6 6 100.0
pod 0 1 0.0
total 27 28 96.4


line stmt bran cond sub pod time code
1             package
2             Twitter::Text::Regexp; # hide from PAUSE
3 4     4   28 use strict;
  4         9  
  4         126  
4 4     4   25 use warnings;
  4         8  
  4         111  
5 4     4   19 use utf8;
  4         7  
  4         30  
6 4     4   139 no if $^V lt v5.13.9, 'warnings', 'utf8'; ## no critic (ValuesAndExpressions::ProhibitMismatchedOperators)
  4         7  
  4         76  
7              
8 4     4   508 use Twitter::Text::Util qw(load_yaml);
  4         10  
  4         3507  
9              
10             # internal use only, do not use this module directly.
11              
12             sub regex_range {
               # Builds a fragment intended for interpolation inside a bracketed
               # character class (see how $LATIN_ACCENTS is used in [$LATIN_ACCENTS]).
               #   $from - code point of the first (or only) character
               #   $to   - optional code point of the last character of a range
               # Returns "<from>-<to>" when $to is defined, otherwise just "<from>";
               # each code point is turned into a character with pack('U', ...).
13 68     68 0 116 my ($from, $to) = @_;
14              
15 68 100       122 if (defined $to) {
16 32         137 return pack('U', $from) . '-' . pack('U', $to);
17             } else {
18 36         640 return pack('U', $from);
19             }
20             }
21              
22             ## no critic (RegularExpressions::ProhibitComplexRegexes)
23              
24             our $TLDS = load_yaml("tld_lib.yml")->[0];
               # $TLDS is the first element of the loaded YAML; further down it is
               # dereferenced as a hash with 'generic' and 'country' list entries.
               #
               # The three strings below are raw character lists, not patterns.
               # They are meant to be interpolated inside a bracketed class (as in
               # $DOMAIN_VALID_CHARS); only there does the "-" in $CTRL_CHARS act
               # as a range operator (\x00 through \x1F, plus \x7F).
25             our $PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~';
26             our $SPACE_CHARS = " \t\n\x0B\f\r";
27             our $CTRL_CHARS = "\x00-\x1F\x7F";
28             our $INVALID_CHARACTERS = join '', map { pack 'U', $_ } (
29             0xFFFE, 0xFEFF, # BOM
30             0xFFFF, # Special
31             );
32             our $UNICODE_SPACES = join '', map { pack 'U*', $_ } (
33             (0x0009..0x000D), # White_Space # Cc [5] ..
34             0x0020, # White_Space # Zs SPACE
35             0x0085, # White_Space # Cc
36             0x00A0, # White_Space # Zs NO-BREAK SPACE
37             0x1680, # White_Space # Zs OGHAM SPACE MARK
38             0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
39             (0x2000..0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
40             0x2028, # White_Space # Zl LINE SEPARATOR
41             0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
42             0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
43             0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
44             0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
45             );
46              
47             our $DIRECTIONAL_CHARACTERS = join '', map { pack 'U', $_ } (
48             0x061C, # ARABIC LETTER MARK (ALM)
49             0x200E, # LEFT-TO-RIGHT MARK (LRM)
50             0x200F, # RIGHT-TO-LEFT MARK (RLM)
51             0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
52             0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
53             0x202C, # POP DIRECTIONAL FORMATTING (PDF)
54             0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
55             0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
56             0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
57             0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
58             0x2068, # FIRST STRONG ISOLATE (FSI)
59             0x2069, # POP DIRECTIONAL ISOLATE (PDI)
60             );
61             our $DOMAIN_VALID_CHARS = "[^$DIRECTIONAL_CHARACTERS$PUNCTUATION_CHARS$SPACE_CHARS$CTRL_CHARS$INVALID_CHARACTERS$UNICODE_SPACES]";
62              
63             our $LATIN_ACCENTS = join '', (
64             regex_range(0xc0, 0xd6),
65             regex_range(0xd8, 0xf6),
66             regex_range(0xf8, 0xff),
67             regex_range(0x0100, 0x024f),
68             regex_range(0x0253, 0x0254),
69             regex_range(0x0256, 0x0257),
70             regex_range(0x0259),
71             regex_range(0x025b),
72             regex_range(0x0263),
73             regex_range(0x0268),
74             regex_range(0x026f),
75             regex_range(0x0272),
76             regex_range(0x0289),
77             regex_range(0x028b),
78             regex_range(0x02bb),
79             regex_range(0x0300, 0x036f),
80             regex_range(0x1e00, 0x1eff)
81             );
82             our $latin_accents = qr/[$LATIN_ACCENTS]+/o;
83              
84             our $HASHTAG_LETTERS_AND_MARKS = '\p{L}\p{M}' .
85             "\N{U+037f}\N{U+0528}-\N{U+052f}\N{U+08a0}-\N{U+08b2}\N{U+08e4}-\N{U+08ff}\N{U+0978}\N{U+0980}\N{U+0c00}\N{U+0c34}\N{U+0c81}\N{U+0d01}\N{U+0ede}\N{U+0edf}" .
86             "\N{U+10c7}\N{U+10cd}\N{U+10fd}-\N{U+10ff}\N{U+16f1}-\N{U+16f8}\N{U+17b4}\N{U+17b5}\N{U+191d}\N{U+191e}\N{U+1ab0}-\N{U+1abe}\N{U+1bab}-\N{U+1bad}\N{U+1bba}-" .
87             "\N{U+1bbf}\N{U+1cf3}-\N{U+1cf6}\N{U+1cf8}\N{U+1cf9}\N{U+1de7}-\N{U+1df5}\N{U+2cf2}\N{U+2cf3}\N{U+2d27}\N{U+2d2d}\N{U+2d66}\N{U+2d67}\N{U+9fcc}\N{U+a674}-" .
88             "\N{U+a67b}\N{U+a698}-\N{U+a69d}\N{U+a69f}\N{U+a792}-\N{U+a79f}\N{U+a7aa}-\N{U+a7ad}\N{U+a7b0}\N{U+a7b1}\N{U+a7f7}-\N{U+a7f9}\N{U+a9e0}-\N{U+a9ef}\N{U+a9fa}-" .
89             "\N{U+a9fe}\N{U+aa7c}-\N{U+aa7f}\N{U+aae0}-\N{U+aaef}\N{U+aaf2}-\N{U+aaf6}\N{U+ab30}-\N{U+ab5a}\N{U+ab5c}-\N{U+ab5f}\N{U+ab64}\N{U+ab65}\N{U+f870}-\N{U+f87f}" .
90             "\N{U+f882}\N{U+f884}-\N{U+f89f}\N{U+f8b8}\N{U+f8c1}-\N{U+f8d6}\N{U+fa2e}\N{U+fa2f}\N{U+fe27}-\N{U+fe2d}\N{U+102e0}\N{U+1031f}\N{U+10350}-\N{U+1037a}" .
91             "\N{U+10500}-\N{U+10527}\N{U+10530}-\N{U+10563}\N{U+10600}-\N{U+10736}\N{U+10740}-\N{U+10755}\N{U+10760}-\N{U+10767}" .
92             "\N{U+10860}-\N{U+10876}\N{U+10880}-\N{U+1089e}\N{U+10980}-\N{U+109b7}\N{U+109be}\N{U+109bf}\N{U+10a80}-\N{U+10a9c}" .
93             "\N{U+10ac0}-\N{U+10ac7}\N{U+10ac9}-\N{U+10ae6}\N{U+10b80}-\N{U+10b91}\N{U+1107f}\N{U+110d0}-\N{U+110e8}\N{U+11100}-" .
94             "\N{U+11134}\N{U+11150}-\N{U+11173}\N{U+11176}\N{U+11180}-\N{U+111c4}\N{U+111da}\N{U+11200}-\N{U+11211}\N{U+11213}-" .
95             "\N{U+11237}\N{U+112b0}-\N{U+112ea}\N{U+11301}-\N{U+11303}\N{U+11305}-\N{U+1130c}\N{U+1130f}\N{U+11310}\N{U+11313}-" .
96             "\N{U+11328}\N{U+1132a}-\N{U+11330}\N{U+11332}\N{U+11333}\N{U+11335}-\N{U+11339}\N{U+1133c}-\N{U+11344}\N{U+11347}" .
97             "\N{U+11348}\N{U+1134b}-\N{U+1134d}\N{U+11357}\N{U+1135d}-\N{U+11363}\N{U+11366}-\N{U+1136c}\N{U+11370}-\N{U+11374}" .
98             "\N{U+11480}-\N{U+114c5}\N{U+114c7}\N{U+11580}-\N{U+115b5}\N{U+115b8}-\N{U+115c0}\N{U+11600}-\N{U+11640}\N{U+11644}" .
99             "\N{U+11680}-\N{U+116b7}\N{U+118a0}-\N{U+118df}\N{U+118ff}\N{U+11ac0}-\N{U+11af8}\N{U+1236f}-\N{U+12398}\N{U+16a40}-" .
100             "\N{U+16a5e}\N{U+16ad0}-\N{U+16aed}\N{U+16af0}-\N{U+16af4}\N{U+16b00}-\N{U+16b36}\N{U+16b40}-\N{U+16b43}\N{U+16b63}-" .
101             "\N{U+16b77}\N{U+16b7d}-\N{U+16b8f}\N{U+16f00}-\N{U+16f44}\N{U+16f50}-\N{U+16f7e}\N{U+16f8f}-\N{U+16f9f}\N{U+1bc00}-" .
102             "\N{U+1bc6a}\N{U+1bc70}-\N{U+1bc7c}\N{U+1bc80}-\N{U+1bc88}\N{U+1bc90}-\N{U+1bc99}\N{U+1bc9d}\N{U+1bc9e}\N{U+1e800}-" .
103             "\N{U+1e8c4}\N{U+1e8d0}-\N{U+1e8d6}\N{U+1ee00}-\N{U+1ee03}\N{U+1ee05}-\N{U+1ee1f}\N{U+1ee21}\N{U+1ee22}\N{U+1ee24}" .
104             "\N{U+1ee27}\N{U+1ee29}-\N{U+1ee32}\N{U+1ee34}-\N{U+1ee37}\N{U+1ee39}\N{U+1ee3b}\N{U+1ee42}\N{U+1ee47}\N{U+1ee49}" .
105             "\N{U+1ee4b}\N{U+1ee4d}-\N{U+1ee4f}\N{U+1ee51}\N{U+1ee52}\N{U+1ee54}\N{U+1ee57}\N{U+1ee59}\N{U+1ee5b}\N{U+1ee5d}\N{U+1ee5f}" .
106             "\N{U+1ee61}\N{U+1ee62}\N{U+1ee64}\N{U+1ee67}-\N{U+1ee6a}\N{U+1ee6c}-\N{U+1ee72}\N{U+1ee74}-\N{U+1ee77}\N{U+1ee79}-" .
107             "\N{U+1ee7c}\N{U+1ee7e}\N{U+1ee80}-\N{U+1ee89}\N{U+1ee8b}-\N{U+1ee9b}\N{U+1eea1}-\N{U+1eea3}\N{U+1eea5}-\N{U+1eea9}" .
108             "\N{U+1eeab}-\N{U+1eebb}";
109              
110             our $HASHTAG_NUMERALS = "\\p{Nd}" .
111             "\N{U+0de6}-\N{U+0def}\N{U+a9f0}-\N{U+a9f9}\N{U+110f0}-\N{U+110f9}\N{U+11136}-\N{U+1113f}\N{U+111d0}-\N{U+111d9}\N{U+112f0}-" .
112             "\N{U+112f9}\N{U+114d0}-\N{U+114d9}\N{U+11650}-\N{U+11659}\N{U+116c0}-\N{U+116c9}\N{U+118e0}-\N{U+118e9}\N{U+16a60}-" .
113             "\N{U+16a69}\N{U+16b50}-\N{U+16b59}";
114              
115             our $HASHTAG_SPECIAL_CHARS = "_\N{U+200c}\N{U+200d}\N{U+a67e}\N{U+05be}\N{U+05f3}\N{U+05f4}\N{U+ff5e}\N{U+301c}\N{U+309b}\N{U+309c}\N{U+30a0}\N{U+30fb}\N{U+3003}\N{U+0f0b}\N{U+0f0c}\N{U+00b7}";
116              
117             our $HASHTAG_LETTERS_NUMERALS = "$HASHTAG_LETTERS_AND_MARKS$HASHTAG_NUMERALS$HASHTAG_SPECIAL_CHARS";
118             our $HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]";
119             our $HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]";
120              
121             our $HASHTAG = qr/(\A|\N{U+fe0e}|\N{U+fe0f}|[^&$HASHTAG_LETTERS_NUMERALS])(#|＃)(?!\N{U+fe0f}|\N{U+20e3})($HASHTAG_LETTERS_NUMERALS_SET*$HASHTAG_LETTERS_SET$HASHTAG_LETTERS_NUMERALS_SET*)/i;
122              
123             our $valid_hashtag = qr/$HASHTAG/i;
124             our $end_hashtag_match = qr/\A(?:[#＃]|:\/\/)/;
125              
126             our $valid_mention_preceding_chars = qr/(?:[^a-z0-9_!#\$%&*@＠]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/i;
127             our $at_signs = qr/[@＠]/;
128             our $valid_mention_or_list = qr/
129             ($valid_mention_preceding_chars) # $1: Preceding character
130             ($at_signs) # $2: At mark
131             ([a-z0-9_]{1,20}) # $3: Screen name
132             (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
133             /ix;
134             our $valid_reply = qr/^(?:[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])*$at_signs([a-z0-9_]{1,20})/i;
135             # Used in Extractor for final filtering
136             our $end_mention_match = qr/\A(?:$at_signs|$latin_accents|:\/\/)/i;
137              
138             our $valid_subdomain = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[_-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
                # Same shape as $valid_subdomain: one label followed by a literal dot,
                # where the label must start and end with a $DOMAIN_VALID_CHARS character.
                # The only difference is the interior: a subdomain label may contain
                # "_" or "-", a domain-name label may contain only "-".
139             our $valid_domain_name = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
140              
141             our $GENERIC_TLDS = join '|', @{$TLDS->{generic}};
142             our $CC_TLDS = join '|', @{$TLDS->{country}};
143              
144             our $valid_gTLD = qr{
145             (?:
146             (?:$GENERIC_TLDS)
147             (?=[^0-9a-z@+-]|$)
148             )
149             }ix;
150              
151             our $valid_ccTLD = qr{
152             (?:
153             (?:$CC_TLDS)
154             (?=[^0-9a-z@+-]|$)
155             )
156             }ix;
157             our $valid_punycode = qr/(?:xn--[0-9a-z]+)/i;
158              
159             our $valid_domain = qr/(?:
160             $valid_subdomain*$valid_domain_name
161             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
162             )/ix;
163              
164             # This is used in Extractor
165             our $valid_ascii_domain = qr/
166             (?:(?:[a-z0-9\-_]|$latin_accents)+\.)+
167             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
168             /ix;
169              
170             # This is used in Extractor for stricter t.co URL extraction
171             our $valid_tco_url = qr/^https?:\/\/t\.co\/([a-z0-9]+)/i;
172              
173             our $valid_port_number = qr/[0-9]+/;
174              
175             our $valid_url_preceding_chars = qr/(?:[^A-Z0-9@＠\$#＃$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)/i;
176             our $invalid_url_without_protocol_preceding_chars = qr/[-_.\/]$/;
177              
178             our $valid_general_url_path_chars = qr/[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|$LATIN_ACCENTS]/i;
179             # Allow URL paths to contain up to two nested levels of balanced parens
180             # 1. Used in Wikipedia URLs like /Primer_(film)
181             # 2. Used in IIS sessions like /S(dfd346)/
182             # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
183             our $valid_url_balanced_parens = qr/
184             \(
185             (?:
186             $valid_general_url_path_chars+
187             |
188             # allow one nested level of balanced parentheses
189             (?:
190             $valid_general_url_path_chars*
191             \(
192             $valid_general_url_path_chars+
193             \)
194             $valid_general_url_path_chars*
195             )
196             )
197             \)
198             /ix;
199             # Valid end-of-path characters (so /foo. does not gobble the period).
200             # 1. Allow =&# for empty URL parameters and other URL-join artifacts
201             our $valid_url_path_ending_chars = qr/[a-z\p{Cyrillic}0-9=_#\/\+\-$LATIN_ACCENTS]|(?:$valid_url_balanced_parens)/i;
202             our $valid_url_path = qr/(?:
203             (?:
204             $valid_general_url_path_chars*
205             (?:$valid_url_balanced_parens $valid_general_url_path_chars*)*
206             $valid_url_path_ending_chars
207             )|(?:$valid_general_url_path_chars+\/)
208             )/ix;
209             our $valid_url_query_chars = qr/[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i;
210             our $valid_url_query_ending_chars = qr/[a-z0-9_&=#\/\-]/i;
211             our $valid_url = qr{
212             ( # $1 total match
213             ($valid_url_preceding_chars) # $2 Preceding character
214             ( # $3 URL
215             (https?:\/\/)? # $4 Protocol (optional)
216             ($valid_domain) # $5 Domain(s)
217             (?::($valid_port_number))? # $6 Port number (optional)
218             (/$valid_url_path*)? # $7 URL Path and anchor
219             (\?$valid_url_query_chars*$valid_url_query_ending_chars)? # $8 Query String
220             )
221             )}ix;
222              
223             our $cashtag = qr/[a-z]{1,6}(?:[._][a-z]{1,2})?/i;
                # $cashtag (above): 1-6 letters, optionally followed by "." or "_" and
                # 1-2 more letters, matched case-insensitively.
                # $valid_cashtag (below) captures:
                #   $1 - what precedes the tag: start of string, or one character from
                #        $UNICODE_SPACES / $DIRECTIONAL_CHARACTERS
                #   $2 - the literal "$" sign
                #   $3 - the symbol itself ($cashtag)
                # The trailing lookahead requires end of string, whitespace, or one of
                # $PUNCTUATION_CHARS right after the symbol, without consuming it.
224             our $valid_cashtag = qr/(^|[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])(\$)($cashtag)(?=$|\s|[$PUNCTUATION_CHARS])/i;
225              
226             # These URL validation pattern strings are based on the ABNF from RFC 3986
227             our $validate_url_unreserved = qr/[a-z\p{Cyrillic}0-9\p{Pd}._~]/i;
228             our $validate_url_pct_encoded = qr/(?:%[0-9a-f]{2})/i;
229             our $validate_url_sub_delims = qr/[!\$&'()*+,;=]/i;
230             our $validate_url_pchar = qr/(?:
231             $validate_url_unreserved|
232             $validate_url_pct_encoded|
233             $validate_url_sub_delims|
234             [:\|@]
235             )/ix;
236              
237             our $validate_url_scheme = qr/(?:[a-z][a-z0-9+\-.]*)/i;
238             our $validate_url_userinfo = qr/(?:
239             $validate_url_unreserved|
240             $validate_url_pct_encoded|
241             $validate_url_sub_delims|
242             :
243             )*/ix;
244              
245             our $validate_url_dec_octet = qr/(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i;
246             our $validate_url_ipv4 =
247             qr/(?:$validate_url_dec_octet(?:\.$validate_url_dec_octet){3})/ix;
248              
249             # Punting on real IPv6 validation for now
250             our $validate_url_ipv6 = qr/(?:\[[a-f0-9:\.]+\])/i;
251              
252             # Also punting on IPvFuture for now
253             our $validate_url_ip = qr/(?:
254             $validate_url_ipv4|
255             $validate_url_ipv6
256             )/ix;
257              
258             # This is more strict than the rfc specifies
259             our $validate_url_subdomain_segment = qr/(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i;
260             our $validate_url_domain_segment = qr/(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i;
261             our $validate_url_domain_tld = qr/(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i;
262             our $validate_url_domain = qr/(?:(?:$validate_url_subdomain_segment\.)*
263             (?:$validate_url_domain_segment\.)
264             $validate_url_domain_tld)/ix;
265              
266             our $validate_url_host = qr/(?:
267             $validate_url_ip|
268             $validate_url_domain
269             )/ix;
270              
271             # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
272             our $validate_url_unicode_subdomain_segment =
273             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
274             our $validate_url_unicode_domain_segment =
275             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
276             our $validate_url_unicode_domain_tld =
277             qr/(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
278             our $validate_url_unicode_domain = qr/(?:(?:$validate_url_unicode_subdomain_segment\.)*
279             (?:$validate_url_unicode_domain_segment\.)
280             $validate_url_unicode_domain_tld)/ix;
281              
282             our $validate_url_unicode_host = qr/(?:
283             $validate_url_ip|
284             $validate_url_unicode_domain
285             )/ix;
286              
287             our $validate_url_port = qr/[0-9]{1,5}/;
288              
289             our $validate_url_unicode_authority = qr{
290             (?:($validate_url_userinfo)@)? # $1 userinfo
291             ($validate_url_unicode_host) # $2 host
292             (?::($validate_url_port))? # $3 port
293             }ix;
294              
295             our $validate_url_authority = qr{
296             (?:($validate_url_userinfo)@)? # $1 userinfo
297             ($validate_url_host) # $2 host
298             (?::($validate_url_port))? # $3 port
299             }ix;
300              
301             our $validate_url_path = qr{(/$validate_url_pchar*)*}i;
302             our $validate_url_query = qr{($validate_url_pchar|/|\?)*}i;
303             our $validate_url_fragment = qr{($validate_url_pchar|/|\?)*}i;
304              
305             # Modified version of RFC 3986 Appendix B
306             our $validate_url_unencoded = qr{
307             \A # Full URL
308             (?:
309             ([^:/?#]+):// # $1 Scheme
310             )?
311             ([^/?#]*) # $2 Authority
312             ([^?#]*) # $3 Path
313             (?:
314             \?([^#]*) # $4 Query
315             )?
316             (?:
317             \#(.*) # $5 Fragment
318             )?\z
319             }ix;
320              
321             1;