File Coverage

blib/lib/Twitter/Text/Regexp.pm
Criterion Covered Total %
statement 19 19 100.0
branch 2 2 100.0
condition n/a
subroutine 6 6 100.0
pod 0 1 0.0
total 27 28 96.4


line stmt bran cond sub pod time code
1             package
2             Twitter::Text::Regexp; # hide from PAUSE
3 4     4   29 use strict;
  4         8  
  4         133  
4 4     4   25 use warnings;
  4         8  
  4         133  
5 4     4   27 use utf8;
  4         10  
  4         32  
6 4     4   151 no if $^V lt v5.13.9, 'warnings', 'utf8';
  4         17  
  4         77  
7              
8 4     4   628 use Twitter::Text::Util qw(load_yaml);
  4         10  
  4         3887  
9              
10             # internal use only, do not use this module directly.
11              
12             sub regex_range {
13 68     68 0 117 my ($from, $to) = @_;
14              
15 68 100       113 if (defined $to) {
16 32         146 return pack('U', $from) . '-' . pack('U', $to);
17             } else {
18 36         668 return pack('U', $from);
19             }
20             }
21              
22             our $TLDS = load_yaml("tld_lib.yml")->[0];
23             our $PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~';
24             our $SPACE_CHARS = " \t\n\x0B\f\r";
25             our $CTRL_CHARS = "\x00-\x1F\x7F";
26             our $INVALID_CHARACTERS = join '', map { pack 'U', $_ } (
27             0xFFFE, 0xFEFF, # BOM
28             0xFFFF, # Special
29             );
30             our $UNICODE_SPACES = join '', map { pack 'U*', $_ } (
31             (0x0009..0x000D), # White_Space # Cc [5] ..
32             0x0020, # White_Space # Zs SPACE
33             0x0085, # White_Space # Cc
34             0x00A0, # White_Space # Zs NO-BREAK SPACE
35             0x1680, # White_Space # Zs OGHAM SPACE MARK
36             0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
37             (0x2000..0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
38             0x2028, # White_Space # Zl LINE SEPARATOR
39             0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
40             0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
41             0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
42             0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
43             );
44              
45             our $DIRECTIONAL_CHARACTERS = join '', map { pack 'U', $_ } (
46             0x061C, # ARABIC LETTER MARK (ALM)
47             0x200E, # LEFT-TO-RIGHT MARK (LRM)
48             0x200F, # RIGHT-TO-LEFT MARK (RLM)
49             0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
50             0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
51             0x202C, # POP DIRECTIONAL FORMATTING (PDF)
52             0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
53             0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
54             0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
55             0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
56             0x2068, # FIRST STRONG ISOLATE (FSI)
57             0x2069, # POP DIRECTIONAL ISOLATE (PDI)
58             );
59             our $DOMAIN_VALID_CHARS = "[^$DIRECTIONAL_CHARACTERS$PUNCTUATION_CHARS$SPACE_CHARS$CTRL_CHARS$INVALID_CHARACTERS$UNICODE_SPACES]";
60              
61             our $LATIN_ACCENTS = join '', (
62             regex_range(0xc0, 0xd6),
63             regex_range(0xd8, 0xf6),
64             regex_range(0xf8, 0xff),
65             regex_range(0x0100, 0x024f),
66             regex_range(0x0253, 0x0254),
67             regex_range(0x0256, 0x0257),
68             regex_range(0x0259),
69             regex_range(0x025b),
70             regex_range(0x0263),
71             regex_range(0x0268),
72             regex_range(0x026f),
73             regex_range(0x0272),
74             regex_range(0x0289),
75             regex_range(0x028b),
76             regex_range(0x02bb),
77             regex_range(0x0300, 0x036f),
78             regex_range(0x1e00, 0x1eff)
79             );
80             our $latin_accents = qr/[$LATIN_ACCENTS]+/o;
81              
82             our $HASHTAG_LETTERS_AND_MARKS = '\p{L}\p{M}' .
83             "\N{U+037f}\N{U+0528}-\N{U+052f}\N{U+08a0}-\N{U+08b2}\N{U+08e4}-\N{U+08ff}\N{U+0978}\N{U+0980}\N{U+0c00}\N{U+0c34}\N{U+0c81}\N{U+0d01}\N{U+0ede}\N{U+0edf}" .
84             "\N{U+10c7}\N{U+10cd}\N{U+10fd}-\N{U+10ff}\N{U+16f1}-\N{U+16f8}\N{U+17b4}\N{U+17b5}\N{U+191d}\N{U+191e}\N{U+1ab0}-\N{U+1abe}\N{U+1bab}-\N{U+1bad}\N{U+1bba}-" .
85             "\N{U+1bbf}\N{U+1cf3}-\N{U+1cf6}\N{U+1cf8}\N{U+1cf9}\N{U+1de7}-\N{U+1df5}\N{U+2cf2}\N{U+2cf3}\N{U+2d27}\N{U+2d2d}\N{U+2d66}\N{U+2d67}\N{U+9fcc}\N{U+a674}-" .
86             "\N{U+a67b}\N{U+a698}-\N{U+a69d}\N{U+a69f}\N{U+a792}-\N{U+a79f}\N{U+a7aa}-\N{U+a7ad}\N{U+a7b0}\N{U+a7b1}\N{U+a7f7}-\N{U+a7f9}\N{U+a9e0}-\N{U+a9ef}\N{U+a9fa}-" .
87             "\N{U+a9fe}\N{U+aa7c}-\N{U+aa7f}\N{U+aae0}-\N{U+aaef}\N{U+aaf2}-\N{U+aaf6}\N{U+ab30}-\N{U+ab5a}\N{U+ab5c}-\N{U+ab5f}\N{U+ab64}\N{U+ab65}\N{U+f870}-\N{U+f87f}" .
88             "\N{U+f882}\N{U+f884}-\N{U+f89f}\N{U+f8b8}\N{U+f8c1}-\N{U+f8d6}\N{U+fa2e}\N{U+fa2f}\N{U+fe27}-\N{U+fe2d}\N{U+102e0}\N{U+1031f}\N{U+10350}-\N{U+1037a}" .
89             "\N{U+10500}-\N{U+10527}\N{U+10530}-\N{U+10563}\N{U+10600}-\N{U+10736}\N{U+10740}-\N{U+10755}\N{U+10760}-\N{U+10767}" .
90             "\N{U+10860}-\N{U+10876}\N{U+10880}-\N{U+1089e}\N{U+10980}-\N{U+109b7}\N{U+109be}\N{U+109bf}\N{U+10a80}-\N{U+10a9c}" .
91             "\N{U+10ac0}-\N{U+10ac7}\N{U+10ac9}-\N{U+10ae6}\N{U+10b80}-\N{U+10b91}\N{U+1107f}\N{U+110d0}-\N{U+110e8}\N{U+11100}-" .
92             "\N{U+11134}\N{U+11150}-\N{U+11173}\N{U+11176}\N{U+11180}-\N{U+111c4}\N{U+111da}\N{U+11200}-\N{U+11211}\N{U+11213}-" .
93             "\N{U+11237}\N{U+112b0}-\N{U+112ea}\N{U+11301}-\N{U+11303}\N{U+11305}-\N{U+1130c}\N{U+1130f}\N{U+11310}\N{U+11313}-" .
94             "\N{U+11328}\N{U+1132a}-\N{U+11330}\N{U+11332}\N{U+11333}\N{U+11335}-\N{U+11339}\N{U+1133c}-\N{U+11344}\N{U+11347}" .
95             "\N{U+11348}\N{U+1134b}-\N{U+1134d}\N{U+11357}\N{U+1135d}-\N{U+11363}\N{U+11366}-\N{U+1136c}\N{U+11370}-\N{U+11374}" .
96             "\N{U+11480}-\N{U+114c5}\N{U+114c7}\N{U+11580}-\N{U+115b5}\N{U+115b8}-\N{U+115c0}\N{U+11600}-\N{U+11640}\N{U+11644}" .
97             "\N{U+11680}-\N{U+116b7}\N{U+118a0}-\N{U+118df}\N{U+118ff}\N{U+11ac0}-\N{U+11af8}\N{U+1236f}-\N{U+12398}\N{U+16a40}-" .
98             "\N{U+16a5e}\N{U+16ad0}-\N{U+16aed}\N{U+16af0}-\N{U+16af4}\N{U+16b00}-\N{U+16b36}\N{U+16b40}-\N{U+16b43}\N{U+16b63}-" .
99             "\N{U+16b77}\N{U+16b7d}-\N{U+16b8f}\N{U+16f00}-\N{U+16f44}\N{U+16f50}-\N{U+16f7e}\N{U+16f8f}-\N{U+16f9f}\N{U+1bc00}-" .
100             "\N{U+1bc6a}\N{U+1bc70}-\N{U+1bc7c}\N{U+1bc80}-\N{U+1bc88}\N{U+1bc90}-\N{U+1bc99}\N{U+1bc9d}\N{U+1bc9e}\N{U+1e800}-" .
101             "\N{U+1e8c4}\N{U+1e8d0}-\N{U+1e8d6}\N{U+1ee00}-\N{U+1ee03}\N{U+1ee05}-\N{U+1ee1f}\N{U+1ee21}\N{U+1ee22}\N{U+1ee24}" .
102             "\N{U+1ee27}\N{U+1ee29}-\N{U+1ee32}\N{U+1ee34}-\N{U+1ee37}\N{U+1ee39}\N{U+1ee3b}\N{U+1ee42}\N{U+1ee47}\N{U+1ee49}" .
103             "\N{U+1ee4b}\N{U+1ee4d}-\N{U+1ee4f}\N{U+1ee51}\N{U+1ee52}\N{U+1ee54}\N{U+1ee57}\N{U+1ee59}\N{U+1ee5b}\N{U+1ee5d}\N{U+1ee5f}" .
104             "\N{U+1ee61}\N{U+1ee62}\N{U+1ee64}\N{U+1ee67}-\N{U+1ee6a}\N{U+1ee6c}-\N{U+1ee72}\N{U+1ee74}-\N{U+1ee77}\N{U+1ee79}-" .
105             "\N{U+1ee7c}\N{U+1ee7e}\N{U+1ee80}-\N{U+1ee89}\N{U+1ee8b}-\N{U+1ee9b}\N{U+1eea1}-\N{U+1eea3}\N{U+1eea5}-\N{U+1eea9}" .
106             "\N{U+1eeab}-\N{U+1eebb}";
107              
108             our $HASHTAG_NUMERALS = "\\p{Nd}" .
109             "\N{U+0de6}-\N{U+0def}\N{U+a9f0}-\N{U+a9f9}\N{U+110f0}-\N{U+110f9}\N{U+11136}-\N{U+1113f}\N{U+111d0}-\N{U+111d9}\N{U+112f0}-" .
110             "\N{U+112f9}\N{U+114d0}-\N{U+114d9}\N{U+11650}-\N{U+11659}\N{U+116c0}-\N{U+116c9}\N{U+118e0}-\N{U+118e9}\N{U+16a60}-" .
111             "\N{U+16a69}\N{U+16b50}-\N{U+16b59}";
112              
113             our $HASHTAG_SPECIAL_CHARS = "_\N{U+200c}\N{U+200d}\N{U+a67e}\N{U+05be}\N{U+05f3}\N{U+05f4}\N{U+ff5e}\N{U+301c}\N{U+309b}\N{U+309c}\N{U+30a0}\N{U+30fb}\N{U+3003}\N{U+0f0b}\N{U+0f0c}\N{U+00b7}";
114              
115             our $HASHTAG_LETTERS_NUMERALS = "$HASHTAG_LETTERS_AND_MARKS$HASHTAG_NUMERALS$HASHTAG_SPECIAL_CHARS";
116             our $HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]";
117             our $HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]";
118              
119             our $HASHTAG = qr/(\A|\N{U+fe0e}|\N{U+fe0f}|[^&$HASHTAG_LETTERS_NUMERALS])(#|＃)(?!\N{U+fe0f}|\N{U+20e3})($HASHTAG_LETTERS_NUMERALS_SET*$HASHTAG_LETTERS_SET$HASHTAG_LETTERS_NUMERALS_SET*)/i;
120              
121             our $valid_hashtag = qr/$HASHTAG/i;
122             our $end_hashtag_match = qr/\A(?:[#＃]|:\/\/)/;
123              
124             our $valid_mention_preceding_chars = qr/(?:[^a-z0-9_!#\$%&*@＠]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/i;
125             our $at_signs = qr/[@＠]/;
126             our $valid_mention_or_list = qr/
127             ($valid_mention_preceding_chars) # $1: Preceding character
128             ($at_signs) # $2: At mark
129             ([a-z0-9_]{1,20}) # $3: Screen name
130             (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
131             /ix;
132             our $valid_reply = qr/^(?:[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])*$at_signs([a-z0-9_]{1,20})/i;
133             # Used in Extractor for final filtering
134             our $end_mention_match = qr/\A(?:$at_signs|$latin_accents|:\/\/)/i;
135              
136             our $valid_subdomain = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[_-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
137             our $valid_domain_name = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
138              
139             our $GENERIC_TLDS = join '|', @{$TLDS->{generic}};
140             our $CC_TLDS = join '|', @{$TLDS->{country}};
141              
142             our $valid_gTLD = qr{
143             (?:
144             (?:$GENERIC_TLDS)
145             (?=[^0-9a-z@+-]|$)
146             )
147             }ix;
148              
149             our $valid_ccTLD = qr{
150             (?:
151             (?:$CC_TLDS)
152             (?=[^0-9a-z@+-]|$)
153             )
154             }ix;
155             our $valid_punycode = qr/(?:xn--[0-9a-z]+)/i;
156              
157             our $valid_domain = qr/(?:
158             $valid_subdomain*$valid_domain_name
159             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
160             )/ix;
161              
162             # This is used in Extractor
163             our $valid_ascii_domain = qr/
164             (?:(?:[a-z0-9\-_]|$latin_accents)+\.)+
165             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
166             /ix;
167              
168             # This is used in Extractor for stricter t.co URL extraction
169             our $valid_tco_url = qr/^https?:\/\/t\.co\/([a-z0-9]+)/i;
170              
171             our $valid_port_number = qr/[0-9]+/;
172              
173             our $valid_url_preceding_chars = qr/(?:[^A-Z0-9@＠\$#＃$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)/i;
174             our $invalid_url_without_protocol_preceding_chars = qr/[-_.\/]$/;
175              
176             our $valid_general_url_path_chars = qr/[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|$LATIN_ACCENTS]/i;
177             # Allow URL paths to contain up to two nested levels of balanced parens
178             # 1. Used in Wikipedia URLs like /Primer_(film)
179             # 2. Used in IIS sessions like /S(dfd346)/
180             # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
181             our $valid_url_balanced_parens = qr/
182             \(
183             (?:
184             $valid_general_url_path_chars+
185             |
186             # allow one nested level of balanced parentheses
187             (?:
188             $valid_general_url_path_chars*
189             \(
190             $valid_general_url_path_chars+
191             \)
192             $valid_general_url_path_chars*
193             )
194             )
195             \)
196             /ix;
197             # Valid end-of-path characters (so /foo. does not gobble the period).
198             # 1. Allow =&# for empty URL parameters and other URL-join artifacts
199             our $valid_url_path_ending_chars = qr/[a-z\p{Cyrillic}0-9=_#\/\+\-$LATIN_ACCENTS]|(?:$valid_url_balanced_parens)/i;
200             our $valid_url_path = qr/(?:
201             (?:
202             $valid_general_url_path_chars*
203             (?:$valid_url_balanced_parens $valid_general_url_path_chars*)*
204             $valid_url_path_ending_chars
205             )|(?:$valid_general_url_path_chars+\/)
206             )/ix;
207             our $valid_url_query_chars = qr/[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i;
208             our $valid_url_query_ending_chars = qr/[a-z0-9_&=#\/\-]/i;
209             our $valid_url = qr{
210             ( # $1 total match
211             ($valid_url_preceding_chars) # $2 Preceding character
212             ( # $3 URL
213             (https?:\/\/)? # $4 Protocol (optional)
214             ($valid_domain) # $5 Domain(s)
215             (?::($valid_port_number))? # $6 Port number (optional)
216             (/$valid_url_path*)? # $7 URL Path and anchor
217             (\?$valid_url_query_chars*$valid_url_query_ending_chars)? # $8 Query String
218             )
219             )}ix;
220              
221             our $cashtag = qr/[a-z]{1,6}(?:[._][a-z]{1,2})?/i;
222             our $valid_cashtag = qr/(^|[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])(\$)($cashtag)(?=$|\s|[$PUNCTUATION_CHARS])/i;
223              
224             # These URL validation pattern strings are based on the ABNF from RFC 3986
225             our $validate_url_unreserved = qr/[a-z\p{Cyrillic}0-9\p{Pd}._~]/i;
226             our $validate_url_pct_encoded = qr/(?:%[0-9a-f]{2})/i;
227             our $validate_url_sub_delims = qr/[!\$&'()*+,;=]/i;
228             our $validate_url_pchar = qr/(?:
229             $validate_url_unreserved|
230             $validate_url_pct_encoded|
231             $validate_url_sub_delims|
232             [:\|@]
233             )/ix;
234              
235             our $validate_url_scheme = qr/(?:[a-z][a-z0-9+\-.]*)/i;
236             our $validate_url_userinfo = qr/(?:
237             $validate_url_unreserved|
238             $validate_url_pct_encoded|
239             $validate_url_sub_delims|
240             :
241             )*/ix;
242              
243             our $validate_url_dec_octet = qr/(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i;
244             our $validate_url_ipv4 =
245             qr/(?:$validate_url_dec_octet(?:\.$validate_url_dec_octet){3})/ix;
246              
247             # Punting on real IPv6 validation for now
248             our $validate_url_ipv6 = qr/(?:\[[a-f0-9:\.]+\])/i;
249              
250             # Also punting on IPvFuture for now
251             our $validate_url_ip = qr/(?:
252             $validate_url_ipv4|
253             $validate_url_ipv6
254             )/ix;
255              
256             # This is more strict than the rfc specifies
257             our $validate_url_subdomain_segment = qr/(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i;
258             our $validate_url_domain_segment = qr/(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i;
259             our $validate_url_domain_tld = qr/(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i;
260             our $validate_url_domain = qr/(?:(?:$validate_url_subdomain_segment\.)*
261             (?:$validate_url_domain_segment\.)
262             $validate_url_domain_tld)/ix;
263              
264             our $validate_url_host = qr/(?:
265             $validate_url_ip|
266             $validate_url_domain
267             )/ix;
268              
269             # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
270             our $validate_url_unicode_subdomain_segment =
271             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
272             our $validate_url_unicode_domain_segment =
273             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
274             our $validate_url_unicode_domain_tld =
275             qr/(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
276             our $validate_url_unicode_domain = qr/(?:(?:$validate_url_unicode_subdomain_segment\.)*
277             (?:$validate_url_unicode_domain_segment\.)
278             $validate_url_unicode_domain_tld)/ix;
279              
280             our $validate_url_unicode_host = qr/(?:
281             $validate_url_ip|
282             $validate_url_unicode_domain
283             )/ix;
284              
285             our $validate_url_port = qr/[0-9]{1,5}/;
286              
287             our $validate_url_unicode_authority = qr{
288             (?:($validate_url_userinfo)@)? # $1 userinfo
289             ($validate_url_unicode_host) # $2 host
290             (?::($validate_url_port))? # $3 port
291             }ix;
292              
293             our $validate_url_authority = qr{
294             (?:($validate_url_userinfo)@)? # $1 userinfo
295             ($validate_url_host) # $2 host
296             (?::($validate_url_port))? # $3 port
297             }ix;
298              
299             our $validate_url_path = qr{(/$validate_url_pchar*)*}i;
300             our $validate_url_query = qr{($validate_url_pchar|/|\?)*}i;
301             our $validate_url_fragment = qr{($validate_url_pchar|/|\?)*}i;
302              
303             # Modified version of RFC 3986 Appendix B
304             our $validate_url_unencoded = qr{
305             \A # Full URL
306             (?:
307             ([^:/?#]+):// # $1 Scheme
308             )?
309             ([^/?#]*) # $2 Authority
310             ([^?#]*) # $3 Path
311             (?:
312             \?([^#]*) # $4 Query
313             )?
314             (?:
315             \#(.*) # $5 Fragment
316             )?\z
317             }ix;
318              
319             1;