File Coverage

blib/lib/Twitter/Text.pm

Criterion	Covered	Total	%
statement	253	253	100.0
branch	91	94	96.8
condition	55	66	83.3
subroutine	36	36	100.0
pod	12	16	75.0
total	447	465	96.1

line	stmt	bran	cond	sub	pod	time	code
1							package Twitter::Text;
2	4			4		2377	use 5.014001;
	4					18
3	4			4		30	use strict;
	4					10
	4					92
4	4			4		20	use warnings;
	4					8
	4					114
5	4			4		623	use utf8;
	4					20
	4					36
6							use constant {
7	4					466	DEFAULT_TCO_URL_LENGTHS => {
8							short_url_length => 23,
9							},
10							MAX_WEIGHTENED_LENGTH => 280,
11							MAX_URL_LENGTH => 4096,
12							MAX_TCO_SLUG_LENGTH => 40,
13							URL_PROTOCOL_LENGTH => length 'https://',
14	4			4		236	};
	4					8
15	4			4		34	use Carp qw(croak);
	4					7
	4					195
16	4			4		24	use Exporter 'import';
	4					7
	4					168
17	4			4		34	use List::Util qw(min);
	4					8
	4					275
18	4			4		1981	use List::UtilsBy qw(nsort_by);
	4					7756
	4					271
19	4			4		1830	use Net::IDN::Encode qw(domain_to_ascii);
	4					391777
	4					695
20	4			4		1224	use Twitter::Text::Configuration;
	4					33
	4					180
21	4			4		2118	use Twitter::Text::Regexp;
	4					16
	4					428
22	4			4		2368	use Twitter::Text::Regexp::Emoji;
	4					11
	4					202
23	4			4		34	use Unicode::Normalize qw(NFC);
	4					9
	4					10818
24
25							our $VERSION = "0.07";
26							our @EXPORT = (
27							# Extraction
28							qw(
29							extract_cashtags
30							extract_cashtags_with_indices
31							extract_hashtags
32							extract_hashtags_with_indices
33							extract_mentioned_screen_names
34							extract_mentioned_screen_names_with_indices
35							extract_mentions_or_lists_with_indices
36							extract_urls
37							extract_urls_with_indices
38							),
39							# Validation
40							qw(
41							is_valid_hashtag
42							is_valid_list
43							is_valid_tweet
44							is_valid_url
45							is_valid_username
46							parse_tweet
47							),
48							);
49
50							sub extract_emoji_with_indices {
51	24			24	0	291	my ($text) = @_;
52	24					45	my $emoji = [];
53
54	24					10608	while ($text =~ /($Twitter::Text::Regexp::Emoji::valid_emoji)/g) {
55	318					719	my $emoji_text = $1;
56	318					1421	my $start_position = $-[1];
57	318					1437	my $end_position = $+[1];
58	318					4479	push @$emoji, {
59							emoji => $emoji_text,
60							indices => [ $start_position, $end_position ],
61							};
62							}
63	24					85	return $emoji;
64							}
65
66							sub _remove_overlapping_entities {
67	1			1		3	my ($entities) = @_;
68
69	1			4		11	$entities = [ nsort_by { $_->{indices}->[0] } @$entities ];
	4					20
70							# remove duplicates
71	1					20	my $ret = [];
72	1					2	my $prev;
73
74	1					3	for my $entity (@$entities) {
75	4	100	100			16	unless ($prev && $prev->{indices}->[1] > $entity->{indices}->[0]) {
76	2					3	push @$ret, $entity;
77							}
78	4					6	$prev = $entity;
79							}
80	1					3	return $ret;
81							}
82
83							sub extract_cashtags {
84	8			8	0	7931	my ($text) = @_;
85	8					17	return [ map { $_->{cashtag} } @{ extract_cashtags_with_indices($text) } ];
	10					36
	8					19
86							}
87
88							sub extract_cashtags_with_indices {
89	11			11	0	6558	my ($text) = @_;
90
91	11	100				54	return [] unless $text =~ /\$/;
92
93	10					19	my $tags = [];
94
95	10					232	while ($text =~ /($Twitter::Text::Regexp::valid_cashtag)/g) {
96	14					52	my ($before, $dollar, $cash_text) = ($2, $3, $4);
97	14					34	my $start_position = $-[3];
98	14					35	my $end_position = $+[4];
99	14					130	push @$tags, {
100							cashtag => $cash_text,
101							indices => [ $start_position, $end_position ],
102							};
103							}
104
105	10					37	return $tags;
106							}
107
108							sub extract_hashtags {
109	75			75	1	969725	my ($text) = @_;
110	75					108	return [ map { $_->{hashtag} } @{ extract_hashtags_with_indices($text) } ];
	134					378
	75					142
111							}
112
113							sub extract_hashtags_with_indices {
114	85			85	1	12175	my ($text, $options) = @_;
115
116	85	100				393	return [] unless $text =~ /[#＃]/;
117
118	84	100				222	$options->{check_url_overlap} = 1 unless exists $options->{check_url_overlap};
119
120	84					113	my $tags = [];
121
122	84					1553	while ($text =~ /($Twitter::Text::Regexp::valid_hashtag)/gp) {
123	151					3568	my ($before, $hash, $hash_text) = ($2, $3, $4);
124	151					316	my $start_position = $-[3];
125	151					305	my $end_position = $+[4];
126	151					234	my $after = ${^POSTMATCH};
127
128	151	100				516	unless ($after =~ $Twitter::Text::Regexp::end_hashtag_match) {
129	149					1439	push @$tags, {
130							hashtag => $hash_text,
131							indices => [ $start_position, $end_position ],
132							};
133							}
134							}
135
136	84	100				206	if ($options->{check_url_overlap}) {
137	83					166	my $urls = extract_urls_with_indices($text);
138
139	83	100				197	if (@$urls) {
140	1					3	$tags = [ @$tags, @$urls ];
141							# remove duplicates
142	1					5	$tags = _remove_overlapping_entities($tags);
143							# remove URL entities
144	1					3	$tags = [ grep { $_->{hashtag} } @$tags ];
	2					5
145							}
146							}
147
148	84					205	return $tags;
149							}
150
151							sub extract_mentioned_screen_names {
152	27			27	1	13807	my ($text) = @_;
153	27					35	return [ map { $_->{screen_name} } @{ extract_mentioned_screen_names_with_indices($text) } ];
	28					86
	27					67
154							}
155
156							sub extract_mentioned_screen_names_with_indices {
157	32			32	1	6897	my ($text) = @_;
158
159	32	100				79	return [] unless $text;
160
161	31					40	my $possible_screen_name = [];
162
163	31					37	for my $mention_or_list (@{ extract_mentions_or_lists_with_indices($text) }) {
	31					60
164	32	100				59	next if length $mention_or_list->{list_slug};
165							push @$possible_screen_name, {
166							screen_name => $mention_or_list->{screen_name},
167							indices => $mention_or_list->{indices},
168	31					94	};
169							}
170
171	31					91	return $possible_screen_name;
172							}
173
174							sub extract_mentions_or_lists_with_indices {
175	37			37	1	8009	my ($text) = @_;
176
177	37	100				183	return [] unless $text =~ /[@＠]/;
178
179	36					60	my $possible_entries = [];
180
181	36					516	while ($text =~ /($Twitter::Text::Regexp::valid_mention_or_list)/gp) {
182	42					151	my ($before, $at, $screen_name, $list_slug) = ($2, $3, $4, $5);
183	42					101	my $start_position = $-[4] - 1;
184	42	100				125	my $end_position = $+[ defined $list_slug ? 5 : 4 ];
185	42					78	my $after = ${^POSTMATCH};
186
187	42	100				204	unless ($after =~ $Twitter::Text::Regexp::end_mention_match) {
188	38		100			430	push @$possible_entries, {
189							screen_name => $screen_name,
190							list_slug => $list_slug \|\| '',
191							indices => [ $start_position, $end_position ],
192							};
193							}
194							}
195	36					90	return $possible_entries;
196							}
197
198							sub extract_urls {
199	1665			1665	1	6931974	my ($text) = @_;
200	1665					3817	my $urls = extract_urls_with_indices($text);
201	1665					3092	return [ map { $_->{url} } @$urls ];
	1674					8166
202							}
203
204							sub extract_urls_with_indices {
205	1813			1813	1	48343	my ($text, $options) = @_;
206	1813		100			9937	$options \|\|= {
207							extract_url_without_protocol => 1,
208							};
209
210	1813	100	100			11002	return [] unless $text && ($options->{extract_url_without_protocol} ? $text =~ /\./ : $text =~ /:/);
		100
211
212	1707					3414	my $urls = [];
213
214	1707					70385	while ($text =~ /($Twitter::Text::Regexp::valid_url)/g) {
215	1765					21539	my $before = $3;
216	1765					3882	my $url = $4;
217	1765					3383	my $protocol = $5;
218	1765					3095	my $domain = $6;
219	1765					2896	my $path = $8;
220	1765					8298	my ($start, $end) = ($-[4], $+[4]);
221
222	1765	100				5415	if (!$protocol) {
223	59	100	66			619	next if !$options->{extract_url_without_protocol} \|\| $before =~ $Twitter::Text::Regexp::invalid_url_without_protocol_preceding_chars;
224	44					69	my $last_url;
225
226	44					8030	while ($domain =~ /($Twitter::Text::Regexp::valid_ascii_domain)/g) {
227	55					139	my $ascii_domain = $1;
228	55	100				141	next unless _is_valid_domain(length $url, $ascii_domain, $protocol);
229	53					250	$last_url = {
230							url => $ascii_domain,
231							indices => [ $start + $-[0], $start + $+[0] ],
232							};
233	53					575	push @$urls, $last_url;
234							}
235
236							# no ASCII-only domain found. Skip the entire URL
237	44	100				146	next unless $last_url;
238
239							# last_url only contains domain. Need to add path and query if they exist.
240	42	100				570	if ($path) {
241							# last_url was not added. Add it to urls here.
242	15					286	$last_url->{url} = $url =~ s/$domain/$last_url->{url}/re;
	15					52
243	15					222	$last_url->{indices}->[1] = $end;
244							}
245							} else {
246	1706	100				10162	if ($url =~ /($Twitter::Text::Regexp::valid_tco_url)/) {
247	15	100	66			94	next if $2 && length $2 >= MAX_TCO_SLUG_LENGTH;
248	14					31	$url = $1;
249	14					27	$end = $start + length $url;
250							}
251
252	1705	100				5357	next unless _is_valid_domain(length $url, $domain, $protocol);
253
254	1695					26112	push @$urls, {
255							url => $url,
256							indices => [ $start, $end ],
257							};
258
259							}
260							}
261
262	1707					11738	return $urls;
263							}
264
265							sub _is_valid_domain {
266	1760			1760		4053	my ($url_length, $domain, $protocol) = @_;
267	1760	50				3552	croak 'invalid empty domain' unless $domain;
268
269	1760					3115	my $original_domain_length = length $domain;
270	1760					2614	my $encoded_domain = eval { domain_to_ascii($domain) };
	1760					4900
271
272	1760	100				307979	if ($@) {
273	12					3715	return 0;
274							}
275	1748					3565	my $updated_domain_length = length $encoded_domain;
276	1748	100				3862	$url_length += $updated_domain_length - $original_domain_length if $updated_domain_length > $original_domain_length;
277	1748	100				3167	$url_length += URL_PROTOCOL_LENGTH unless $protocol;
278	1748					5249	return $url_length <= MAX_URL_LENGTH;
279							}
280
281							sub is_valid_tweet {
282	8			8	0	640	my ($text) = @_;
283							return parse_tweet(
284							$text,
285							{
286							config => Twitter::Text::Configuration::V1,
287							}
288	8					24	)->{valid};
289							}
290
291							sub is_valid_hashtag {
292	8			8	1	6039	my ($hashtag) = @_;
293
294	8	100				35	return 0 unless length $hashtag;
295
296	7					19	my $extracted = extract_hashtags($hashtag);
297	7		66			49	return scalar(@$extracted) == 1 && $extracted->[0] eq (substr $hashtag, 1);
298							}
299
300							sub is_valid_list {
301	6			6	1	4460	my ($username_list) = @_;
302	6		66			209	return !!($username_list =~ /\A($Twitter::Text::Regexp::valid_mention_or_list)\z/ && $2 eq '' && $5 && length $5);
303							}
304
305							sub is_valid_url {
306	33			33	1	31272	my ($url, %opts) = @_;
307	33	100				88	my $unicode_domains = exists $opts{unicode_domains} ? $opts{unicode_domains} : 1;
308	33	100				71	my $require_protocol = exists $opts{require_protocol} ? $opts{require_protocol} : 1;
309
310	33	100				87	return 0 unless $url;
311
312	31					407	my ($url_parts) = $url =~ /($Twitter::Text::Regexp::validate_url_unencoded)/;
313	31	50	33			161	return 0 unless $url_parts && $url_parts eq $url;
314
315	31					153	my ($scheme, $authorithy, $path, $query, $fragment) = ($2, $3, $4, $5, $6);
316	31	100	100			105	return 0 unless ((!$require_protocol \|\| (_valid_match($scheme, $Twitter::Text::Regexp::validate_url_scheme) && $scheme =~ /\Ahttps?\Z/i))
			100
			66
			100
317							&& _valid_match($path, $Twitter::Text::Regexp::validate_url_path)
318							&& _valid_match($query, $Twitter::Text::Regexp::validate_url_query, 1)
319							&& _valid_match($fragment, $Twitter::Text::Regexp::validate_url_fragment, 1));
320
321	28		66			120	return ($unicode_domains && _valid_match($authorithy, $Twitter::Text::Regexp::validate_url_unicode_authority))
322							\|\| (!$unicode_domains && _valid_match($authorithy, $Twitter::Text::Regexp::validate_url_authority));
323							}
324
325							sub _valid_match {
326	142			142		7588	my ($string, $regex, $optional) = @_;
327	142	100	100			3396	return (defined $string && ($string =~ /($regex)/) && $1 eq $string) unless $optional;
328	58		100			340	return !(defined $string && (!($string =~ /($regex)/) \|\| $1 ne $string));
329							}
330
331							sub is_valid_username {
332	5			5	1	3801	my ($username) = @_;
333
334	5	100				37	return 0 unless $username;
335
336	4					11	my $extracted = extract_mentioned_screen_names($username);
337	4		66			30	return scalar(@$extracted) == 1 && $extracted->[0] eq substr($username, 1);
338							}
339
340							sub parse_tweet {
341	52			52	1	58166	my ($text, $options) = @_;
342							# merge options
343	52		100			200	$options \|\|= {};
344	52					85	$options->{$_} = DEFAULT_TCO_URL_LENGTHS()->{$_} for keys %{ DEFAULT_TCO_URL_LENGTHS() };
	52					209
345
346	52					3390	my $normalized_text = NFC($text);
347
348	52	100				386	return _empty_parse_results() unless length $normalized_text > 0;
349
350	51		66			207	my $config = $options->{config} \|\| Twitter::Text::Configuration::default_configuration;
351	51					784	my $scale = $config->{scale};
352	51					103	my $max_weighted_tweet_length = $config->{maxWeightedTweetLength};
353	51					88	my $scaled_max_weighted_tweet_length = $max_weighted_tweet_length * $scale;
354	51					84	my $transformed_url_length = $config->{transformedURLLength} * $scale;
355	51					88	my $ranges = $config->{ranges};
356
357	51					132	my $url_entities = extract_urls_with_indices($normalized_text);
358	51	100				249	my $emoji_entities = $config->{emojiParsingEnabled} ? extract_emoji_with_indices($normalized_text) : [];
359
360	51					98	my $has_invalid_chars = 0;
361	51					72	my $weighted_count = 0;
362	51					111	my $offset = 0;
363	51					67	my $display_offset = 0;
364	51					76	my $valid_offset = 0;
365
366	51					121	while ($offset < length $normalized_text) {
367	29340					41097	my $char_weight = $config->{defaultWeight};
368	29340					35602	my $entity_length = 0;
369
370	29340					43083	for my $url_entity (@$url_entities) {
371	26249	100				50044	if ($url_entity->{indices}->[0] == $offset) {
372	37					65	$entity_length = $url_entity->{indices}->[1] - $url_entity->{indices}->[0];
373	37					60	$weighted_count += $transformed_url_length;
374	37					59	$offset += $entity_length;
375	37					60	$display_offset += $entity_length;
376
377	37	100				74	if ($weighted_count <= $scaled_max_weighted_tweet_length) {
378	31					40	$valid_offset += $entity_length;
379							}
380							# Finding a match breaks the loop
381	37					56	last;
382							}
383							}
384
385	29340					41421	for my $emoji_entity (@$emoji_entities) {
386	22967	100				37740	if ($emoji_entity->{indices}->[0] == $offset) {
387	318					417	$entity_length = $emoji_entity->{indices}->[1] - $emoji_entity->{indices}->[0];
388	318					359	$weighted_count += $char_weight; # the default weight
389	318					371	$offset += $entity_length;
390	318					371	$display_offset += $entity_length;
391
392	318	100				498	if ($weighted_count <= $scaled_max_weighted_tweet_length) {
393	298					346	$valid_offset += $entity_length;
394							}
395							# Finding a match breaks the loop
396	318					382	last;
397							}
398							}
399
400	29340	100				47563	next if $entity_length > 0;
401
402	28985	50				48331	if ($offset < length $normalized_text) {
403	28985					48933	my $code_point = substr $normalized_text, $offset, 1;
404
405	28985					41969	for my $range (@$ranges) {
406	32573					62107	my ($chr) = unpack 'U', $code_point;
407	32573					55917	my ($range_start, $range_end) = ($range->{start}, $range->{end});
408
409	32573	100	100			87656	if ($range_start <= $chr && $chr <= $range_end) {
410	26811					35914	$char_weight = $range->{weight};
411	26811					38408	last;
412							}
413							}
414
415	28985					34382	$weighted_count += $char_weight;
416
417	28985	100				54432	$has_invalid_chars = _contains_invalid($code_point) unless $has_invalid_chars;
418	28985					47987	my $codepoint_length = length $code_point;
419	28985					34532	$offset += $codepoint_length;
420	28985					32935	$display_offset += $codepoint_length;
421
422	28985	100	100			93745	if (!$has_invalid_chars && ($weighted_count <= $scaled_max_weighted_tweet_length)) {
423	5190					10029	$valid_offset += $codepoint_length;
424							}
425							}
426							}
427
428	51					307	my $normalized_text_offset = length($text) - length($normalized_text);
429	51					118	my $scaled_weighted_length = $weighted_count / $scale;
430	51		100			193	my $is_valid = !$has_invalid_chars && ($scaled_weighted_length <= $max_weighted_tweet_length);
431	51					148	my $permilage = int($scaled_weighted_length * 1000 / $max_weighted_tweet_length);
432
433							return +{
434	51	100				695	weighted_length => $scaled_weighted_length,
435							valid => $is_valid ? 1 : 0,
436							permillage => $permilage,
437							display_range_start => 0,
438							display_range_end => $display_offset + $normalized_text_offset - 1,
439							valid_range_start => 0,
440							valid_range_end => $valid_offset + $normalized_text_offset - 1,
441							};
442							}
443
444							sub _empty_parse_results {
445							return {
446	1			1		13	weighted_length => 0,
447							valid => 0,
448							permillage => 0,
449							display_range_start => 0,
450							display_range_end => 0,
451							valid_range_start => 0,
452							valid_range_end => 0,
453							};
454							}
455
456							sub _contains_invalid {
457	28981			28981		45185	my ($text) = @_;
458
459	28981	100	66			76798	return 0 if !$text \|\| length $text == 0;
460	28978					142388	return $text =~ qr/[$Twitter::Text::Regexp::INVALID_CHARACTERS]/;
461							}
462
463							1;
464							__END__