package Twitter::Text;

use 5.010000;
use strict;
use warnings;
use utf8;
no if $^V lt v5.13.9, 'warnings', 'utf8';    ## no critic (ValuesAndExpressions::ProhibitMismatchedOperators)

use constant {
    DEFAULT_TCO_URL_LENGTHS => {
        short_url_length => 23,
    },
    MAX_WEIGHTENED_LENGTH => 280,
    MAX_URL_LENGTH        => 4096,
    MAX_TCO_SLUG_LENGTH   => 40,
    URL_PROTOCOL_LENGTH   => length 'https://',
};
use Carp qw(croak);
use Exporter 'import';
use List::Util qw(min);
use List::UtilsBy qw(nsort_by);
use Net::IDN::Encode qw(domain_to_ascii);
use Twitter::Text::Configuration;
use Twitter::Text::Regexp;
use Twitter::Text::Regexp::Emoji;
use Unicode::Normalize qw(NFC);

our $VERSION = "0.08";
our @EXPORT = (
    # Extraction
    qw(
        extract_cashtags
        extract_cashtags_with_indices
        extract_hashtags
        extract_hashtags_with_indices
        extract_mentioned_screen_names
        extract_mentioned_screen_names_with_indices
        extract_mentions_or_lists_with_indices
        extract_urls
        extract_urls_with_indices
    ),
    # Validation
    qw(
        is_valid_hashtag
        is_valid_list
        is_valid_tweet
        is_valid_url
        is_valid_username
        parse_tweet
    ),
);

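# Synopsis (illustrative sketch, not part of the original source; it uses only
# functions exported above and result keys produced by parse_tweet below):
#
#   use Twitter::Text;
#
#   my $result = parse_tweet('Hello @twitter, see https://example.com #perl');
#   print "valid\n" if $result->{valid};
#   print "weighted length: $result->{weighted_length}\n";
#
#   my $hashtags = extract_hashtags('this is a #hashtag');    # ['hashtag']
#   my $urls     = extract_urls('see https://example.com');   # ['https://example.com']
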
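# Returns an arrayref of { emoji => $text, indices => [ $start, $end ] } for
# every match of $Twitter::Text::Regexp::Emoji::valid_emoji in the text.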
sub extract_emoji_with_indices {
    my ($text) = @_;
    my $emoji = [];

    while ($text =~ /($Twitter::Text::Regexp::Emoji::valid_emoji)/g) {
        my $emoji_text = $1;
        my $start_position = $-[1];
        my $end_position = $+[1];
        push @$emoji, {
            emoji   => $emoji_text,
            indices => [ $start_position, $end_position ],
        };
    }
    return $emoji;
}

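# Sorts entities by their start index and drops any entity whose range
# overlaps the entity kept immediately before it.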
sub _remove_overlapping_entities {
    my ($entities) = @_;

    $entities = [ nsort_by { $_->{indices}->[0] } @$entities ];
    # remove duplicates
    my $ret = [];
    my $prev;

    for my $entity (@$entities) {
        unless ($prev && $prev->{indices}->[1] > $entity->{indices}->[0]) {
            push @$ret, $entity;
        }
        $prev = $entity;
    }
    return $ret;
}

sub extract_cashtags {
    my ($text) = @_;
    return [ map { $_->{cashtag} } @{ extract_cashtags_with_indices($text) } ];
}

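# Scans the text with $Twitter::Text::Regexp::valid_cashtag; capture group 3
# is the leading '$' and group 4 the cashtag body, so the reported indices run
# from the '$' through the end of the tag.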
sub extract_cashtags_with_indices {
    my ($text) = @_;

    return [] unless $text =~ /\$/;

    my $tags = [];

    while ($text =~ /($Twitter::Text::Regexp::valid_cashtag)/g) {
        my ($before, $dollar, $cash_text) = ($2, $3, $4);
        my $start_position = $-[3];
        my $end_position = $+[4];
        push @$tags, {
            cashtag => $cash_text,
            indices => [ $start_position, $end_position ],
        };
    }

    return $tags;
}

sub extract_hashtags {
    my ($text) = @_;
    return [ map { $_->{hashtag} } @{ extract_hashtags_with_indices($text) } ];
}

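# Extracts hashtags with their [ start, end ] indices. Matches followed by
# text matching $Twitter::Text::Regexp::end_hashtag_match are skipped, and
# unless check_url_overlap is disabled, hashtags that overlap an extracted
# URL are removed.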
sub extract_hashtags_with_indices {
    my ($text, $options) = @_;

    return [] unless $text =~ /[#＃]/;

    $options->{check_url_overlap} = 1 unless exists $options->{check_url_overlap};

    my $tags = [];

    while ($text =~ /($Twitter::Text::Regexp::valid_hashtag)/gp) {
        my ($before, $hash, $hash_text) = ($2, $3, $4);
        my $start_position = $-[3];
        my $end_position = $+[4];
        my $after = ${^POSTMATCH};

        unless ($after =~ $Twitter::Text::Regexp::end_hashtag_match) {
            push @$tags, {
                hashtag => $hash_text,
                indices => [ $start_position, $end_position ],
            };
        }
    }

    if ($options->{check_url_overlap}) {
        my $urls = extract_urls_with_indices($text);

        if (@$urls) {
            $tags = [ @$tags, @$urls ];
            # remove duplicates
            $tags = _remove_overlapping_entities($tags);
            # remove URL entities
            $tags = [ grep { $_->{hashtag} } @$tags ];
        }
    }

    return $tags;
}

sub extract_mentioned_screen_names {
    my ($text) = @_;
    return [ map { $_->{screen_name} } @{ extract_mentioned_screen_names_with_indices($text) } ];
}

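# Like extract_mentions_or_lists_with_indices, but keeps only plain @mentions:
# any entry carrying a list slug is skipped.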
sub extract_mentioned_screen_names_with_indices {
    my ($text) = @_;

    return [] unless $text;

    my $possible_screen_name = [];

    for my $mention_or_list (@{ extract_mentions_or_lists_with_indices($text) }) {
        next if length $mention_or_list->{list_slug};
        push @$possible_screen_name, {
            screen_name => $mention_or_list->{screen_name},
            indices     => $mention_or_list->{indices},
        };
    }

    return $possible_screen_name;
}

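# Extracts @mentions and @user/list references. Capture group 4 is the screen
# name and group 5 the optional list slug; the start index is moved back one
# character to include the '@'. Matches followed by text matching
# $Twitter::Text::Regexp::end_mention_match are skipped.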
sub extract_mentions_or_lists_with_indices {
    my ($text) = @_;

    return [] unless $text =~ /[@＠]/;

    my $possible_entries = [];

    while ($text =~ /($Twitter::Text::Regexp::valid_mention_or_list)/gp) {
        my ($before, $at, $screen_name, $list_slug) = ($2, $3, $4, $5);
        my $start_position = $-[4] - 1;
        my $end_position = $+[ defined $list_slug ? 5 : 4 ];
        my $after = ${^POSTMATCH};

        unless ($after =~ $Twitter::Text::Regexp::end_mention_match) {
            push @$possible_entries, {
                screen_name => $screen_name,
                list_slug   => $list_slug || '',
                indices     => [ $start_position, $end_position ],
            };
        }
    }
    return $possible_entries;
}

sub extract_urls {
    my ($text) = @_;
    my $urls = extract_urls_with_indices($text);
    return [ map { $_->{url} } @$urls ];
}

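# Extracts URLs with indices. URLs without a protocol are accepted only when
# extract_url_without_protocol is enabled (the default) and the preceding text
# allows it, and only their ASCII domain (extended with any path) is reported.
# For t.co links, only the matched t.co portion is kept and slugs of
# MAX_TCO_SLUG_LENGTH or more are rejected. Every candidate is checked with
# _is_valid_domain before it is kept.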
sub extract_urls_with_indices {
    my ($text, $options) = @_;
    $options ||= {
        extract_url_without_protocol => 1,
    };

    return [] unless $text && ($options->{extract_url_without_protocol} ? $text =~ /\./ : $text =~ /:/);

    my $urls = [];

    while ($text =~ /($Twitter::Text::Regexp::valid_url)/g) {
        my $before = $3;
        my $url = $4;
        my $protocol = $5;
        my $domain = $6;
        my $path = $8;
        my ($start, $end) = ($-[4], $+[4]);

        if (!$protocol) {
            next if !$options->{extract_url_without_protocol} || $before =~ $Twitter::Text::Regexp::invalid_url_without_protocol_preceding_chars;
            my $last_url;

            while ($domain =~ /($Twitter::Text::Regexp::valid_ascii_domain)/g) {
                my $ascii_domain = $1;
                next unless _is_valid_domain(length $url, $ascii_domain, $protocol);
                $last_url = {
                    url     => $ascii_domain,
                    indices => [ $start + $-[0], $start + $+[0] ],
                };
                push @$urls, $last_url;
            }

            # no ASCII-only domain found. Skip the entire URL
            next unless $last_url;

            # last_url only contains domain. Need to add path and query if they exist.
            if ($path) {
                # last_url was not added. Add it to urls here.
                my $last_url_after = $url;
                $last_url_after =~ s/$domain/$last_url->{url}/e;
                $last_url->{url} = $last_url_after;
                $last_url->{indices}->[1] = $end;
            }
        } else {
            if ($url =~ /($Twitter::Text::Regexp::valid_tco_url)/) {
                next if $2 && length $2 >= MAX_TCO_SLUG_LENGTH;
                $url = $1;
                $end = $start + length $url;
            }

            next unless _is_valid_domain(length $url, $domain, $protocol);

            push @$urls, {
                url     => $url,
                indices => [ $start, $end ],
            };
        }
    }

    return $urls;
}

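# Checks the overall URL length after IDN-encoding the domain with
# Net::IDN::Encode::domain_to_ascii; protocol-less URLs are padded by
# URL_PROTOCOL_LENGTH. Returns true when the adjusted length fits within
# MAX_URL_LENGTH, and 0 when the domain cannot be encoded.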
sub _is_valid_domain {
    my ($url_length, $domain, $protocol) = @_;
    croak 'invalid empty domain' unless $domain;

    my $original_domain_length = length $domain;
    my $encoded_domain = eval { domain_to_ascii($domain) };

    if ($@) {
        return 0;
    }
    my $updated_domain_length = length $encoded_domain;
    $url_length += $updated_domain_length - $original_domain_length if $updated_domain_length > $original_domain_length;
    $url_length += URL_PROTOCOL_LENGTH unless $protocol;
    return $url_length <= MAX_URL_LENGTH;
}

sub is_valid_tweet {
    my ($text) = @_;
    return parse_tweet(
        $text,
        {
            config => Twitter::Text::Configuration::V1,
        }
    )->{valid};
}

sub is_valid_hashtag {
    my ($hashtag) = @_;

    return 0 unless length $hashtag;

    my $extracted = extract_hashtags($hashtag);
    return scalar(@$extracted) == 1 && $extracted->[0] eq (substr $hashtag, 1);
}

sub is_valid_list {
    my ($username_list) = @_;
    return !!($username_list =~ /\A($Twitter::Text::Regexp::valid_mention_or_list)\z/ && $2 eq '' && $5 && length $5);
}

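# Validates a full URL. Options: unicode_domains (default 1) selects the
# Unicode or ASCII authority pattern; require_protocol (default 1) demands an
# http/https scheme. Each component is checked against the corresponding
# Twitter::Text::Regexp validate_url_* pattern via _valid_match.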
sub is_valid_url {
    my ($url, %opts) = @_;
    my $unicode_domains = exists $opts{unicode_domains} ? $opts{unicode_domains} : 1;
    my $require_protocol = exists $opts{require_protocol} ? $opts{require_protocol} : 1;

    return 0 unless $url;

    my ($url_parts) = $url =~ /($Twitter::Text::Regexp::validate_url_unencoded)/;
    return 0 unless $url_parts && $url_parts eq $url;

    my ($scheme, $authority, $path, $query, $fragment) = ($2, $3, $4, $5, $6);
    return 0 unless ((!$require_protocol || (_valid_match($scheme, $Twitter::Text::Regexp::validate_url_scheme) && $scheme =~ /\Ahttps?\Z/i))
        && _valid_match($path, $Twitter::Text::Regexp::validate_url_path)
        && _valid_match($query, $Twitter::Text::Regexp::validate_url_query, 1)
        && _valid_match($fragment, $Twitter::Text::Regexp::validate_url_fragment, 1));

    return ($unicode_domains && _valid_match($authority, $Twitter::Text::Regexp::validate_url_unicode_authority))
        || (!$unicode_domains && _valid_match($authority, $Twitter::Text::Regexp::validate_url_authority));
}

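# Returns true when $string matches $regex in its entirety; with $optional
# set, an undefined $string is also accepted.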
sub _valid_match {
    my ($string, $regex, $optional) = @_;
    return (defined $string && ($string =~ /($regex)/) && $1 eq $string) unless $optional;
    return !(defined $string && (!($string =~ /($regex)/) || $1 ne $string));
}

sub is_valid_username {
    my ($username) = @_;

    return 0 unless $username;

    my $extracted = extract_mentioned_screen_names($username);
    return scalar(@$extracted) == 1 && $extracted->[0] eq substr($username, 1);
}

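# parse_tweet() implements the weighted-length algorithm: the NFC-normalized
# text is walked code point by code point; URLs count as transformedURLLength
# regardless of their actual length, emoji (when emojiParsingEnabled) count as
# the default weight, and other code points take the weight of the matching
# configuration range. The result reports the scaled weighted length, its
# permillage of maxWeightedTweetLength, validity, and display/valid ranges.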
## no critic (Subroutines::ProhibitExcessComplexity)
sub parse_tweet {
    my ($text, $options) = @_;
    # merge options
    $options ||= {};
    $options->{$_} = DEFAULT_TCO_URL_LENGTHS()->{$_} for keys %{ DEFAULT_TCO_URL_LENGTHS() };

    my $normalized_text = NFC($text);

    return _empty_parse_results() unless length $normalized_text > 0;

    my $config = $options->{config} || Twitter::Text::Configuration::default_configuration;
    my $scale = $config->{scale};
    my $max_weighted_tweet_length = $config->{maxWeightedTweetLength};
    my $scaled_max_weighted_tweet_length = $max_weighted_tweet_length * $scale;
    my $transformed_url_length = $config->{transformedURLLength} * $scale;
    my $ranges = $config->{ranges};

    my $url_entities = extract_urls_with_indices($normalized_text);
    my $emoji_entities = $config->{emojiParsingEnabled} ? extract_emoji_with_indices($normalized_text) : [];

    my $has_invalid_chars = 0;
    my $weighted_count = 0;
    my $offset = 0;
    my $display_offset = 0;
    my $valid_offset = 0;

    while ($offset < length $normalized_text) {
        my $char_weight = $config->{defaultWeight};
        my $entity_length = 0;

        for my $url_entity (@$url_entities) {
            if ($url_entity->{indices}->[0] == $offset) {
                $entity_length = $url_entity->{indices}->[1] - $url_entity->{indices}->[0];
                $weighted_count += $transformed_url_length;
                $offset += $entity_length;
                $display_offset += $entity_length;

                if ($weighted_count <= $scaled_max_weighted_tweet_length) {
                    $valid_offset += $entity_length;
                }
                # Finding a match breaks the loop
                last;
            }
        }

        for my $emoji_entity (@$emoji_entities) {
            if ($emoji_entity->{indices}->[0] == $offset) {
                $entity_length = $emoji_entity->{indices}->[1] - $emoji_entity->{indices}->[0];
                $weighted_count += $char_weight; # the default weight
                $offset += $entity_length;
                $display_offset += $entity_length;

                if ($weighted_count <= $scaled_max_weighted_tweet_length) {
                    $valid_offset += $entity_length;
                }
                # Finding a match breaks the loop
                last;
            }
        }

        next if $entity_length > 0;

        if ($offset < length $normalized_text) {
            my $code_point = substr $normalized_text, $offset, 1;

            for my $range (@$ranges) {
                my ($chr) = unpack 'U', $code_point;
                my ($range_start, $range_end) = ($range->{start}, $range->{end});

                if ($range_start <= $chr && $chr <= $range_end) {
                    $char_weight = $range->{weight};
                    last;
                }
            }

            $weighted_count += $char_weight;

            $has_invalid_chars = _contains_invalid($code_point) unless $has_invalid_chars;
            my $codepoint_length = length $code_point;
            $offset += $codepoint_length;
            $display_offset += $codepoint_length;

            if (!$has_invalid_chars && ($weighted_count <= $scaled_max_weighted_tweet_length)) {
                $valid_offset += $codepoint_length;
            }
        }
    }

    my $normalized_text_offset = length($text) - length($normalized_text);
    my $scaled_weighted_length = $weighted_count / $scale;
    my $is_valid = !$has_invalid_chars && ($scaled_weighted_length <= $max_weighted_tweet_length);
    my $permillage = int($scaled_weighted_length * 1000 / $max_weighted_tweet_length);

    return +{
        weighted_length     => $scaled_weighted_length,
        valid               => $is_valid ? 1 : 0,
        permillage          => $permillage,
        display_range_start => 0,
        display_range_end   => $display_offset + $normalized_text_offset - 1,
        valid_range_start   => 0,
        valid_range_end     => $valid_offset + $normalized_text_offset - 1,
    };
}
## use critic

sub _empty_parse_results {
    return {
        weighted_length     => 0,
        valid               => 0,
        permillage          => 0,
        display_range_start => 0,
        display_range_end   => 0,
        valid_range_start   => 0,
        valid_range_end     => 0,
    };
}

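# True when the text contains any character listed in
# $Twitter::Text::Regexp::INVALID_CHARACTERS.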
sub _contains_invalid {
    my ($text) = @_;

    return 0 if !$text || length $text == 0;
    return $text =~ qr/[$Twitter::Text::Regexp::INVALID_CHARACTERS]/;
}

1;
__END__