File Coverage

blib/lib/Twitter/Text.pm

Criterion	Covered	Total	%
statement	258	258	100.0
branch	91	94	96.8
condition	55	66	83.3
subroutine	37	37	100.0
pod	12	16	75.0
total	453	471	96.1

line	stmt	bran	cond	sub	pod	time	code
1							package Twitter::Text;
2	4			4		2618	use 5.010000;
	4					16
3	4			4		23	use strict;
	4					8
	4					82
4	4			4		20	use warnings;
	4					7
	4					101
5	4			4		619	use utf8;
	4					21
	4					34
6	4			4		812	no if $^V lt v5.13.9, 'warnings', 'utf8';
	4					20
	4					80
7
8							use constant {
9	4					487	DEFAULT_TCO_URL_LENGTHS => {
10							short_url_length => 23,
11							},
12							MAX_WEIGHTENED_LENGTH => 280,
13							MAX_URL_LENGTH => 4096,
14							MAX_TCO_SLUG_LENGTH => 40,
15							URL_PROTOCOL_LENGTH => length 'https://',
16	4			4		306	};
	4					22
17	4			4		26	use Carp qw(croak);
	4					21
	4					224
18	4			4		25	use Exporter 'import';
	4					8
	4					161
19	4			4		32	use List::Util qw(min);
	4					10
	4					273
20	4			4		2128	use List::UtilsBy qw(nsort_by);
	4					7976
	4					274
21	4			4		2132	use Net::IDN::Encode qw(domain_to_ascii);
	4					418945
	4					323
22	4			4		1410	use Twitter::Text::Configuration;
	4					11
	4					159
23	4			4		2231	use Twitter::Text::Regexp;
	4					84
	4					368
24	4			4		2350	use Twitter::Text::Regexp::Emoji;
	4					11
	4					206
25	4			4		29	use Unicode::Normalize qw(NFC);
	4					78
	4					10789
26
27							our $VERSION = "0.07_01";
28							our @EXPORT = (
29							# Extraction
30							qw(
31							extract_cashtags
32							extract_cashtags_with_indices
33							extract_hashtags
34							extract_hashtags_with_indices
35							extract_mentioned_screen_names
36							extract_mentioned_screen_names_with_indices
37							extract_mentions_or_lists_with_indices
38							extract_urls
39							extract_urls_with_indices
40							),
41							# Validation
42							qw(
43							is_valid_hashtag
44							is_valid_list
45							is_valid_tweet
46							is_valid_url
47							is_valid_username
48							parse_tweet
49							),
50							);
51
52							sub extract_emoji_with_indices {
53	24			24	0	245	my ($text) = @_;
54	24					52	my $emoji = [];
55
56	24					10669	while ($text =~ /($Twitter::Text::Regexp::Emoji::valid_emoji)/g) {
57	318					726	my $emoji_text = $1;
58	318					1418	my $start_position = $-[1];
59	318					1407	my $end_position = $+[1];
60	318					4261	push @$emoji, {
61							emoji => $emoji_text,
62							indices => [ $start_position, $end_position ],
63							};
64							}
65	24					81	return $emoji;
66							}
67
68							sub _remove_overlapping_entities {
69	1			1		4	my ($entities) = @_;
70
71	1			4		13	$entities = [ nsort_by { $_->{indices}->[0] } @$entities ];
	4					38
72							# remove duplicates
73	1					26	my $ret = [];
74	1					2	my $prev;
75
76	1					4	for my $entity (@$entities) {
77	4	100	100			20	unless ($prev && $prev->{indices}->[1] > $entity->{indices}->[0]) {
78	2					5	push @$ret, $entity;
79							}
80	4					7	$prev = $entity;
81							}
82	1					4	return $ret;
83							}
84
85							sub extract_cashtags {
86	8			8	0	8187	my ($text) = @_;
87	8					17	return [ map { $_->{cashtag} } @{ extract_cashtags_with_indices($text) } ];
	10					36
	8					18
88							}
89
90							sub extract_cashtags_with_indices {
91	11			11	0	6862	my ($text) = @_;
92
93	11	100				52	return [] unless $text =~ /\$/;
94
95	10					20	my $tags = [];
96
97	10					232	while ($text =~ /($Twitter::Text::Regexp::valid_cashtag)/g) {
98	14					51	my ($before, $dollar, $cash_text) = ($2, $3, $4);
99	14					36	my $start_position = $-[3];
100	14					39	my $end_position = $+[4];
101	14					112	push @$tags, {
102							cashtag => $cash_text,
103							indices => [ $start_position, $end_position ],
104							};
105							}
106
107	10					35	return $tags;
108							}
109
110							sub extract_hashtags {
111	75			75	1	1190522	my ($text) = @_;
112	75					117	return [ map { $_->{hashtag} } @{ extract_hashtags_with_indices($text) } ];
	134					474
	75					168
113							}
114
115							sub extract_hashtags_with_indices {
116	85			85	1	15594	my ($text, $options) = @_;
117
118	85	100				452	return [] unless $text =~ /[#＃]/;
119
120	84	100				276	$options->{check_url_overlap} = 1 unless exists $options->{check_url_overlap};
121
122	84					321	my $tags = [];
123
124	84					1786	while ($text =~ /($Twitter::Text::Regexp::valid_hashtag)/gp) {
125	151					4022	my ($before, $hash, $hash_text) = ($2, $3, $4);
126	151					388	my $start_position = $-[3];
127	151					391	my $end_position = $+[4];
128	151					311	my $after = ${^POSTMATCH};
129
130	151	100				591	unless ($after =~ $Twitter::Text::Regexp::end_hashtag_match) {
131	149					1745	push @$tags, {
132							hashtag => $hash_text,
133							indices => [ $start_position, $end_position ],
134							};
135							}
136							}
137
138	84	100				215	if ($options->{check_url_overlap}) {
139	83					177	my $urls = extract_urls_with_indices($text);
140
141	83	100				229	if (@$urls) {
142	1					4	$tags = [ @$tags, @$urls ];
143							# remove duplicates
144	1					5	$tags = _remove_overlapping_entities($tags);
145							# remove URL entities
146	1					12	$tags = [ grep { $_->{hashtag} } @$tags ];
	2					6
147							}
148							}
149
150	84					259	return $tags;
151							}
152
153							sub extract_mentioned_screen_names {
154	27			27	1	17054	my ($text) = @_;
155	27					46	return [ map { $_->{screen_name} } @{ extract_mentioned_screen_names_with_indices($text) } ];
	28					103
	27					62
156							}
157
158							sub extract_mentioned_screen_names_with_indices {
159	32			32	1	8476	my ($text) = @_;
160
161	32	100				96	return [] unless $text;
162
163	31					55	my $possible_screen_name = [];
164
165	31					45	for my $mention_or_list (@{ extract_mentions_or_lists_with_indices($text) }) {
	31					68
166	32	100				84	next if length $mention_or_list->{list_slug};
167							push @$possible_screen_name, {
168							screen_name => $mention_or_list->{screen_name},
169							indices => $mention_or_list->{indices},
170	31					89	};
171							}
172
173	31					109	return $possible_screen_name;
174							}
175
176							sub extract_mentions_or_lists_with_indices {
177	37			37	1	10134	my ($text) = @_;
178
179	37	100				176	return [] unless $text =~ /[@＠]/;
180
181	36					66	my $possible_entries = [];
182
183	36					636	while ($text =~ /($Twitter::Text::Regexp::valid_mention_or_list)/gp) {
184	42					185	my ($before, $at, $screen_name, $list_slug) = ($2, $3, $4, $5);
185	42					122	my $start_position = $-[4] - 1;
186	42	100				154	my $end_position = $+[ defined $list_slug ? 5 : 4 ];
187	42					96	my $after = ${^POSTMATCH};
188
189	42	100				213	unless ($after =~ $Twitter::Text::Regexp::end_mention_match) {
190	38		100			435	push @$possible_entries, {
191							screen_name => $screen_name,
192							list_slug => $list_slug \|\| '',
193							indices => [ $start_position, $end_position ],
194							};
195							}
196							}
197	36					117	return $possible_entries;
198							}
199
200							sub extract_urls {
201	1665			1665	1	6769053	my ($text) = @_;
202	1665					3664	my $urls = extract_urls_with_indices($text);
203	1665					3262	return [ map { $_->{url} } @$urls ];
	1674					7355
204							}
205
206							sub extract_urls_with_indices {
207	1813			1813	1	52795	my ($text, $options) = @_;
208	1813		100			9848	$options \|\|= {
209							extract_url_without_protocol => 1,
210							};
211
212	1813	100	100			11753	return [] unless $text && ($options->{extract_url_without_protocol} ? $text =~ /\./ : $text =~ /:/);
		100
213
214	1707					3357	my $urls = [];
215
216	1707					71345	while ($text =~ /($Twitter::Text::Regexp::valid_url)/g) {
217	1765					21372	my $before = $3;
218	1765					3735	my $url = $4;
219	1765					3415	my $protocol = $5;
220	1765					3131	my $domain = $6;
221	1765					2905	my $path = $8;
222	1765					8366	my ($start, $end) = ($-[4], $+[4]);
223
224	1765	100				5270	if (!$protocol) {
225	59	100	66			720	next if !$options->{extract_url_without_protocol} \|\| $before =~ $Twitter::Text::Regexp::invalid_url_without_protocol_preceding_chars;
226	44					74	my $last_url;
227
228	44					9074	while ($domain =~ /($Twitter::Text::Regexp::valid_ascii_domain)/g) {
229	55					189	my $ascii_domain = $1;
230	55	100				163	next unless _is_valid_domain(length $url, $ascii_domain, $protocol);
231	53					294	$last_url = {
232							url => $ascii_domain,
233							indices => [ $start + $-[0], $start + $+[0] ],
234							};
235	53					644	push @$urls, $last_url;
236							}
237
238							# no ASCII-only domain found. Skip the entire URL
239	44	100				143	next unless $last_url;
240
241							# last_url only contains domain. Need to add path and query if they exist.
242	42	100				738	if ($path) {
243							# last_url was not added. Add it to urls here.
244	15					24	my $last_url_after = $url;
245	15					316	$last_url_after =~ s/$domain/$last_url->{url}/e;
	15					59
246	15					35	$last_url->{url} = $last_url_after;
247	15					236	$last_url->{indices}->[1] = $end;
248							}
249							} else {
250	1706	100				10420	if ($url =~ /($Twitter::Text::Regexp::valid_tco_url)/) {
251	15	100	66			93	next if $2 && length $2 >= MAX_TCO_SLUG_LENGTH;
252	14					29	$url = $1;
253	14					29	$end = $start + length $url;
254							}
255
256	1705	100				5830	next unless _is_valid_domain(length $url, $domain, $protocol);
257
258	1695					25233	push @$urls, {
259							url => $url,
260							indices => [ $start, $end ],
261							};
262
263							}
264							}
265
266	1707					13183	return $urls;
267							}
268
269							sub _is_valid_domain {
270	1760			1760		3684	my ($url_length, $domain, $protocol) = @_;
271	1760	50				3924	croak 'invalid empty domain' unless $domain;
272
273	1760					3237	my $original_domain_length = length $domain;
274	1760					2753	my $encoded_domain = eval { domain_to_ascii($domain) };
	1760					5395
275
276	1760	100				313293	if ($@) {
277	12					3724	return 0;
278							}
279	1748					3355	my $updated_domain_length = length $encoded_domain;
280	1748	100				3494	$url_length += $updated_domain_length - $original_domain_length if $updated_domain_length > $original_domain_length;
281	1748	100				3430	$url_length += URL_PROTOCOL_LENGTH unless $protocol;
282	1748					5169	return $url_length <= MAX_URL_LENGTH;
283							}
284
285							sub is_valid_tweet {
286	8			8	0	627	my ($text) = @_;
287							return parse_tweet(
288							$text,
289							{
290							config => Twitter::Text::Configuration::V1,
291							}
292	8					21	)->{valid};
293							}
294
295							sub is_valid_hashtag {
296	8			8	1	5731	my ($hashtag) = @_;
297
298	8	100				32	return 0 unless length $hashtag;
299
300	7					19	my $extracted = extract_hashtags($hashtag);
301	7		66			47	return scalar(@$extracted) == 1 && $extracted->[0] eq (substr $hashtag, 1);
302							}
303
304							sub is_valid_list {
305	6			6	1	4382	my ($username_list) = @_;
306	6		66			208	return !!($username_list =~ /\A($Twitter::Text::Regexp::valid_mention_or_list)\z/ && $2 eq '' && $5 && length $5);
307							}
308
309							sub is_valid_url {
310	33			33	1	30588	my ($url, %opts) = @_;
311	33	100				94	my $unicode_domains = exists $opts{unicode_domains} ? $opts{unicode_domains} : 1;
312	33	100				67	my $require_protocol = exists $opts{require_protocol} ? $opts{require_protocol} : 1;
313
314	33	100				80	return 0 unless $url;
315
316	31					450	my ($url_parts) = $url =~ /($Twitter::Text::Regexp::validate_url_unencoded)/;
317	31	50	33			171	return 0 unless $url_parts && $url_parts eq $url;
318
319	31					154	my ($scheme, $authorithy, $path, $query, $fragment) = ($2, $3, $4, $5, $6);
320	31	100	100			108	return 0 unless ((!$require_protocol \|\| (_valid_match($scheme, $Twitter::Text::Regexp::validate_url_scheme) && $scheme =~ /\Ahttps?\Z/i))
			100
			66
			100
321							&& _valid_match($path, $Twitter::Text::Regexp::validate_url_path)
322							&& _valid_match($query, $Twitter::Text::Regexp::validate_url_query, 1)
323							&& _valid_match($fragment, $Twitter::Text::Regexp::validate_url_fragment, 1));
324
325	28		66			131	return ($unicode_domains && _valid_match($authorithy, $Twitter::Text::Regexp::validate_url_unicode_authority))
326							\|\| (!$unicode_domains && _valid_match($authorithy, $Twitter::Text::Regexp::validate_url_authority));
327							}
328
329							sub _valid_match {
330	142			142		7558	my ($string, $regex, $optional) = @_;
331	142	100	100			3311	return (defined $string && ($string =~ /($regex)/) && $1 eq $string) unless $optional;
332	58		100			340	return !(defined $string && (!($string =~ /($regex)/) \|\| $1 ne $string));
333							}
334
335							sub is_valid_username {
336	5			5	1	3745	my ($username) = @_;
337
338	5	100				33	return 0 unless $username;
339
340	4					11	my $extracted = extract_mentioned_screen_names($username);
341	4		66			29	return scalar(@$extracted) == 1 && $extracted->[0] eq substr($username, 1);
342							}
343
344							sub parse_tweet {
345	52			52	1	57541	my ($text, $options) = @_;
346							# merge options
347	52		100			196	$options \|\|= {};
348	52					81	$options->{$_} = DEFAULT_TCO_URL_LENGTHS()->{$_} for keys %{ DEFAULT_TCO_URL_LENGTHS() };
	52					220
349
350	52					3343	my $normalized_text = NFC($text);
351
352	52	100				363	return _empty_parse_results() unless length $normalized_text > 0;
353
354	51		66			194	my $config = $options->{config} \|\| Twitter::Text::Configuration::default_configuration;
355	51					841	my $scale = $config->{scale};
356	51					76	my $max_weighted_tweet_length = $config->{maxWeightedTweetLength};
357	51					87	my $scaled_max_weighted_tweet_length = $max_weighted_tweet_length * $scale;
358	51					84	my $transformed_url_length = $config->{transformedURLLength} * $scale;
359	51					86	my $ranges = $config->{ranges};
360
361	51					119	my $url_entities = extract_urls_with_indices($normalized_text);
362	51	100				247	my $emoji_entities = $config->{emojiParsingEnabled} ? extract_emoji_with_indices($normalized_text) : [];
363
364	51					99	my $has_invalid_chars = 0;
365	51					78	my $weighted_count = 0;
366	51					66	my $offset = 0;
367	51					77	my $display_offset = 0;
368	51					71	my $valid_offset = 0;
369
370	51					120	while ($offset < length $normalized_text) {
371	29340					42287	my $char_weight = $config->{defaultWeight};
372	29340					36403	my $entity_length = 0;
373
374	29340					45850	for my $url_entity (@$url_entities) {
375	26249	100				52246	if ($url_entity->{indices}->[0] == $offset) {
376	37					66	$entity_length = $url_entity->{indices}->[1] - $url_entity->{indices}->[0];
377	37					48	$weighted_count += $transformed_url_length;
378	37					62	$offset += $entity_length;
379	37					44	$display_offset += $entity_length;
380
381	37	100				76	if ($weighted_count <= $scaled_max_weighted_tweet_length) {
382	31					41	$valid_offset += $entity_length;
383							}
384							# Finding a match breaks the loop
385	37					57	last;
386							}
387							}
388
389	29340					40156	for my $emoji_entity (@$emoji_entities) {
390	22967	100				37685	if ($emoji_entity->{indices}->[0] == $offset) {
391	318					424	$entity_length = $emoji_entity->{indices}->[1] - $emoji_entity->{indices}->[0];
392	318					379	$weighted_count += $char_weight; # the default weight
393	318					370	$offset += $entity_length;
394	318					372	$display_offset += $entity_length;
395
396	318	100				487	if ($weighted_count <= $scaled_max_weighted_tweet_length) {
397	298					352	$valid_offset += $entity_length;
398							}
399							# Finding a match breaks the loop
400	318					389	last;
401							}
402							}
403
404	29340	100				47474	next if $entity_length > 0;
405
406	28985	50				49194	if ($offset < length $normalized_text) {
407	28985					47180	my $code_point = substr $normalized_text, $offset, 1;
408
409	28985					41709	for my $range (@$ranges) {
410	32573					60098	my ($chr) = unpack 'U', $code_point;
411	32573					59200	my ($range_start, $range_end) = ($range->{start}, $range->{end});
412
413	32573	100	100			90270	if ($range_start <= $chr && $chr <= $range_end) {
414	26811					35780	$char_weight = $range->{weight};
415	26811					41435	last;
416							}
417							}
418
419	28985					34897	$weighted_count += $char_weight;
420
421	28985	100				58429	$has_invalid_chars = _contains_invalid($code_point) unless $has_invalid_chars;
422	28985					47535	my $codepoint_length = length $code_point;
423	28985					36534	$offset += $codepoint_length;
424	28985					33200	$display_offset += $codepoint_length;
425
426	28985	100	100			97330	if (!$has_invalid_chars && ($weighted_count <= $scaled_max_weighted_tweet_length)) {
427	5190					10354	$valid_offset += $codepoint_length;
428							}
429							}
430							}
431
432	51					289	my $normalized_text_offset = length($text) - length($normalized_text);
433	51					112	my $scaled_weighted_length = $weighted_count / $scale;
434	51		100			173	my $is_valid = !$has_invalid_chars && ($scaled_weighted_length <= $max_weighted_tweet_length);
435	51					116	my $permilage = int($scaled_weighted_length * 1000 / $max_weighted_tweet_length);
436
437							return +{
438	51	100				671	weighted_length => $scaled_weighted_length,
439							valid => $is_valid ? 1 : 0,
440							permillage => $permilage,
441							display_range_start => 0,
442							display_range_end => $display_offset + $normalized_text_offset - 1,
443							valid_range_start => 0,
444							valid_range_end => $valid_offset + $normalized_text_offset - 1,
445							};
446							}
447
448							sub _empty_parse_results {
449							return {
450	1			1		12	weighted_length => 0,
451							valid => 0,
452							permillage => 0,
453							display_range_start => 0,
454							display_range_end => 0,
455							valid_range_start => 0,
456							valid_range_end => 0,
457							};
458							}
459
460							sub _contains_invalid {
461	28981			28981		43318	my ($text) = @_;
462
463	28981	100	66			81081	return 0 if !$text \|\| length $text == 0;
464	28978					146356	return $text =~ qr/[$Twitter::Text::Regexp::INVALID_CHARACTERS]/;
465							}
466
467							1;
468							__END__