File Coverage

blib/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm

Criterion	Covered	Total	%
statement	45	45	100.0
branch	14	16	87.5
condition			n/a
subroutine	14	14	100.0
pod	4	5	80.0
total	77	80	96.2

line	stmt	bran	sub	pod	time	code
1						package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes;
2
3	40		40		27950	use 5.010001;
	40				196
4	40		40		278	use strict;
	40				137
	40				1008
5	40		40		245	use warnings;
	40				128
	40				1044
6
7	40		40		258	use Carp;
	40				111
	40				2513
8	40		40		301	use English qw(-no_match_vars);
	40				135
	40				282
9	40		40		14728	use List::Util qw{ min };
	40				123
	40				2739
10	40		40		304	use Readonly;
	40				144
	40				2066
11
12	40		40		400	use Perl::Critic::Utils qw{ :booleans :severities };
	40				127
	40				2005
13
14	40		40		6582	use parent 'Perl::Critic::Policy';
	40				148
	40				326
15
16						our $VERSION = '1.148';
17
18						#-----------------------------------------------------------------------------
19
20						Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks};
21						Readonly::Scalar my $EXPL => [261];
22
23						Readonly::Scalar my $MAX_LITERAL_LENGTH => 7;
24						Readonly::Scalar my $MAX_VARIABLE_LENGTH => 4;
25
26						#-----------------------------------------------------------------------------
27
28						sub supported_parameters {
29						return (
30						{
31	97		97	0	2151	name => 'max_characters',
32						description =>
33						'The maximum number of characters to allow in a regular expression.',
34						default_string => '60',
35						behavior => 'integer',
36						integer_minimum => 1,
37						},
38						);
39						}
40
41	80		80	1	408	sub default_severity { return $SEVERITY_MEDIUM }
42	86		86	1	386	sub default_themes { return qw( core pbp maintenance ) }
43	37		37	1	163	sub applies_to { return qw(PPI::Token::Regexp::Match
44						PPI::Token::Regexp::Substitute
45						PPI::Token::QuoteLike::Regexp) }
46
47						#-----------------------------------------------------------------------------
48
49						sub violates {
50	24		24	1	69	my ( $self, $elem, $document ) = @_;
51
52						# Optimization: if its short enough now, parsing won't make it longer
53	24	100			100	return if $self->{_max_characters} >= length $elem->get_match_string();
54
55	20	50			450	my $re = $document->ppix_regexp_from_element( $elem )
56						or return; # Abort on syntax error.
57	20	100			217143	$re->failures()
58						and return; # Abort if parse errors found.
59	19	50			157	my $qr = $re->regular_expression()
60						or return; # Abort if no regular expression.
61
62	19				428	my $length = 0;
63						# We use map { $_->tokens() } qr->children() rather than just
64						# $qr->tokens() because we are not interested in the delimiters.
65	19				62	foreach my $token ( map { $_->tokens() } $qr->children() ) {
	392				1839
66
67						# Do not count whitespace or comments
68	392	100			2229	$token->significant() or next;
69
70	363	100			1892	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' ) ) {
		100
71
72						# Do not penalize long variable names
73	12				47	$length += min( $MAX_VARIABLE_LENGTH, length $token->content() );
74
75						} elsif ( $token->isa( 'PPIx::Regexp::Token::Literal' ) ) {
76
77						# Do not penalize long literals like \p{...}
78	350				699	$length += min( $MAX_LITERAL_LENGTH, length $token->content() );
79
80						} else {
81
82						# Take everything else at face value
83	1				10	$length += length $token->content();
84
85						}
86
87						}
88
89	19	100			198	return if $self->{_max_characters} >= $length;
90
91	6				44	return $self->violation( $DESC, $EXPL, $elem );
92						}
93
94						1;
95
96						__END__
97
98						#-----------------------------------------------------------------------------
99
100						=pod
101
102						=for stopwords BNF Tatsuhiko Miyagawa
103
104						=head1 NAME
105
106						Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks.
107
108
109						=head1 AFFILIATION
110
111						This Policy is part of the core L<Perl::Critic\|Perl::Critic>
112						distribution.
113
114
115						=head1 DESCRIPTION
116
117						Big regexps are hard to read, perhaps even the hardest part of Perl.
118						A good practice to write digestible chunks of regexp and put them
119						together. This policy flags any regexp that is longer than C<N>
120						characters, where C<N> is a configurable value that defaults to 60.
121						If the regexp uses the C<x> flag, then the length is computed after
122						parsing out any comments or whitespace.
123
124						Unfortunately the use of descriptive (and therefore longish) variable
125						names can cause regexps to be in violation of this policy, so
126						interpolated variables are counted as 4 characters no matter how long
127						their names actually are.
128
129
130						=head1 CASE STUDY
131
132						As an example, look at the regexp used to match email addresses in
133						L<Email::Valid::Loose\|Email::Valid::Loose> (tweaked lightly to wrap
134						for POD)
135
136						(?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]
137						\000-\037\x80-\xff])\|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015
138						"])")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[
139						\]\000-\037\x80-\xff])\|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n
140						\015"])")\|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,
141						;:".\\\[\]\000-\037\x80-\xff])\|\[(?:[^\\\x80-\xff\n\015\[\]]\|\\[^\x80-\xff])*\]
142						)(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
143						-\037\x80-\xff])\|\[(?:[^\\\x80-\xff\n\015\[\]]\|\\[^\x80-\xff])\])))
144
145						which is constructed from the following code:
146
147						my $esc = '\\\\';
148						my $period = '\.';
149						my $space = '\040';
150						my $open_br = '\[';
151						my $close_br = '\]';
152						my $nonASCII = '\x80-\xff';
153						my $ctrl = '\000-\037';
154						my $cr_list = '\n\015';
155						my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # "
156						my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
157						my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
158						my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# "
159						my $atom = qq<$atom_char+(?!$atom_char)>;
160						my $quoted_str = qq<\"$qtext(?:$quoted_pair$qtext)*\">; # "
161						my $word = qq<(?:$atom\|$quoted_str)>;
162						my $domain_ref = $atom;
163						my $domain_lit = qq<$open_br(?:$dtext\|$quoted_pair)*$close_br>;
164						my $sub_domain = qq<(?:$domain_ref\|$domain_lit)>;
165						my $domain = qq<$sub_domain(?:$period$sub_domain)*>;
166						my $local_part = qq<$word(?:$word\|$period)*>; # This part is modified
167						$Addr_spec_re = qr<$local_part\@$domain>;
168
169						If you read the code from bottom to top, it is quite readable. And,
170						you can even see the one violation of RFC822 that Tatsuhiko Miyagawa
171						deliberately put into Email::Valid::Loose to allow periods. Look for
172						the C<\|\.> in the upper regexp to see that same deviation.
173
174						One could certainly argue that the top regexp could be re-written more
175						legibly with C<m//x> and comments. But the bottom version is
176						self-documenting and, for example, doesn't repeat C<\x80-\xff> 18
177						times. Furthermore, it's much easier to compare the second version
178						against the source BNF grammar in RFC 822 to judge whether the
179						implementation is sound even before running tests.
180
181
182						=head1 CONFIGURATION
183
184						This policy allows regexps up to C<N> characters long, where C<N>
185						defaults to 60. You can override this to set it to a different number
186						with the C<max_characters> setting. To do this, put entries in a
187						F<.perlcriticrc> file like this:
188
189						[RegularExpressions::ProhibitComplexRegexes]
190						max_characters = 40
191
192
193						=head1 CREDITS
194
195						Initial development of this policy was supported by a grant from the
196						Perl Foundation.
197
198
199						=head1 AUTHOR
200
201						Chris Dolan <cdolan@cpan.org>
202
203
204						=head1 COPYRIGHT
205
206						Copyright (c) 2007-2011 Chris Dolan. Many rights reserved.
207
208						This program is free software; you can redistribute it and/or modify
209						it under the same terms as Perl itself. The full text of this license
210						can be found in the LICENSE file included with this module
211
212						=cut
213
214						# Local Variables:
215						# mode: cperl
216						# cperl-indent-level: 4
217						# fill-column: 78
218						# indent-tabs-mode: nil
219						# c-indentation-style: bsd
220						# End:
221						# ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :