line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes; |
2
|
|
|
|
|
|
|
|
3
|
40
|
|
|
40
|
|
27950
|
use 5.010001; |
|
40
|
|
|
|
|
196
|
|
4
|
40
|
|
|
40
|
|
278
|
use strict; |
|
40
|
|
|
|
|
137
|
|
|
40
|
|
|
|
|
1008
|
|
5
|
40
|
|
|
40
|
|
245
|
use warnings; |
|
40
|
|
|
|
|
128
|
|
|
40
|
|
|
|
|
1044
|
|
6
|
|
|
|
|
|
|
|
7
|
40
|
|
|
40
|
|
258
|
use Carp; |
|
40
|
|
|
|
|
111
|
|
|
40
|
|
|
|
|
2513
|
|
8
|
40
|
|
|
40
|
|
301
|
use English qw(-no_match_vars); |
|
40
|
|
|
|
|
135
|
|
|
40
|
|
|
|
|
282
|
|
9
|
40
|
|
|
40
|
|
14728
|
use List::Util qw{ min }; |
|
40
|
|
|
|
|
123
|
|
|
40
|
|
|
|
|
2739
|
|
10
|
40
|
|
|
40
|
|
304
|
use Readonly; |
|
40
|
|
|
|
|
144
|
|
|
40
|
|
|
|
|
2066
|
|
11
|
|
|
|
|
|
|
|
12
|
40
|
|
|
40
|
|
400
|
use Perl::Critic::Utils qw{ :booleans :severities }; |
|
40
|
|
|
|
|
127
|
|
|
40
|
|
|
|
|
2005
|
|
13
|
|
|
|
|
|
|
|
14
|
40
|
|
|
40
|
|
6582
|
use parent 'Perl::Critic::Policy'; |
|
40
|
|
|
|
|
148
|
|
|
40
|
|
|
|
|
326
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '1.148'; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
#----------------------------------------------------------------------------- |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks}; |
21
|
|
|
|
|
|
|
Readonly::Scalar my $EXPL => [261]; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
Readonly::Scalar my $MAX_LITERAL_LENGTH => 7; |
24
|
|
|
|
|
|
|
Readonly::Scalar my $MAX_VARIABLE_LENGTH => 4; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
#----------------------------------------------------------------------------- |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub supported_parameters { |
29
|
|
|
|
|
|
|
return ( |
30
|
|
|
|
|
|
|
{ |
31
|
97
|
|
|
97
|
0
|
2151
|
name => 'max_characters', |
32
|
|
|
|
|
|
|
description => |
33
|
|
|
|
|
|
|
'The maximum number of characters to allow in a regular expression.', |
34
|
|
|
|
|
|
|
default_string => '60', |
35
|
|
|
|
|
|
|
behavior => 'integer', |
36
|
|
|
|
|
|
|
integer_minimum => 1, |
37
|
|
|
|
|
|
|
}, |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
80
|
|
|
80
|
1
|
408
|
sub default_severity { return $SEVERITY_MEDIUM } |
42
|
86
|
|
|
86
|
1
|
386
|
sub default_themes { return qw( core pbp maintenance ) } |
43
|
37
|
|
|
37
|
1
|
163
|
sub applies_to { return qw(PPI::Token::Regexp::Match |
44
|
|
|
|
|
|
|
PPI::Token::Regexp::Substitute |
45
|
|
|
|
|
|
|
PPI::Token::QuoteLike::Regexp) } |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
#----------------------------------------------------------------------------- |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub violates { |
50
|
24
|
|
|
24
|
1
|
69
|
my ( $self, $elem, $document ) = @_; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Optimization: if it's short enough now, parsing won't make it longer |
53
|
24
|
100
|
|
|
|
100
|
return if $self->{_max_characters} >= length $elem->get_match_string(); |
54
|
|
|
|
|
|
|
|
55
|
20
|
50
|
|
|
|
450
|
my $re = $document->ppix_regexp_from_element( $elem ) |
56
|
|
|
|
|
|
|
or return; # Abort on syntax error. |
57
|
20
|
100
|
|
|
|
217143
|
$re->failures() |
58
|
|
|
|
|
|
|
and return; # Abort if parse errors found. |
59
|
19
|
50
|
|
|
|
157
|
my $qr = $re->regular_expression() |
60
|
|
|
|
|
|
|
or return; # Abort if no regular expression. |
61
|
|
|
|
|
|
|
|
62
|
19
|
|
|
|
|
428
|
my $length = 0; |
63
|
|
|
|
|
|
|
# We use map { $_->tokens() } $qr->children() rather than just |
64
|
|
|
|
|
|
|
# $qr->tokens() because we are not interested in the delimiters. |
65
|
19
|
|
|
|
|
62
|
foreach my $token ( map { $_->tokens() } $qr->children() ) { |
|
392
|
|
|
|
|
1839
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Do not count whitespace or comments |
68
|
392
|
100
|
|
|
|
2229
|
$token->significant() or next; |
69
|
|
|
|
|
|
|
|
70
|
363
|
100
|
|
|
|
1892
|
if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' ) ) { |
|
|
100
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Do not penalize long variable names |
73
|
12
|
|
|
|
|
47
|
$length += min( $MAX_VARIABLE_LENGTH, length $token->content() ); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
} elsif ( $token->isa( 'PPIx::Regexp::Token::Literal' ) ) { |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Do not penalize long literals like \p{...} |
78
|
350
|
|
|
|
|
699
|
$length += min( $MAX_LITERAL_LENGTH, length $token->content() ); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
} else { |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Take everything else at face value |
83
|
1
|
|
|
|
|
10
|
$length += length $token->content(); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
19
|
100
|
|
|
|
198
|
return if $self->{_max_characters} >= $length; |
90
|
|
|
|
|
|
|
|
91
|
6
|
|
|
|
|
44
|
return $self->violation( $DESC, $EXPL, $elem ); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
1; |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
__END__ |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
#----------------------------------------------------------------------------- |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=pod |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=for stopwords BNF Tatsuhiko Miyagawa |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head1 NAME |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 AFFILIATION |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
This Policy is part of the core L<Perl::Critic|Perl::Critic> |
112
|
|
|
|
|
|
|
distribution. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head1 DESCRIPTION |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Big regexps are hard to read, perhaps even the hardest part of Perl. |
118
|
|
|
|
|
|
|
A good practice is to write digestible chunks of regexp and put them |
119
|
|
|
|
|
|
|
together. This policy flags any regexp that is longer than C<N> |
120
|
|
|
|
|
|
|
characters, where C<N> is a configurable value that defaults to 60. |
121
|
|
|
|
|
|
|
If the regexp uses the C<x> flag, then the length is computed after |
122
|
|
|
|
|
|
|
parsing out any comments or whitespace. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Unfortunately the use of descriptive (and therefore longish) variable |
125
|
|
|
|
|
|
|
names can cause regexps to be in violation of this policy, so |
126
|
|
|
|
|
|
|
interpolated variables are counted as 4 characters no matter how long |
127
|
|
|
|
|
|
|
their names actually are. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 CASE STUDY |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
As an example, look at the regexp used to match email addresses in |
133
|
|
|
|
|
|
|
L<Email::Valid::Loose|Email::Valid::Loose> (tweaked lightly to wrap |
134
|
|
|
|
|
|
|
for POD) |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
(?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\] |
137
|
|
|
|
|
|
|
\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015 |
138
|
|
|
|
|
|
|
"]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[ |
139
|
|
|
|
|
|
|
\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n |
140
|
|
|
|
|
|
|
\015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@, |
141
|
|
|
|
|
|
|
;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\] |
142
|
|
|
|
|
|
|
)(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000 |
143
|
|
|
|
|
|
|
-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*) |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
which is constructed from the following code: |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
my $esc = '\\\\'; |
148
|
|
|
|
|
|
|
my $period = '\.'; |
149
|
|
|
|
|
|
|
my $space = '\040'; |
150
|
|
|
|
|
|
|
my $open_br = '\['; |
151
|
|
|
|
|
|
|
my $close_br = '\]'; |
152
|
|
|
|
|
|
|
my $nonASCII = '\x80-\xff'; |
153
|
|
|
|
|
|
|
my $ctrl = '\000-\037'; |
154
|
|
|
|
|
|
|
my $cr_list = '\n\015'; |
155
|
|
|
|
|
|
|
my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; # " |
156
|
|
|
|
|
|
|
my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/; |
157
|
|
|
|
|
|
|
my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>; |
158
|
|
|
|
|
|
|
my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# " |
159
|
|
|
|
|
|
|
my $atom = qq<$atom_char+(?!$atom_char)>; |
160
|
|
|
|
|
|
|
my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # " |
161
|
|
|
|
|
|
|
my $word = qq<(?:$atom|$quoted_str)>; |
162
|
|
|
|
|
|
|
my $domain_ref = $atom; |
163
|
|
|
|
|
|
|
my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>; |
164
|
|
|
|
|
|
|
my $sub_domain = qq<(?:$domain_ref|$domain_lit)>; |
165
|
|
|
|
|
|
|
my $domain = qq<$sub_domain(?:$period$sub_domain)*>; |
166
|
|
|
|
|
|
|
my $local_part = qq<$word(?:$word|$period)*>; # This part is modified |
167
|
|
|
|
|
|
|
$Addr_spec_re = qr<$local_part\@$domain>; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
If you read the code from bottom to top, it is quite readable. And, |
170
|
|
|
|
|
|
|
you can even see the one violation of RFC822 that Tatsuhiko Miyagawa |
171
|
|
|
|
|
|
|
deliberately put into Email::Valid::Loose to allow periods. Look for |
172
|
|
|
|
|
|
|
the C<|\.> in the upper regexp to see that same deviation. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
One could certainly argue that the top regexp could be re-written more |
175
|
|
|
|
|
|
|
legibly with C<m//x> and comments. But the bottom version is |
176
|
|
|
|
|
|
|
self-documenting and, for example, doesn't repeat C<\x80-\xff> 18 |
177
|
|
|
|
|
|
|
times. Furthermore, it's much easier to compare the second version |
178
|
|
|
|
|
|
|
against the source BNF grammar in RFC 822 to judge whether the |
179
|
|
|
|
|
|
|
implementation is sound even before running tests. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head1 CONFIGURATION |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
This policy allows regexps up to C<N> characters long, where C<N> |
185
|
|
|
|
|
|
|
defaults to 60. You can override this to set it to a different number |
186
|
|
|
|
|
|
|
with the C<max_characters> setting. To do this, put entries in a |
187
|
|
|
|
|
|
|
F<.perlcriticrc> file like this: |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
[RegularExpressions::ProhibitComplexRegexes] |
190
|
|
|
|
|
|
|
max_characters = 40 |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 CREDITS |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Initial development of this policy was supported by a grant from the |
196
|
|
|
|
|
|
|
Perl Foundation. |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head1 AUTHOR |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Chris Dolan <cdolan@cpan.org> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head1 COPYRIGHT |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
Copyright (c) 2007-2011 Chris Dolan. Many rights reserved. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
209
|
|
|
|
|
|
|
it under the same terms as Perl itself. The full text of this license |
210
|
|
|
|
|
|
|
can be found in the LICENSE file included with this module. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=cut |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Local Variables: |
215
|
|
|
|
|
|
|
# mode: cperl |
216
|
|
|
|
|
|
|
# cperl-indent-level: 4 |
217
|
|
|
|
|
|
|
# fill-column: 78 |
218
|
|
|
|
|
|
|
# indent-tabs-mode: nil |
219
|
|
|
|
|
|
|
# c-indentation-style: bsd |
220
|
|
|
|
|
|
|
# End: |
221
|
|
|
|
|
|
|
# ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround : |