File Coverage

blib/lib/Code/CutNPaste.pm

Criterion	Covered	Total	%
statement	233	264	88.2
branch	57	98	58.1
condition	7	18	38.8
subroutine	31	33	93.9
pod	0	3	0.0
total	328	416	78.8

line	stmt	bran	cond	sub	pod	time	code
1							package Code::CutNPaste;
2
3	5			5		28513	use 5.006;
	5					11
	5					156
4
5	5			5		3818	use autodie;
	5					288537
	5					27
6	5			5		37825	use Benchmark qw(timediff timestr);
	5					162423
	5					125
7	5			5		69041	use Try::Tiny;
	5					18141
	5					390
8	5			5		4657	use Capture::Tiny qw(capture);
	5					197736
	5					394
9	5			5		44	use Carp;
	5					6
	5					482
10	5			5		4806	use File::Find::Rule;
	5					47630
	5					45
11	5			5		7445	use File::HomeDir;
	5					31679
	5					329
12	5			5		4367	use File::Slurp;
	5					56558
	5					532
13	5			5		4951	use File::Spec::Functions qw(catfile catdir);
	5					4816
	5					347
14	5			5		6451	use Getopt::Long;
	5					62068
	5					34
15	5			5		8547	use Moo;
	5					126018
	5					40
16	5			5		14847	use Parallel::ForkManager;
	5					34333
	5					183
17	5			5		5929	use Term::ProgressBar;
	5					352123
	5					355
18	5			5		4195	use aliased 'Code::CutNPaste::Duplicate';
	5					3776
	5					27
19	5			5		605	use aliased 'Code::CutNPaste::Duplicate::Item';
	5					10
	5					34
20
21							has 'renamed_vars' => ( is => 'ro' );
22							has 'renamed_subs' => ( is => 'ro' );
23							has 'verbose' => ( is => 'ro' );
24							has 'window' => ( is => 'rwp', default => sub {5} );
25							has 'jobs' => ( is => 'ro', default => sub {1} );
26							has 'show_warnings' => ( is => 'ro' );
27							has 'noutf8' => ( is => 'ro' );
28							has 'threshold' => (
29							is => 'rwp',
30							default => sub {.75},
31							isa => sub {
32	5			5		2079	no warnings 'uninitialized';
	5					14
	5					23178
33							my $threshold = 0 + shift;
34							if ( $threshold < 0 or $threshold > 1 ) {
35							croak("threshold must be between 0 and 1, inclusive");
36							}
37							},
38							);
39							has 'dirs' => (
40							is => 'ro',
41							default => sub {'lib'},
42							coerce => sub {
43							my $dirs = shift;
44							unless ( ref $dirs ) {
45							$dirs = [$dirs];
46							}
47							return $dirs;
48							},
49							isa => sub {
50							my $dirs = shift;
51							for my $dir (@$dirs) {
52							unless ( -d $dir ) {
53							croak("Cannot find directory '$dir'");
54							}
55							}
56							},
57							);
58
59							has 'files' => (
60							is => 'ro',
61							default => sub { [] },
62							isa => sub {
63							my $files = shift;
64							unless ( 'ARRAY' eq ref $files ) {
65							croak("Argument to files must be an array reference of files");
66							}
67							for my $file (@$files) {
68							unless ( -f $file && -r _ ) {
69							croak("File '$file' does not exist or cannot be read");
70							}
71							}
72							},
73							);
74
75							has 'ignore' => (
76							is => 'ro',
77							coerce => sub {
78							my $ignore = shift;
79							return unless defined $ignore;
80							return $ignore if ref $ignore eq 'Regexp';
81							if ( !ref $ignore ) {
82							$ignore = qr/$ignore/;
83							}
84							if ( 'ARRAY' eq ref $ignore ) {
85							return unless @$ignore;
86							$ignore = join '\|' => @$ignore;
87							$ignore = qr/$ignore/;
88							}
89							return $ignore;
90							},
91							isa => sub {
92							return unless defined $_[0];
93							croak("ignore must be a qr/regex/!")
94							unless 'Regexp' eq ref $_[0];
95							},
96							);
97
98							has 'cache_dir' => (
99							is => 'ro',
100							default => sub {
101							my $homedir = File::HomeDir->my_home;
102							return catdir( $homedir, '.cutnpaste' );
103							},
104							);
105
106							has '_duplicates' => (
107							is => 'ro',
108							default => sub { [] },
109							);
110							has '_find_dups_called' => ( is => 'rw' );
111
112							# XXX I don't expect this to be normal, but I have found this when I run this
113							# code against its own codebase due to "subroutine redefined" warnings
114							has '_could_not_deparse' => ( is => 'ro', default => sub { {} } );
115
116							=head1 NAME
117
118							Code::CutNPaste - Find Duplicate Perl Code
119
120							=head1 VERSION
121
122							Version 0.31
123
124							=cut
125
126							our $VERSION = '0.31';
127
128							=head1 SYNOPSIS
129
130							use Code::CutNPaste;
131
132							my $cutnpaste = Code::CutNPaste->new(
133							dirs => [ 'lib', 'path/to/other/lib' ],
134							renamed_vars => 1,
135							renamed_subs => 1,
136							);
137							my $duplicates = $cutnpaste->duplicates;
138
139							foreach my $duplicate (@$duplicates) {
140							my ( $left, $right ) = ( $duplicate->left, $duplicate->right );
141							printf <<'END', $left->file, $left->line, $right->file, $right->line;
142
143							Possible duplicate code found
144							Left: %s line %d
145							Right: %s line %d
146
147							END
148							print $duplicate->report;
149							}
150
151							=cut
152
153							sub BUILD {
154	4			4	0	28	my $self = shift;
155
156	4	50				28	unless ( $self->noutf8 ) {
157	0					0	eval "use utf8::all";
158	0	0				0	warn $@ if $@;
159							}
160
161	4					20	my $cache_dir = $self->cache_dir;
162	4	50				28	$self->_set_window(5) unless defined $self->window;
163	4	50				20	$self->_set_threshold(.75) unless defined $self->threshold;
164
165	4	50				68	if ( -d $cache_dir ) {
166	4					136	my @cached = File::Find::Rule->file->in($cache_dir);
167	4					4120	unlink $_ for @cached;
168							}
169							else {
170	0					0	mkdir $cache_dir;
171							}
172	4					4448	for my $dir ( @{ $self->dirs } ) {
	4					56
173							my @files
174	4					136	= grep { !/^\./ }
	12					5260
175							File::Find::Rule->file->name( '.pm', '.t', '*.pl' )->in($dir);
176
177							# XXX dups and subdirs?
178	4					24	push @{ $self->files } => @files;
	4					120
179							}
180							}
181
182							sub find_dups {
183	4			4	0	3168	my $self = shift;
184
185	4	50				32	printf "Started: %s\n", scalar localtime if $self->verbose;
186	4					36	my $start = Benchmark->new;
187	4					132	$self->_find_dups_called(1);
188	4					16	my $jobs = $self->jobs;
189
190	4		50			48	my $fork = Parallel::ForkManager->new( $jobs \|\| 1 );
191							$fork->run_on_finish(
192							sub {
193	6			6		6794532	my $duplicates = pop @_;
194	6					28	push @{ $self->_duplicates } => @$duplicates;
	6					96
195							}
196	4					2336	);
197
198	4					56	my @pairs = $self->_get_pairs_of_files;
199	4					8	my $progress;
200	4	50				16	$progress = Term::ProgressBar->new(
201							{ count => scalar @pairs,
202							ETA => 'linear',
203							}
204							) if $self->verbose;
205
206	4					8	my $count = 1;
207	4					12	foreach my $pair (@pairs) {
208	9	50				8980	$progress->update( $count++ ) if $self->verbose;
209	9	100				106	my $pid = $fork->start and next;
210
211	3					7633	my $duplicates_found = $self->_search_for_dups(@$pair);
212
213	3					60	$fork->finish( 0, $duplicates_found );
214							}
215	1					1479	$fork->wait_all_children;
216	1	50				65	if ( $self->verbose ) {
217	0					0	printf "Ended: %s\n", scalar localtime;
218	0					0	my $time = timediff( Benchmark->new, $start );
219	0					0	print STDERR "Time: ", timestr($time), "\n";
220							}
221							}
222
223							sub duplicates {
224	1			1	0	760	my $self = shift;
225	1	50				13	$self->find_dups unless $self->_find_dups_called;
226	1					15	return $self->_duplicates;
227							}
228
229							sub _search_for_dups {
230	3			3		166	my ( $self, $first, $second ) = @_;
231	3					139	my $window = $self->window;
232
233	3	50				126	my $code1 = $self->_get_text($first) or return [];
234	3	50				60	my $code2 = $self->_get_text($second) or return [];
235
236	3					59	my %in_second = map { $_->{key} => 1 } @$code2;
	72					246
237
238	3					27	my $matches_found = 0;
239	3					16	my $last_found = 0;
240	3					16	foreach my $i ( 0 .. $#$code1 ) {
241	80	100				259	if ( $in_second{ $code1->[$i]{key} } ) {
242	71	100				141	if ( $i == $last_found + 1 ) {
243	65					538	$matches_found++;
244							}
245	71					115	$last_found = $i;
246							}
247							}
248	3	50				21	if ( $matches_found < $window ) {
249	0					0	return [];
250							}
251
252							# brute force is bad!
253
254	3					9	my @duplicates_found;
255	3					28	LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) {
256	29	100				87	next LINE unless $in_second{ $code1->[$i]{key} };
257
258	20					106	my @code1 = @{$code1}[ $i .. $#$code1 ];
	20					98
259	20					65	foreach my $j ( 0 .. $#$code2 - $window ) {
260	377					13456	my @code2 = @{$code2}[ $j .. $#$code2 ];
	377					1111
261	377					550	my $matches = 0;
262	377					366	my $longest = 0;
263	377					572	WINDOW: foreach my $k ( 0 .. $#code1 ) {
264	516	100				2088	if ( $code1[$k]{key} eq $code2[$k]{key} ) {
265	143					134	$matches++;
266	143					197	my $length1 = length( $code1[$k]{code} );
267	143	100				238	if ( $length1 > $longest ) {
268	63					60	$longest = $length1;
269							}
270	143					194	my $length2 = length( $code2[$k]{code} );
271	143	100				324	if ( $length2 > $longest ) {
272	26					36	$longest = $length2;
273							}
274							}
275							else {
276	373					499	last WINDOW;
277							}
278							}
279
280							# if too many lines don't meet our threshold level, don't report
281							# this block of code
282	377	100				769	if ( $matches >= $window ) {
283	7	100				38	$matches = 0
284							if $self->_match_below_threshold( $matches, \@code1 );
285							}
286	377	100				1001	if ( $matches >= $window ) {
287	3					18	my $line1 = $code1[0]{line};
288	3					10	my $line2 = $code2[0]{line};
289
290	3					9	my ( $left, $right, $report ) = ( '', '', '' );
291	3					11	for ( 0 .. $matches - 1 ) {
292	54					100	$left .= $code1[$_]{code} . "\n";
293	54					100	$right .= $code2[$_]{code} . "\n";
294	108					113	my ( $line1, $line2 )
295	54					94	= map { chomp; $_ }
	108					172
296							( $code1[$_]{code}, $code2[$_]{code} );
297	54					145	$report
298							.= $line1 . ( ' ' x ( $longest - length($line1) ) );
299	54					110	$report .= " \| $line2\n";
300							}
301
302							# Next duplicate report should start after this chunk of code
303	3					11	$i += $matches;
304
305	3					75	my $ignore = $self->ignore;
306	3	50	33			54	if ( $ignore and $report =~ /$ignore/ ) {
307	0					0	next LINE;
308							}
309	3					221	push @duplicates_found => Duplicate->new(
310							left => Item->new(
311							file => $first,
312							line => $line1,
313							code => $left,
314							),
315							right => Item->new(
316							file => $second,
317							line => $line2,
318							code => $right,
319							),
320							report => $report,
321							);
322							}
323							}
324							}
325	3					144	return \@duplicates_found;
326							}
327
328							sub _match_below_threshold {
329	7			7		15	my ( $self, $matches, $code ) = @_;
330
331	7	50				67	return unless $self->threshold;
332
333	7					12	my $total = 0;
334	7					18	for ( 0 .. $matches - 1 ) {
335	91	100				280	$total++ if $code->[$_]{code} =~ /\w/;
336							}
337	7					72	return $self->threshold > $total / $matches;
338							}
339
340							sub _get_text {
341	6			6		19	my ( $self, $file ) = @_;
342
343	6					61	my $filename = $file;
344	6					371	$filename =~ s/\W/_/g;
345	6					149	$filename = catfile( $self->cache_dir, $filename );
346
347	6					38	my $filename_munged = $filename . ".munged";
348	6					12	my ( @contents, @munged );
349	6	100				467	if ( -f $filename ) {
350	3					113	@contents = split /(\n)/ => read_file($filename);
351
352							# sometimes another fork has already written the $filename, but not
353							# yet written the $filename_munged, so we will wait up to three
354							# seconds for it before trying to read it.
355							# A better ordering of the @pairs might help?
356	3					1273	my $retry = 1;
357	3					111	while ( !-f $filename_munged ) {
358	0					0	sleep 1;
359	0	0				0	last if $retry++ > 3;
360							}
361	3					29	@munged = split /(\n)/ => read_file($filename_munged);
362							}
363							else {
364	3					22	my $stderr;
365							try {
366	3					580937	( undef, $stderr, @contents )
367	3			3		7113	= capture {qx($^X -Ilib -MO=CutNPaste $file)};
368							} catch {
369	0			0		0	warn "Problem when capturing $^X -Ilib -MO=CutNPaste $file: $_";
370	3					222	};
371	3	50				5439	return undef if !@contents; #properly return, so we can avoid undef value as an array ref error
372	3	50				56	undef $stderr if $stderr =~ /syntax OK/;
373	3	50	33			22	if ( $stderr and !$self->_could_not_deparse->{$file} ) {
374	0	0				0	warn "Problem when parsing $file: $stderr"
375							if $self->show_warnings;
376							}
377	3					8	undef $stderr;
378	3					92	write_file( $filename, @contents );
379
380	3		50			1169	local $ENV{RENAME_VARS} = $self->renamed_vars \|\| 0;
381	3		50			79	local $ENV{RENAME_SUBS} = $self->renamed_subs \|\| 0;
382							try {
383	3					515414	( undef, $stderr, @munged )
384	3			3		438	= capture {qx($^X -Ilib -MO=CutNPaste $file)};
385							} catch {
386	0			0		0	warn "Problem when capturing $^X -Ilib -MO=CutNPaste $file: $_";
387	3					102	};
388	3	50				4411	return undef if !@munged;
389	3	50				60	undef $stderr if $stderr =~ /syntax OK/;
390	3	50	33			44	if ( $stderr and !$self->_could_not_deparse->{$file} ) {
391	0	0				0	warn "\nProblem when parsing $file: $stderr"
392							if $self->show_warnings;
393							}
394	3					61	write_file( $filename_munged, @munged );
395							}
396	6					1636	return $self->_add_line_numbers( $file, \@contents, \@munged );
397							}
398
399							sub _add_line_numbers {
400	6			6		35	my $self = shift;
401	6					38	my $file = shift;
402	6	50				164	return if $self->_could_not_deparse->{$file};
403	6					56	my $contents = $self->_prefilter(shift);
404	6					211	my $munged = $self->_prefilter(shift);
405
406	6	50				69	if ( @$contents != @$munged ) {
407	0					0	warn <<"END";
408
409							There was a problem parsing $file. It will be skipped.
410							Try rerunning with show_warnings => 1.
411
412							END
413	0					0	$self->_could_not_deparse->{$file} = 1;
414	0					0	return;
415							}
416	6					14	my @contents;
417
418	6					14	my $line_num = 1;
419	6					83	foreach my $i ( 0 .. $#$contents ) {
420	264					477	my ( $line, $munged_line ) = ( $contents->[$i], $munged->[$i] );
421	264					480	chomp $line;
422	264					300	chomp $munged_line;
423
424	264	100				955	if ( $line =~ /^#line\s+([0-9]+)/ ) {
425	112					269	$line_num = $1;
426	112					177	next;
427							}
428	152					393	push @contents => {
429							line => $line_num,
430							key => $self->_make_key($munged_line),
431							code => $line,
432							};
433	152					514	$line_num++;
434							}
435	6					69	return $self->_postfilter( \@contents );
436							}
437
438							sub _prefilter {
439	12			12		248	my ( $self, $contents ) = @_;
440	12					25	my @contents;
441	12					106	my %skip = (
442							sub_begin => 0,
443							);
444	12					23	my $skip = 0;
445
446	12					45	LINE: for ( my $i = 0; $i < @$contents; $i++ ) {
447	834					1894	local $_ = $contents->[$i];
448	834	50				2190	next if /^\s*(?:use\|require)\b/; # use/require
449	834	100				3130	next if /^\s*$/; # blank lines
450	528	50				1654	next if /^#(?!line\s+[0-9]+)/; # comments which aren't line directives
451
452							# Modules which import things create code like this:
453							#
454							# sub BEGIN {
455							# require strict;
456							# do {
457							# 'strict'->import('refs')
458							# };
459							# }
460							#
461							# $skip{sub_begin} filters this out
462
463	528	50	33			2756	if (/^sub BEGIN \{/) {
		50
464	0					0	$skip{sub_begin} = 1;
465	0					0	$skip++;
466							}
467							elsif ( $skip{sub_begin} and /^}/ ) {
468	0					0	$skip{sub_begin} = 0;
469	0					0	$skip--;
470	0					0	next;
471							}
472
473	528	50				2149	push @contents => $_ unless $skip;
474							}
475	12					601	return \@contents;
476							}
477
478							sub _postfilter {
479	6			6		16	my ( $self, $contents ) = @_;
480
481	6					13	my @contents;
482	6					32	INDEX: for ( my $i = 0; $i < @$contents; $i++ ) {
483	152	50				695	if ( $contents->[$i]{code} =~ /^(\s)BEGIN\s\{/ ) { # BmEGIN {
484	0					0	my $padding = $1;
485	0	0				0	if ( $contents->[ $i + 1 ]{code} =~ /^$padding}/ ) {
486	0					0	$i++;
487	0					0	next INDEX;
488							}
489							}
490	152					371	push @contents => $contents->[$i];
491							}
492	6					155	return \@contents;
493							}
494
495							sub _make_key {
496	152			152		233	my $self = shift;
497	152					242	local $_ = shift;
498	152					183	chomp;
499	152					1222	s/\s//g;
500	152					913	return $_;
501							}
502
503							sub _get_pairs_of_files {
504	4			4		12	my $self = shift;
505	4					16	my $files = $self->files;
506	4					8	my $num_files = @$files;
507	4					24	my $jobs = $self->jobs;
508
509	4					8	my @pairs;
510	4					16	for my $i ( 0 .. $#$files - 1 ) {
511	8					12	my $next = $i + 1;
512	8					16	for my $j ( $next .. $#$files ) {
513	12					24	push @pairs => [ @{$files}[ $i, $j ] ];
	12					44
514							}
515							}
516
517	4					12	my @left_right;
518	4	50				12	if ( $jobs > 1 ) {
519	0					0	my $files_per_job = int( $num_files / $jobs );
520	0					0	for ( 1 .. $jobs ) {
521	0	0				0	if ( $_ < $jobs ) {
522	0					0	push @left_right => splice @pairs, 0, $files_per_job;
523							}
524							else {
525	0					0	push @left_right => @pairs;
526							}
527							}
528							}
529							else {
530	4					12	@left_right = @pairs;
531							}
532	4					16	return @left_right;
533							}
534
535							1;
536							__END__