File Coverage

blib/lib/Code/DRY.pm

Criterion	Covered	Total	%
statement	175	194	90.2
branch	61	78	78.2
condition	34	51	66.6
subroutine	17	21	80.9
pod	13	13	100.0
total	300	357	84.0

line	stmt	bran	cond	sub	pod	time	code
1							package Code::DRY;
2
3	4			4		108598	use 5.008000;
	4					18
	4					196
4	4			4		26	use strict;
	4					289
	4					196
5	4			4		33	use warnings;
	4					160
	4					152
6	4			4		6695	use integer;
	4					267
	4					27
7
8							require Exporter;
9
10							our @ISA = qw(Exporter);
11
12							our %EXPORT_TAGS = (
13							'all' => [
14							qw(
15
16							)
17							]
18							);
19
20							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
21
22							our @EXPORT = qw(
23
24							);
25
26							our $VERSION = '0.03';
27
28							require XSLoader;
29							XSLoader::load( 'Code::DRY', $VERSION );
30
31							# Preloaded methods go here.
32
33							my @files;
34							our ( @fileoffsets, @file_lineoffsets );
35
36							my $codetotal;
37							my $verbose = 0;
38
39							# reporting
40							# $minlength is the filter criterion for duplicates to report
41							# $units is currently either 'bytes' or 'lines'.
42							# $rDups point to an array with an entry for duplication
43							# The entry is an array with the following entries
44							# [
45							# 0: filename,
46							# 1: offset of start of file,
47							# 2: line number of start,
48							# 3: line number of end,
49							# 4: offset of line start,
50							# 5: offset of line end,
51							# 6: offset start,
52							# 7: offset end
53							# ]
54							my $default_callback = sub {
55							my ( $minlength, $units, $rDups ) = @_;
56
57							# show dupes
58							my $copies = scalar @{$rDups} - 1;
59							my $myamountlines = $rDups->[0]->[3] - $rDups->[0]->[2] + 1;
60							my $myamountbytesclipped = $rDups->[0]->[5] - $rDups->[0]->[4] + 1;
61							my $myamountbytes = $rDups->[0]->[7] - $rDups->[0]->[6] + 1;
62							my $lengthstring
63							= $units eq 'bytes'
64							? "$myamountbytes (>= $minlength $units) and $myamountlines complete lines"
65							: "$myamountlines (>= $minlength $units) and $myamountbytesclipped bytes reduced to complete lines";
66							print "$copies duplicate(s) found with a length of $lengthstring:\n";
67
68							my $cnt = 1;
69							for my $dup ( @{$rDups} ) {
70							print "$cnt. File: $dup->[0] in lines $dup->[2]..$dup->[3] (offsets ",
71							$dup->[4] - $dup->[1], "..", $dup->[5] - $dup->[1], ")\n";
72							++$cnt;
73							}
74
75							$cnt = 1;
76							for my $dup ( @{$rDups} ) {
77							if (0) {
78							print
79							"offsets: clipped $dup->[4]--$dup->[5], raw $dup->[6]--$dup->[7]\n";
80							print "lineends at: ";
81							for my $j ( $dup->[2] - 1 .. $dup->[3] ) {
82							print $dup->[1]
83							+ $file_lineoffsets[ offset2fileindex( $dup->[6] ) ]
84							->[ $j - 1 ], ' ';
85							}
86							}
87							print "=================\n";
88
89							my $offsetLineEnd;
90							if ( $units eq 'bytes' ) {
91
92							# begin at start of line
93							my $linenumber = offset2line( $dup->[6] );
94							my $fileindex = offset2fileindex( $dup->[6] );
95							my $file_lineoffset = $file_lineoffsets[$fileindex];
96							my $offsetLineBegin
97							= $linenumber <= 1
98							? $dup->[1]
99							: $dup->[1] + $file_lineoffset->[ $linenumber - 2 ] + 1;
100							$offsetLineEnd
101							= $dup->[1]
102							+ $file_lineoffset->[ $dup->[3]
103							+ ( $dup->[5] == $dup->[7] ? 0 : 1 ) ];
104
105							if ( $offsetLineBegin > $dup->[6]
106							\|\| $dup->[6] > $dup->[7]
107							\|\| $dup->[7] > $offsetLineEnd )
108							{
109							warn
110							"\n\ninternal error: $offsetLineBegin, $dup->[4], $dup->[5], $offsetLineEnd, Zeilennummer $linenumber";
111							}
112
113							print substr( $codetotal, $offsetLineBegin,
114							$dup->[6] - $offsetLineBegin );
115							print " ==>>" if ( $units eq 'bytes' );
116							}
117
118							print substr( $codetotal, $dup->[4], $dup->[5] - $dup->[4] + 1 );
119							print "<<== " if ( $units eq 'bytes' );
120
121							if ( $units eq 'bytes' ) {
122							print substr(
123							$codetotal,
124							$dup->[7] + 1,
125							$offsetLineEnd - $dup->[7]
126							) if ( $dup->[7] + 1 < $offsetLineEnd );
127							}
128
129							# end at end of line
130							print "\n=================\n";
131							++$cnt;
132							last; # makes not much sense to repeat identical parts
133							}
134							};
135
136							my $callback = $default_callback;
137
138							sub set_default_reporter {
139	0			0	1	0	$callback = $default_callback;
140							}
141
142							sub set_reporter {
143	0			0	1	0	($callback) = @_;
144							}
145
146							sub clearData {
147	1			1	1	3056	@files = undef;
148	1					3	$codetotal = undef;
149	1					3	@fileoffsets = ();
150	1					8	@file_lineoffsets = ();
151
152							# $SA = undef;
153							# $LCP = undef;
154	1					6	__free_all();
155							}
156
157							sub report_dupes {
158	39			39	1	57	my ( $minlength, $dups, $length, $matchentry ) = @_;
159
160	39					40	my $units;
161	39	100				63	if ( $minlength >= 0 ) {
162	38					51	$units = 'lines';
163							}
164							else {
165	1					2	$minlength = abs($minlength);
166	1					3	$units = 'bytes';
167							}
168
169							# get position info
170							# and report via callback
171	39					42	my @dups;
172	39					86	for my $entry ( $matchentry .. $matchentry + $dups - 1 ) {
173	83					204	my $offset_start = get_offset_at($entry);
174	83					109	my $offset_end = $offset_start + $length - 1;
175	83					125	my $file_index = offset2fileindex($offset_start);
176	83	100				172	my $file_start
177							= $file_index == 0 ? 0 : $fileoffsets[ $file_index - 1 ] + 1;
178
179							#print "$offset_start -> $offset_end => length ", $offset_end - $offset_start + 1, "\n";
180	83					88	my ( $upLine, $downLine );
181	83					158	offsetAndFileindex2line( $offset_start, $file_index, \$upLine );
182	83					165	offsetAndFileindex2line( $offset_end, $file_index, undef,
183							\$downLine );
184
185							# in line mode clip to line start and line end
186	83					90	my ( $offset_start_clipped, $offset_end_clipped );
187	83	100				168	if ( $units eq 'lines' ) {
188	77	100				148	$offset_start_clipped
189							= $file_start
190							+ ( $upLine < 2
191							? 0
192							: $file_lineoffsets[$file_index]->[ $upLine - 2 ] + 1 );
193	77					106	$offset_end_clipped = $file_start
194							+ $file_lineoffsets[$file_index]->[ $downLine - 1 ];
195							}
196							else {
197	6					7	$offset_start_clipped = $offset_start;
198	6					10	$offset_end_clipped = $offset_end;
199							}
200	83					183	push @dups,
201							[
202							offset2filename($offset_start), $file_start,
203							$upLine, $downLine,
204							$offset_start_clipped, $offset_end_clipped,
205							$offset_start, $offset_end
206							];
207							}
208
209							# sort by offset
210	83					164	@dups = map { $_->[1] }
	49					115
211	39					82	sort { $a->[0] <=> $b->[0] } map { [ $_->[2], $_ ] } @dups;
	83					251
212
213							#print "\n";
214
215	39					122	$callback->( $minlength, $units, \@dups );
216							}
217
218							# position to file and line number mapping
219							sub offset2filename {
220	83			83	1	92	my $offset = shift;
221	83					118	my $fi = offset2fileindex($offset);
222	83	50				155	if ( !defined $fi ) {
223	0					0	return;
224							}
225
226							# support memory files
227	83	100				176	if ( 'SCALAR' eq ref $files[$fi] ) {
228	6					35	return "memfile$fi";
229							}
230							else {
231	77					373	return $files[$fi];
232							}
233							}
234
235							# file index is 0 based
236							sub offset2fileindex {
237	6851			6851	1	163388	my $offset = shift;
238	6851					10490	my ( $l, $r ) = ( 0, $#fileoffsets );
239	6851	50				14334	return 0 if ( 0 == $r );
240
241	6851					8626	my $file = int( ( $r + $l ) / 2 );
242	6851					13089	while ( $l < $r ) {
243
244							#print "m=$file, l=$l, r=$r, fileoffset=$fileoffsets[$file] >= $offset?\n";
245
246	13791	100	100			112986	return $file
			100
247							if (
248							( ( $file > 0 && $fileoffsets[ $file - 1 ] < $offset )
249							\|\| $file == 0
250							)
251							&& $offset <= $fileoffsets[$file]
252							);
253
254	6943	100	66			24429	if ( $file > 0 && $fileoffsets[ $file - 1 ] >= $offset ) {
255	1632					2248	$r = $file;
256	1632					3356	$file = int( ( $r + $l ) / 2 );
257							}
258							else {
259	5311					5731	$l = $file;
260	5311					12571	$file = int( ( $r + $l + 1 ) / 2 );
261							}
262							}
263
264	3					12	return undef;
265							}
266
267							#line number is 1 based
268							sub offsetAndFileindex2line {
269	6699			6699	1	9240	my ( $offset, $fileindex, $rRoundedUp_line, $rRoundedDown_line ) = @_;
270	6699	100				12130	return if ( !defined $fileindex );
271
272	6697	100				15750	my $base = $fileindex == 0 ? 0 : $fileoffsets[ $fileindex - 1 ] + 1;
273	6697					13899	$offset -= $base;
274
275	6697					8331	my $lineoffsets = $file_lineoffsets[$fileindex];
276	6697					6932	my ( $l, $r ) = ( 0, $#{$lineoffsets} );
	6697					11193
277	6697	100				13668	if ( 0 == $r ) {
278	13	100				23	if ( defined($rRoundedUp_line) ) {
279	6					8	${$rRoundedUp_line} = 0;
	6					9
280							}
281	13	100				29	if ( defined($rRoundedDown_line) ) {
282	6					7	${$rRoundedDown_line} = 0;
	6					10
283							}
284	13					29	return 0;
285							}
286	6684	50				13020	return 0 if ( 0 == $r );
287	6684					8725	my $line = int( ( $r + $l ) / 2 );
288
289	6684					12284	while ( $l < $r ) {
290	40239	100	100			341282	if (( ( $line > 0 && $lineoffsets->[ $line - 1 ] < $offset )
			100
291							\|\| $line == 0
292							)
293							&& $offset <= $lineoffsets->[$line]
294							)
295							{
296	6684	100				12936	if ( defined($rRoundedDown_line) ) {
297	3342	100				8018	${$rRoundedDown_line}
	3342	100				4598
298							= $line == 0 ? $line + 1
299							: ( $offset == ( $lineoffsets->[$line] ) ? $line + 1
300							: $line );
301							}
302	6684	100				12059	if ( defined($rRoundedUp_line) ) {
303	3342	100				8115	${$rRoundedUp_line}
	3342	100				17342
304							= $offset
305							== ( $line == 0 ? 0 : $lineoffsets->[ $line - 1 ] + 1 )
306							? $line + 1
307							: $line + 2;
308							}
309	6684					20140	return $line + 1;
310							}
311
312	33555	100	66			173338	if ( $line > 0 && $lineoffsets->[ $line - 1 ] >= $offset ) {
313	15664					23408	$r = $line;
314	15664					37789	$line = int( ( $r + $l ) / 2 );
315							}
316							else {
317	17891					42117	$l = $line;
318	17891					38240	$line = int( ( $r + $l + 1 ) / 2 );
319							}
320							}
321	0					0	return;
322							}
323
324							# line number is 1 based
325							sub offset2line {
326	6382			6382	1	74088	my ( $offset, $rRoundedUp_line, $rRoundedDown_line ) = @_;
327	6382					16829	my $fileindex = offset2fileindex($offset);
328	6382					21812	return offsetAndFileindex2line( $offset, $fileindex, $rRoundedUp_line,
329							$rRoundedDown_line );
330							}
331
332							sub get_line_offsets_of_fileindex {
333	0			0	1	0	my $fileindex = shift;
334	0					0	return $file_lineoffsets[$fileindex];
335							}
336
337							sub get_concatenated_text {
338	0			0	1	0	my ( $start, $length ) = @_;
339	0					0	return substr( $codetotal, $start, $length );
340							}
341
342							sub enter_files {
343	8			8	1	20179	(my $rfiles) = @_;
344
345							# reset all info
346	8					32	@fileoffsets = @file_lineoffsets = ();
347	8					22	$codetotal = '';
348
349							# preprocess files content
350	8					14	my $here = 0;
351	8					14	for my $file (@{$rfiles}) {
	8					21
352	35	50				605	if (-z $file) {
353	0					0	splice @{$rfiles}, $here, 1;
	0					0
354	0					0	next; # skip empty files
355							}
356
357	35					84	my ( $code, @lineoffsets ) = __get_text($file);
358	35	100				98	if ($code eq '') {
359	2					3	splice @{$rfiles}, $here, 1;
	2					6
360	2					8	next; # skip empty files
361							}
362
363							# we need the length of $code
364	33					84	$codetotal .= $code;
365	33					59	push @fileoffsets, ( length $codetotal ) - 1;
366
367							# save line offsets per file
368	33					669	push @file_lineoffsets, [@lineoffsets];
369	33					105	++$here;
370							}
371							}
372
373							sub find_duplicates_in {
374	2			2	1	50	( my $minlength, my $ignoreContentFilter, @files ) = @_;
375	2					9	enter_files(\@files);
376
377							# enter codestring
378	2	50				6294	build_suffixarray_and_lcp($codetotal) == 0
379							or die "Error building suffix array:$!\n";
380	2	50				32	warn "analysing content of ", length $codetotal, " bytes out of ",
381							scalar @files, " files...\n" if ($verbose);
382	2					1905	clip_lcp_to_fileboundaries( \@fileoffsets );
383	2					108	reduce_lcp_to_nonoverlapping_lengths();
384	2					451	set_lcp_to_zero_for_shadowed_substrings();
385
386	2					8	my $n = get_size();
387
388	2					4	my $cnt = 0;
389
390	2					4	my @ranks;
391	2					4	my $absminlength = abs($minlength);
392
393	2					6	my $last_lcp = 0;
394	149					296	@ranks = sort { get_len_at($b) <=> get_len_at($a) } grep {
	17479					20496
395
396							# filter out when the lcp for this index is smaller than our requested minimal length
397	2					1344	my $lcp; # length of match
398	17479					45697	my $res = ( $lcp = get_len_at($_) )
399							>= $absminlength; # works for bytes and lines
400
401	17479		100			40868	$res = $res && $lcp != $last_lcp;
402
403							# ignore filter
404	17479					17048	my $off;
405	17479	50	66			41672	if ( $res && defined $ignoreContentFilter ) {
406	0					0	$res = substr( $codetotal, $off = get_offset_at($_), $lcp )
407							!~ m{$ignoreContentFilter}xms;
408							}
409
410	17479	100	100			39499	if ( $res && 0 <= $minlength )
411							{ # minimal length is specified in line units
412	3115					7345	my $off = get_offset_at($_);
413
414							# include complete lines
415	3115					3172	my ( $upLine, $downLine );
416	3115					6727	my $startLine = offset2line( $off, \$upLine );
417	3115					7305	my $endLine = offset2line( $off + $lcp - 1, undef, \$downLine );
418	3115					5202	my $includesCompleteLines = $downLine - $upLine + 1;
419
420							# positive minlength is interpreted as lines
421	3115					5017	$res = $includesCompleteLines >= $minlength;
422							}
423
424	17479					18209	if (1) {
425	17479					22568	$last_lcp = $lcp;
426							}
427
428							#print "index $_ with lcp ", get_len_at($_), " and offset ", get_offset_at($_), " is ", $res ? "accepted\n" : "filtered out\n";
429							#print "index $_ with lcp ", get_len_at($_), " and offset ", get_offset_at($_), " is accepted\n" if $res;
430							#print "index $_ with lcp ", get_len_at($_), " and offset ", get_offset_at($_), " is filtered out\n" if !$res;
431	17479					41476	$res;
432							} ( 1 .. $n - 1 );
433
434	2	50				417	warn "ranking array created with ", scalar @ranks, " entries\n" if ($verbose);
435
436							# now report the remaining duplicates
437	2					17	for my $matchentry (@ranks) {
438
439							# how many duplicates?
440	39					59	my $count_dups = 2;
441	39					102	my $lcp = get_len_at($matchentry); # length of match
442	39					155	while ( $lcp == get_len_at( $matchentry + $count_dups - 1 ) ) {
443	5					18	++$count_dups;
444							}
445
446	39					99	report_dupes( $minlength, $count_dups, $lcp, $matchentry - 1 );
447							}
448							}
449
450	4			4		19163	use File::Find ();
	4					13
	4					2543
451
452							sub scan_directories {
453	1			1	1	985	my ( $minlength, $ignoreContentFilter, $regexAccept, $regexIgnore, @dirs )
454							= @_;
455	1					2	my @filepaths;
456
457	1	50				6	if ( !defined $regexIgnore ) {
458	0					0	$regexIgnore = qr{\.bak$\|~$\|\.swp$\|\.bup$}xmsi;
459							}
460	1	50	33			10	if ( defined $regexAccept && ref $regexAccept ne 'Regexp' ) {
461	0					0	$regexAccept = qr{$regexAccept}xms;
462							}
463	1	50	33			11	if ( defined $regexIgnore && ref $regexIgnore ne 'Regexp' ) {
464	0					0	$regexIgnore = qr{$regexIgnore}xms;
465							}
466	1	50	33			5	if ( defined $ignoreContentFilter
467							&& ref $ignoreContentFilter ne 'Regexp' )
468							{
469	0					0	$ignoreContentFilter = qr{$ignoreContentFilter}xms;
470							}
471	1	50	33			38	if ( 1 == scalar @dirs && !-d $dirs[0] ) {
472
473							# enable globs
474	0					0	@dirs = <$dirs[0]>;
475							}
476
477							File::Find::find(
478							sub {
479	5	50	66	5		260	if ( -f $_ && -s $_
			33
			66
480							&& ( !defined($regexAccept) \|\| $_ =~ m{$regexAccept}o ) )
481							{
482	4	50	33			33	if ( !defined($regexIgnore) \|\| $_ !~ m{$regexIgnore}o ) {
483	4					65	push @filepaths, $File::Find::name;
484							}
485							}
486							},
487							@dirs
488	1					120	);
489	1	50				8	if ( 0 == scalar @filepaths ) {
490	0					0	print "no files found for start dir(s) ", ( join ',', @dirs ),
491							" with accept filter $regexAccept and ignore filter $regexIgnore!\n";
492	0					0	return;
493							}
494
495	1					6	find_duplicates_in( $minlength, $ignoreContentFilter, @filepaths );
496							}
497
498							sub __get_text {
499	35			35		177	my $file = shift;
500	35					45	my $contents = '';
501	35					46	my @lineoffsets;
502	35	50		2		4213	open my $infile, '<', $file or die "cannot open file $file: $!\n";
	2					26
	2					7
	2					19
503	35					22975	while (<$infile>) {
504	498					514	$contents .= $_;
505	498					1181	push @lineoffsets, length($contents) - 1;
506							}
507	35					302	return ( $contents, @lineoffsets );
508							}
509
510							1;
511							__END__