| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Text::Identify::BoilerPlate; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 21585 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 28 |  | 
| 4 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 28 |  | 
| 5 | 1 |  |  | 1 |  | 5 | use Carp; | 
|  | 1 |  |  |  |  | 5 |  | 
|  | 1 |  |  |  |  | 70 |  | 
| 6 | 1 |  |  | 1 |  | 4 | use base qw{ Exporter }; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 81 |  | 
| 7 | 1 |  |  | 1 |  | 5 | use Digest::MD5 qw{ md5 }; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 1416 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | our $VERSION   = '0.3.1'; | 
| 10 |  |  |  |  |  |  | our @EXPORT_OK = qw{ rem_boilerplate }; | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | my %config; | 
| 13 |  |  |  |  |  |  | my %dupl; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | my $LOG_FILE; | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | sub rem_boilerplate { | 
| 18 |  |  |  |  |  |  |  | 
| 19 | 0 |  |  | 0 | 1 |  | my ( $files, $arg_ref ) = @_; | 
| 20 |  |  |  |  |  |  |  | 
| 21 | 0 |  |  |  |  |  | _get_config($arg_ref); | 
| 22 |  |  |  |  |  |  |  | 
| 23 | 0 |  |  |  |  |  | my @files = @$files; | 
| 24 |  |  |  |  |  |  |  | 
| 25 | 0 |  |  |  |  |  | print "MIN_DUPL: $config{'min_dupl'}\n"; | 
| 26 |  |  |  |  |  |  |  | 
| 27 | 0 | 0 |  |  |  |  | if ($config{'min_dupl'} =~ s/\s*\%//) { | 
| 28 | 0 |  |  |  |  |  | my $num_of_files = @files; | 
| 29 | 0 |  |  |  |  |  | $config{'min_dupl'} =  int ( | 
| 30 |  |  |  |  |  |  | ($num_of_files * $config{'min_dupl'}) | 
| 31 |  |  |  |  |  |  | / 100 | 
| 32 |  |  |  |  |  |  | ); | 
| 33 | 0 | 0 |  |  |  |  | if ($config{'min_dupl'} < 2) { | 
| 34 | 0 |  |  |  |  |  | $config{'min_dupl'}=2; | 
| 35 |  |  |  |  |  |  | } | 
| 36 |  |  |  |  |  |  | } | 
| 37 |  |  |  |  |  |  |  | 
| 38 | 0 |  |  |  |  |  | print "MIN_DUPL: $config{'min_dupl'}\n"; | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  |  | 
| 41 | 0 |  |  |  |  |  | foreach my $file (@files) { | 
| 42 |  |  |  |  |  |  |  | 
| 43 | 0 |  |  |  |  |  | my @lines; | 
| 44 | 0 | 0 |  |  |  |  | open my $INPUT_FILE, '<', $file | 
| 45 |  |  |  |  |  |  | or croak "Can't open $file"; | 
| 46 | 0 |  |  |  |  |  | while ( my $line = <$INPUT_FILE> ) { | 
| 47 |  |  |  |  |  |  |  | 
| 48 | 0 |  |  |  |  |  | $line =~ s/^\s*//g; | 
| 49 | 0 |  |  |  |  |  | $line =~ s/\s*$//g; | 
| 50 | 0 |  |  |  |  |  | $line =~ s/\t/ /g; | 
| 51 | 0 |  |  |  |  |  | $line =~ s/ +/ /g; | 
| 52 | 0 |  |  |  |  |  | $line =~ s/\r//g; | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 0 |  |  |  |  |  | my $line_tmp = $line; | 
| 55 | 0 | 0 |  |  |  |  | if ( $config{'ignore_digits'} ) { | 
| 56 | 0 |  |  |  |  |  | $line_tmp =~ s/\d+/_D_/g; | 
| 57 |  |  |  |  |  |  | } | 
| 58 | 0 | 0 |  |  |  |  | if ( $config{'digest'} ) { | 
| 59 | 0 |  |  |  |  |  | $line_tmp = md5($line_tmp); | 
| 60 |  |  |  |  |  |  | } | 
| 61 |  |  |  |  |  |  |  | 
| 62 | 0 |  |  |  |  |  | $dupl{$line_tmp}++; | 
| 63 | 0 |  |  |  |  |  | push @lines, $line; | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | } | 
| 66 | 0 |  |  |  |  |  | $file = [ $file, \@lines ];    # name + content | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | # find duplicated lines | 
| 70 | 0 |  |  |  |  |  | foreach my $file (@files) { | 
| 71 |  |  |  |  |  |  |  | 
| 72 | 0 | 0 |  |  |  |  | if ( $config{'log'} ) { | 
| 73 | 0 |  |  |  |  |  | print {$LOG_FILE} "\n-- $file->[0] --\n"; | 
|  | 0 |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | } | 
| 75 |  |  |  |  |  |  |  | 
| 76 | 0 |  |  |  |  |  | my $lines = $file->[1]; | 
| 77 | 0 |  |  |  |  |  | my @lines = @$lines; | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 0 | 0 |  |  |  |  | if ( $config{'only_headers_and_footers'} ) { | 
| 80 | 0 |  |  |  |  |  | @lines = _rem_first_duplicates(@lines); | 
| 81 | 0 |  |  |  |  |  | @lines = reverse @lines; | 
| 82 | 0 |  |  |  |  |  | @lines = _rem_first_duplicates(@lines); | 
| 83 | 0 |  |  |  |  |  | @lines = reverse @lines; | 
| 84 |  |  |  |  |  |  | } | 
| 85 |  |  |  |  |  |  | else { | 
| 86 | 0 |  |  |  |  |  | @lines = _rem_duplicates(@lines); | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 0 |  |  |  |  |  | my $filename     = $file->[0]; | 
| 90 | 0 |  |  |  |  |  | my $new_filename = $filename . "." . $config{'suffix'}; | 
| 91 | 0 | 0 |  |  |  |  | open my $OUTPUT_FILE, '>', $new_filename | 
| 92 |  |  |  |  |  |  | or croak "Can't open $new_filename"; | 
| 93 | 0 |  |  |  |  |  | print {$OUTPUT_FILE} join( "\n", @lines ); | 
|  | 0 |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | } | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | } | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | sub _rem_first_duplicates { | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 0 |  |  | 0 |  |  | my @input = @_; | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | CUT: | 
| 104 | 0 |  |  |  |  |  | foreach my $line (@input) { | 
| 105 |  |  |  |  |  |  |  | 
| 106 | 0 |  |  |  |  |  | my $line_tmp = $line; | 
| 107 | 0 | 0 |  |  |  |  | if ( $config{'ignore_digits'} ) { | 
| 108 | 0 |  |  |  |  |  | $line_tmp =~ s/\d+/_D_/g; | 
| 109 |  |  |  |  |  |  | } | 
| 110 |  |  |  |  |  |  |  | 
| 111 | 0 | 0 |  |  |  |  | if ( $config{'digest'} ) { | 
| 112 | 0 |  |  |  |  |  | $line_tmp = md5($line_tmp); | 
| 113 |  |  |  |  |  |  | } | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 0 | 0 |  |  |  |  | if ( $line_tmp eq '' ) { | 
|  |  | 0 |  |  |  |  |  | 
| 116 | 0 |  |  |  |  |  | next CUT; | 
| 117 |  |  |  |  |  |  | } | 
| 118 |  |  |  |  |  |  | elsif ( $dupl{$line_tmp} > $config{'min_dupl'} ) { | 
| 119 | 0 |  |  |  |  |  | my $cut_line = shift @input; | 
| 120 | 0 | 0 |  |  |  |  | if ( $config{'log'} ) { | 
| 121 | 0 |  |  |  |  |  | print {$LOG_FILE} $cut_line, "\n"; | 
|  | 0 |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | } | 
| 123 |  |  |  |  |  |  | } | 
| 124 |  |  |  |  |  |  | else { | 
| 125 | 0 |  |  |  |  |  | last CUT; | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  | } | 
| 129 |  |  |  |  |  |  |  | 
| 130 | 0 |  |  |  |  |  | return @input; | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | } | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | sub _rem_duplicates { | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 0 |  |  | 0 |  |  | my @input = @_; | 
| 137 |  |  |  |  |  |  |  | 
| 138 | 0 |  |  |  |  |  | my $i = 0; | 
| 139 |  |  |  |  |  |  | CUT: | 
| 140 | 0 |  |  |  |  |  | foreach my $line (@input) { | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 0 |  |  |  |  |  | my $line_tmp = $line; | 
| 143 | 0 | 0 |  |  |  |  | if ( $config{'ignore_digits'} ) { | 
| 144 | 0 |  |  |  |  |  | $line_tmp =~ s/\d+/_D_/g; | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  |  | 
| 147 | 0 | 0 |  |  |  |  | if ( $config{'digest'} ) { | 
| 148 | 0 |  |  |  |  |  | $line_tmp = md5($line_tmp); | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  |  | 
| 151 | 0 | 0 |  |  |  |  | if ( $line_tmp eq '' ) { | 
|  |  | 0 |  |  |  |  |  | 
| 152 | 0 |  |  |  |  |  | $i++; | 
| 153 | 0 |  |  |  |  |  | next CUT; | 
| 154 |  |  |  |  |  |  | } | 
| 155 |  |  |  |  |  |  | elsif ( $dupl{$line_tmp} > $config{'min_dupl'} ) { | 
| 156 |  |  |  |  |  |  |  | 
| 157 | 0 |  |  |  |  |  | my $cut_line = splice( @input, $i, 1 ); | 
| 158 | 0 | 0 |  |  |  |  | if ( $config{'log'} ) { | 
| 159 | 0 |  |  |  |  |  | print {$LOG_FILE} $cut_line, "\n"; | 
|  | 0 |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | } | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  |  | 
| 164 | 0 |  |  |  |  |  | $i++; | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  |  | 
| 168 | 0 |  |  |  |  |  | return @input; | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | } | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | sub _get_config { | 
| 173 |  |  |  |  |  |  |  | 
| 174 | 0 |  |  | 0 |  |  | %config = ( | 
| 175 |  |  |  |  |  |  | min_dupl                 => 3, | 
| 176 |  |  |  |  |  |  | suffix                   => 'content', | 
| 177 |  |  |  |  |  |  | ignore_digits            => 1, | 
| 178 |  |  |  |  |  |  | only_headers_and_footers => 1, | 
| 179 |  |  |  |  |  |  | digest                   => 0, | 
| 180 |  |  |  |  |  |  | log                      => 'text-identify-boilerplate.log' | 
| 181 |  |  |  |  |  |  | ); | 
| 182 |  |  |  |  |  |  |  | 
| 183 | 0 |  |  |  |  |  | my $arg_ref    = $_[0]; | 
| 184 | 0 |  |  |  |  |  | my %config_arg = %$arg_ref; | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | # merge hashes | 
| 187 | 0 |  |  |  |  |  | foreach my $opt (keys %config) { | 
| 188 | 0 | 0 |  |  |  |  | if (defined $config_arg{$opt}) { | 
| 189 | 0 |  |  |  |  |  | $config{$opt}=$config_arg{$opt}; | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  |  | 
| 193 | 0 | 0 |  |  |  |  | if ( $config{'log'} ) { | 
| 194 | 0 | 0 |  |  |  |  | open $LOG_FILE, '>', $config{'log'} | 
| 195 |  |  |  |  |  |  | or croak "Can't open $config{'log'}"; | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 | 0 |  |  |  |  |  | return; | 
| 199 |  |  |  |  |  |  |  | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | 1;    # End of Text::Identify::BoilerPlate | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | __END__ |