File Coverage

blib/lib/School/Code/Compare.pm
Criterion Covered Total %
statement 38 41 92.6
branch 5 10 50.0
condition 1 3 33.3
subroutine 8 8 100.0
pod 2 5 40.0
total 54 67 80.6


line stmt bran cond sub pod time code
1             package School::Code::Compare;
2             # ABSTRACT: 'naive' metrics for code similarity
3             $School::Code::Compare::VERSION = '0.103'; # TRIAL
4 1     1   138225 use strict;
  1         11  
  1         32  
5 1     1   5 use warnings;
  1         3  
  1         34  
6              
7 1     1   494 use Text::Levenshtein::XS qw(distance);
  1         708  
  1         432  
8              
9             sub new {
10 1     1 0 120 my $class = shift;
11              
12 1         6 my $self = {
13             max_relative_diff => 2,
14             min_char_total => 20,
15             max_relative_distance => 0.8,
16             };
17 1         3 bless $self, $class;
18              
19 1         5 return $self;
20             }
21              
22             sub set_max_relative_difference {
23 1     1 0 2 my $self = shift;
24              
25 1         5 $self->{max_relative_diff} = shift;
26              
27             # make this chainable in OO-interface
28 1         4 return $self;
29             }
30              
31             sub set_min_char_total {
32 1     1 1 2 my $self = shift;
33              
34 1         3 $self->{min_char_total} = shift;
35              
36             # make this chainable in OO-interface
37 1         3 return $self;
38             }
39              
40             sub set_max_relative_distance {
41 1     1 0 2 my $self = shift;
42              
43 1         3 $self->{max_relative_distance} = shift;
44              
45             # make this chainable in OO-interface
46 1         2 return $self;
47             }
48              
49             sub measure {
50 2     2 1 45 my $self = shift;
51              
52 2         4 my $str1 = shift;
53 2         3 my $str2 = shift;
54              
55 2         8 my $length_str1 = length($str1);
56 2         6 my $length_str2 = length($str2);
57              
58 2 50       11 my ($short, $long) = $length_str1 < $length_str2 ?
59             ($length_str1, $length_str2) :
60             ($length_str2, $length_str1) ;
61              
62 2         4 my $diff = $long - $short;
63              
64 2 50 33     14 if ($self->{min_char_total} > $length_str1
65             or $self->{min_char_total} > $length_str2) {
66             return {
67             distance => undef,
68             ratio => undef,
69             length1 => $length_str1,
70             length2 => $length_str2,
71             delta_length => $diff,
72             comment => 'skipped: smaller than '
73             . $self->{min_char_total},
74 0         0 };
75             }
76              
77 2         8 my $longer_percent = $long / $short;
78              
79 2         4 $self->{max_distance} = $short * $self->{max_relative_distance};
80              
81 2 50       8 if ($longer_percent > $self->{max_relative_diff}) {
82             return {
83             distance => undef,
84             ratio => undef,
85             length1 => $length_str1,
86             length2 => $length_str2,
87             delta_length => $diff,
88             comment => 'skipped: delta in length bigger than factor '
89             . $self->{max_relative_diff},
90 0         0 };
91             }
92             else {
93 2         7 my $distance = distance($str1, $str2, $self->{max_distance});
94              
95 2 50       1087 if (defined $distance) {
96              
97 2 50       5 my $shorter_strlen = $length_str1 > $length_str2
98             ? $length_str1 : $length_str2;
99             # 100 - (different in %) = (equal in %)
100 2         9 my $chars_equal_percent = 100 - int($distance/$shorter_strlen*100 + 0.5);
101              
102             return {
103 2         17 distance => $distance,
104             ratio => $chars_equal_percent,
105             length1 => $length_str1,
106             length2 => $length_str2,
107             delta_length => $diff,
108             comment => 'comparison done',
109             };
110             }
111             else {
112             return {
113             distance => undef,
114             ratio => undef,
115             length1 => $length_str1,
116             length2 => $length_str2,
117             delta_length => $diff,
118             comment => 'skipped: distance higher than '
119             . $self->{max_distance} . ' (factor '
120 0           . $self->{max_relative_distance} . ')',
121             };
122             }
123             }
124             }
125              
126             1;
127              
128             __END__
129              
130             =pod
131              
132             =encoding UTF-8
133              
134             =head1 NAME
135              
136             School::Code::Compare - 'naive' metrics for code similarity
137              
138             =head1 VERSION
139              
140             version 0.103
141              
142             =head1 SYNOPSIS
143              
144             This distribution ships a script.
145             You migth want to look at the script L<compare-code> in the C<bin> directory.
146             For documentation of the used libraries, keep on reading.
147              
148             This calculates the Levenshtein Difference for two files, if they meet certain criterias:
149              
150             use School::Code::Compare;
151            
152             my $comparer = School::Code::Compare->new()
153             ->set_max_relative_difference(2)
154             ->set_min_char_total (20)
155             ->set_max_relative_distance(0.8);
156            
157             my $comparison = $comparer->measure('use v5.22; say "Hi"!',
158             'use v5.22; say "Hello";'
159             );
160             print $comparison->{distance} if $comparison
161              
162             =head1 FUNCTIONS
163              
164             =head2 set_max_char_difference
165              
166             Don't even start comparison, if the difference in char count is higher than set.
167              
168             =head2 set_min_char_total
169              
170             Don't even start comparison if a file is below this char count.
171              
172             =head2 set_max_distance
173              
174             Abort comparison (in the midst of comparison), if distance is becoming higher then set value.
175              
176             =head2 measure
177              
178             Do a comparison for two strings.
179             Gives back a hash reference with different information:
180              
181             # (example output from synopsis)
182             {
183             'comment' => 'comparison done',
184             'delta_length' => 3,
185             'distance' => 5
186             'length1' => 20,
187             'length2' => 23,
188             'ratio' => 79,
189             };
190              
191             =over 4
192              
193             =item distance
194              
195             The Levenshtein Distance.
196             See L<Text::Levenshtein::XS> for more information.
197              
198             =item ratio
199              
200             The ratio of the distance in chars to the average length of the compared strings.
201             A ratio of zero means, the strings are similar.
202             A ratio of 50 means, that 50% of a string is different.
203              
204             My experience is, that if you get a ratio below 30% you have to start looking if the code was copied and altered (if your concern is to find 'cheaters' in educational/school environments).
205             This method of measurement is by no means well established.
206             It may be even 'naive', but it just seems to work out quite well.
207             See L<School::Code::Compare::Judge> to see, how the results are currently interpreted.
208              
209             =item comment
210              
211             A comment on how the comparison went.
212              
213             =item delta_length
214              
215             Difference in length (chars) of the two compared strings.
216              
217             =back
218              
219             =head1 AUTHOR
220              
221             Boris Däppen <bdaeppen.perl@gmail.com>
222              
223             =head1 COPYRIGHT AND LICENSE
224              
225             This software is copyright (c) 2020 by Boris Däppen.
226              
227             This is free software; you can redistribute it and/or modify it under
228             the same terms as the Perl 5 programming language system itself.
229              
230             =cut