File Coverage

blib/lib/AI/ConfusionMatrix.pm
Criterion Covered Total %
statement 85 85 100.0
branch 15 18 83.3
condition n/a
subroutine 10 10 100.0
pod 2 3 66.6
total 112 116 96.5


line stmt bran cond sub pod time code
1             package AI::ConfusionMatrix;
2             $AI::ConfusionMatrix::VERSION = '0.010';
3 2     2   118334 use strict;
  2         12  
  2         77  
4 2     2   9 use warnings;
  2         3  
  2         47  
5 2     2   10 use Carp;
  2         9  
  2         112  
6 2     2   11 use Exporter 'import';
  2         2  
  2         88  
7             our @EXPORT= qw (getConfusionMatrix makeConfusionMatrix);
8 2     2   10 use strict;
  2         4  
  2         39  
9 2     2   1276 use Tie::File;
  2         33373  
  2         1413  
10              
11             # ABSTRACT: Make a confusion matrix
12              
13             sub makeConfusionMatrix {
14 4     4 1 10336 my ($matrix, $file, $delem) = @_;
15 4 100       14 unless(defined $delem) {
16 2         4 $delem = ',';
17             }
18              
19 4 50       14 carp ('First argument must be a hash reference') if ref($matrix) ne 'HASH';
20              
21 4         10 my %cmData = genConfusionMatrixData($matrix);
22             # This ties @output_array to the output file. Each output_array item represents a line in the output file
23 4 50       26 tie my @output_array, 'Tie::File', $file or carp "$!";
24             # Empty the file
25 4         435 @output_array = ();
26              
27 4         317 my @columns = @{$cmData{columns}};
  4         12  
28 4         20 map {$output_array[0] .= $delem . $_} join $delem, (@columns, 'TOTAL', 'TP', 'FP', 'FN', 'SENS', 'ACC');
  4         17  
29 4         1749 my $line = 1;
30 4         8 my @expected = sort keys %{$matrix};
  4         18  
31 4         12 for my $expected (@expected) {
32 12         38 $output_array[$line] = $expected;
33 12         3395 my $lastIndex = 0;
34 12         13 my $index;
35 12         15 for my $predicted (sort keys %{$matrix->{$expected}}) {
  12         44  
36             # Calculate the index of the label in the output_array of columns
37 28         71 $index = _findIndex($predicted, \@columns);
38             # Print some of the delimiter to get to the column of the next value predicted
39 28         122 $output_array[$line] .= $delem x ($index - $lastIndex) . $matrix->{$expected}{$predicted};
40 28         8389 $lastIndex = $index;
41             }
42              
43             # Get to the columns of the stats
44 12         53 $output_array[$line] .= $delem x (scalar(@columns) - $lastIndex + 1);
45             $output_array[$line] .= join $delem, (
46             $cmData{stats}{$expected}{'total'},
47             $cmData{stats}{$expected}{'tp'},
48             $cmData{stats}{$expected}{'fp'},
49             $cmData{stats}{$expected}{'fn'},
50             sprintf('%.2f%%', $cmData{stats}{$expected}{'sensitivity'}),
51 12         3380 sprintf('%.2f%%', $cmData{stats}{$expected}{'acc'})
52             );
53 12         2968 ++$line;
54             }
55             # Print the TOTAL row to the csv file
56 4         18 $output_array[$line] = 'TOTAL' . $delem;
57 4         1256 map {$output_array[$line] .= $cmData{totals}{$_} . $delem} (@columns);
  18         5313  
58             $output_array[$line] .= join $delem, (
59             $cmData{totals}{'total'},
60             $cmData{totals}{'tp'},
61             $cmData{totals}{'fp'},
62             $cmData{totals}{'fn'},
63             sprintf('%.2f%%', $cmData{totals}{'sensitivity'}),
64 4         1279 sprintf('%.2f%%', $cmData{totals}{'acc'})
65             );
66              
67 4         1061 untie @output_array;
68             }
69              
70             sub getConfusionMatrix {
71 1     1 1 591 my ($matrix) = @_;
72              
73 1 50       4 carp ('First argument must be a hash reference') if ref($matrix) ne 'HASH';
74 1         4 return genConfusionMatrixData($matrix);
75             }
76              
77             sub genConfusionMatrixData {
78 5     5 0 9 my $matrix = shift;
79 5         7 my @expected = sort keys %{$matrix};
  5         27  
80 5         15 my %stats;
81             my %totals;
82 5         0 my @columns;
83 5         10 for my $expected (@expected) {
84 15         30 $stats{$expected}{'fn'} = 0;
85 15         21 $stats{$expected}{'tp'} = 0;
86             # Ensure that the False Positive counter is defined to be able to compute the total later
87 15 100       27 unless(defined $stats{$expected}{'fp'}) {
88 10         18 $stats{$expected}{'fp'} = 0;
89             }
90 15         20 for my $predicted (keys %{$matrix->{$expected}}) {
  15         33  
91 35         55 $stats{$expected}{'total'} += $matrix->{$expected}->{$predicted};
92 35 100       63 $stats{$expected}{'tp'} += $matrix->{$expected}->{$predicted} if $expected eq $predicted;
93 35 100       60 if ($expected ne $predicted) {
94 20         32 $stats{$expected}{'fn'} += $matrix->{$expected}->{$predicted};
95 20         28 $stats{$predicted}{'fp'} += $matrix->{$expected}->{$predicted};
96             }
97 35         44 $totals{$predicted} += $matrix->{$expected}->{$predicted};
98             # Add the label to the array of columns if it does not contain it already
99 35 100       52 push @columns, $predicted unless _findIndex($predicted, \@columns);
100             }
101              
102 15         42 $stats{$expected}{'acc'} = ($stats{$expected}{'tp'} * 100) / $stats{$expected}{'total'};
103             }
104              
105 5         11 for my $expected (@expected) {
106 15         22 $totals{'total'} += $stats{$expected}{'total'};
107 15         17 $totals{'tp'} += $stats{$expected}{'tp'};
108 15         22 $totals{'fn'} += $stats{$expected}{'fn'};
109 15         29 $totals{'fp'} += $stats{$expected}{'fp'};
110 15         32 $stats{$expected}{'sensitivity'} = ($stats{$expected}{'tp'} * 100) / ($stats{$expected}{'tp'} + $stats{$expected}{'fp'});
111             }
112              
113 5         14 $totals{'acc'} = ($totals{'tp'} * 100) / $totals{'total'};
114 5         11 $totals{'sensitivity'} = ($totals{'tp'} * 100) / ($totals{'tp'} + $totals{'fp'});
115              
116             return (
117 5         37 columns => [sort @columns],
118             stats => \%stats,
119             totals => \%totals
120             );
121             }
122              
123             sub _findIndex {
124 63     63   96 my ($string, $array) = @_;
125 63         123 for (0 .. @$array - 1) {
126 144 100       166 return $_ + 1 if ($string eq @{$array}[$_]);
  144         311  
127             }
128             }
129              
130             =head1 NAME
131              
132             AI::ConfusionMatrix - make a confusion matrix
133              
134             =head1 SYNOPSIS
135              
136             my %matrix;
137              
138             # Loop over your predictions
139             # [...]
140              
141             $matrix{$expected}{$predicted} += 1;
142              
143             # [...]
144              
145             makeConfusionMatrix(\%matrix, 'output.csv');
146              
147              
148             =head1 DESCRIPTION
149              
150             This module prints a L<confusion matrix|https://en.wikipedia.org/wiki/Confusion_matrix> from a hash reference. This module tries to be generic enough to be used within a lot of machine learning projects.
151              
152             =head3 Functions:
153              
154             =head4 C<makeConfusionMatrix($hash_ref, $file [, $delimiter ])>
155              
156             This function makes a confusion matrix from C<$hash_ref> and writes it to C<$file>. C<$file> can be a filename or a file handle opened with the C<w+> mode. If C<$delimiter> is present, it is used as a custom separator for the fields in the confusion matrix.
157              
158             Examples:
159              
160             makeConfusionMatrix(\%matrix, 'output.csv');
161             makeConfusionMatrix(\%matrix, 'output.csv', ';');
162             makeConfusionMatrix(\%matrix, *$fh);
163              
164             The hash reference must look like this:
165              
166             $VAR1 = {
167             'value_expected1' => {
168             'value_predicted1' => number_of_predictions
169             },
170             'value_expected2' => {
171             'value_predicted1' => number_of_predictions,
172             'value_predicted2' => number_of_predictions
173             },
174             'value_expected3' => {
175             'value_predicted3' => number_of_predictions
176             }
177             };
178              
179             The output will be in CSV. Here is an example:
180              
181             ,1974,1978,2002,2003,2005,TOTAL,TP,FP,FN,SENS,ACC
182             1974,3,1,,,2,6,3,4,3,42.86%,50.00%
183             1978,1,5,,,,6,5,4,1,55.56%,83.33%
184             2002,2,2,8,,,12,8,1,4,88.89%,66.67%
185             2003,1,,,7,2,10,7,0,3,100.00%,70.00%
186             2005,,1,1,,6,8,6,4,2,60.00%,75.00%
187             TOTAL,7,9,9,7,10,42,29,13,13,69.05%,69.05%
188              
189             Prettified:
190              
191             | | 1974 | 1978 | 2002 | 2003 | 2005 | TOTAL | TP | FP | FN | SENS | ACC |
192             |-------|------|------|------|------|------|-------|----|----|----|---------|--------|
193             | 1974 | 3 | 1 | | | 2 | 6 | 3 | 4 | 3 | 42.86% | 50.00% |
194             | 1978 | 1 | 5 | | | | 6 | 5 | 4 | 1 | 55.56% | 83.33% |
195             | 2002 | 2 | 2 | 8 | | | 12 | 8 | 1 | 4 | 88.89% | 66.67% |
196             | 2003 | 1 | | | 7 | 2 | 10 | 7 | 0 | 3 | 100.00% | 70.00% |
197             | 2005 | | 1 | 1 | | 6 | 8 | 6 | 4 | 2 | 60.00% | 75.00% |
198             | TOTAL | 7 | 9 | 9 | 7 | 10 | 42 | 29 | 13 | 13 | 69.05% | 69.05% |
199              
200             =over
201              
202             =item TP:
203              
204             True Positive
205              
206             =item FP:
207              
208             False Positive
209              
210             =item FN:
211              
212             False Negative
213              
214             =item SENS:
215              
216             Sensitivity. Number of true positives divided by the number of predicted positives (true positives plus false positives), expressed as a percentage.
217              
218             =item ACC:
219              
220             Accuracy
221              
222             =back
223              
224             =head4 C<getConfusionMatrix($hash_ref)>
225              
226             Get the data used to compute the table above.
227              
228             Example:
229              
230             my %cm = getConfusionMatrix(\%matrix);
231              
232             =head1 AUTHOR
233              
234             Vincent Lequertier
235              
236             =head1 LICENSE
237              
238             This library is free software; you can redistribute it and/or modify
239             it under the same terms as Perl itself.
240              
241             =cut
242              
243             1;
244              
245             # vim: set ts=4 sw=4 tw=0 fdm=marker :
246