File Coverage

blib/lib/Algorithm/AM/Result.pm
Criterion Covered Total %
statement 242 245 98.7
branch 46 50 92.0
condition 5 9 55.5
subroutine 17 17 100.0
pod 6 6 100.0
total 316 327 96.6


line stmt bran cond sub pod time code
1             # encapsulate information about a single classification result
2             package Algorithm::AM::Result;
3 10     10   76 use strict;
  10         25  
  10         416  
4 10     10   52 use warnings;
  10         18  
  10         787  
5             our $VERSION = '3.13';
6             # ABSTRACT: Store results of an AM classification
7 10     10   7190 use Text::Table;
  10         286212  
  10         2071  
8              
9             #pod =head2 SYNOPSIS
10             #pod
11             #pod use Algorithm::AM;
12             #pod
13             #pod my $am = Algorithm::AM->new('finnverb', -commas => 'no');
14             #pod my ($result) = $am->classify;
15             #pod print @{ $result->winners };
16             #pod print $result->statistical_summary;
17             #pod
18             #pod =head2 DESCRIPTION
19             #pod
20             #pod This package encapsulates all of the classification information
21             #pod generated by L, including the assigned class,
22             #pod score to each class, gang effects, analogical sets,
23             #pod and timing information. It also provides several methods for
24             #pod generating printable reports with this information.
25             #pod
26             #pod Note that the words 'score' and 'point' are used here to represent
27             #pod whatever count is assigned by analogical modeling during
28             #pod classification. This can be either pointers or occurrences. For an
29             #pod explanation of this, see L.
30             #pod
31             #pod All of the scores returned by the methods here are scalars with
32             #pod special PV and NV values. You should excercise caution when doing
33             #pod calculations with them. See L for more
34             #pod information.
35             #pod
36             #pod =cut
37              
38             ## TODO: variables consider exporting someday
39             ## @itemcontextchain
40             ## %itemcontextchainhead
41             ## %context_to_class
42             ## %context_size
43             use Class::Tiny qw(
44             exclude_nulls
45             given_excluded
46             cardinality
47             test_in_train
48             test_item
49             count_method
50              
51             start_time
52             end_time
53              
54             training_set
55              
56             scores
57             high_score
58             total_points
59             winners
60             is_tie
61             result
62              
63             scores_normalized
64             ), {
65             'scores_normalized' => sub {
66 1         3335 my ($self) = @_;
67 1         27 my $total_points = $self->total_points;
68 1         30 my $scores = $self->scores;
69 1         8 my $normalized = {};
70 1         6 for my $class (keys %$scores){
71 2         10 $normalized->{$class} = $scores->{$class} / $total_points
72             }
73 1         16 return $normalized;
74             },
75 10     10   137 };
  10         21  
  10         164  
76 10     10   31320 use Carp 'croak';
  10         24  
  10         805  
77 10     10   6514 use Algorithm::AM::BigInt 'bigcmp';
  10         61  
  10         33764  
78              
79             # For printing percentages in reports
80             my $percentage_format = '%.3f';
81              
82             #pod =head1 REPORT METHODS
83             #pod
84             #pod The methods below return human eye-friendly reports about the
85             #pod classification. The return value is a reference, so it must be
86             #pod dereferenced for printing like so:
87             #pod
88             #pod print ${ $result->statistical_summary };
89             #pod
90             #pod =head2 C
91             #pod
92             #pod Returns a scalar (string) ref containing information about the
93             #pod configuration at the time of classification. Information from the
94             #pod following accessors is included:
95             #pod
96             #pod exclude_nulls
97             #pod given_excluded
98             #pod cardinality
99             #pod test_in_train
100             #pod test_item
101             #pod count_method
102             #pod
103             #pod =cut
104             sub config_info {
105 2     2 1 3669 my ($self) = @_;
106 2         9 my @headers = ('Option', 'Setting');
107             my @rows = (
108 2 100       5 [ "Given context", (join ' ', @{$self->test_item->features}) .
  2 100       87  
    100          
109             ', ' . $self->test_item->comment],
110             [ "Nulls", ($self->exclude_nulls ? 'exclude' : 'include')],
111             [ "Gang", $self->count_method],
112             [ "Test item in training set", ($self->test_in_train ? 'yes' : 'no')],
113             [ "Test item excluded", ($self->given_excluded ? 'yes' : 'no')],
114             [ "Size of training set", $self->training_set->size ],
115             [ "Number of active features", $self->cardinality ],
116             );
117 2         23 my @table = _make_table(\@headers, \@rows);
118 2         882 my $info = join '', @table;
119 2         26 return \$info;
120             }
121              
122             # input several variables from AM's guts (sum, pointers,
123             # itemcontextchainhead and itemcontextchain). Calculate the
124             # prediction statistics, and
125             # store information needed for computing analogical sets.
126             # Set result to tie/correct/incorrect and also is_tie if
127             # expected class is provided, and high_score, scores, winners, and
128             # total_points.
129             sub _process_stats {
130 193     193   819 my ($self, $sum, $pointers,
131             $itemcontextchainhead, $itemcontextchain, $context_to_class,
132             $raw_gang, $active_feats, $context_size) = @_;
133 193         585 my $total_points = $pointers->{grand_total};
134 193         464 my $max = '';
135 193         538 my @winners;
136             my %scores;
137              
138             # iterate all possible classes and store the ones that have a
139             # non-zero score. Store the high-scorers, as well.
140             # 1) find which one(s) has the highest score (the prediction) and
141             # 2) print out the ones with scores (probability of prediction)
142 193         5274 for my $class_index (1 .. $self->training_set->num_classes) {
143 559         865 my $class_score;
144             # skip classes with no score
145 559 100       1742 next unless $class_score = $sum->[$class_index];
146              
147 355         7604 my $class = $self->training_set->_class_for_index($class_index);
148 355         1336 $scores{$class} = $class_score;
149              
150             # check if the class has the highest score, or ties for it
151 355         606 do {
152 355         1511 my $cmp = bigcmp($class_score, $max);
153 355 100       1223 if ($cmp > 0){
    100          
154 274         789 @winners = ($class);
155 274         682 $max = $class_score;
156             }elsif($cmp == 0){
157 2         7 push @winners, $class;
158             }
159             };
160             }
161              
162             # set result to tie/correct/incorrect after comparing
163             # expected/actual class labels. Only do this if the expected
164             # class label is known.
165 193 100       4697 if(my $expected = $self->test_item->class){
166 191 100 100     1102 if(exists $scores{$expected} &&
167             bigcmp($scores{$expected}, $max) == 0){
168 178 100       525 if(@winners > 1){
169 2         45 $self->result('tie');
170             }else{
171 176         4106 $self->result('correct');
172             }
173             }else{
174 13         294 $self->result('incorrect');
175             }
176             }
177 193 100       1747 if(@winners > 1){
178 2         58 $self->is_tie(1);
179             }
180 193         4556 $self->high_score($max);
181 193         5243 $self->scores(\%scores);
182 193         5280 $self->winners(\@winners);
183 193         4853 $self->total_points($total_points);
184 193         1364 $self->{pointers} = $pointers;
185 193         542 $self->{itemcontextchainhead} = $itemcontextchainhead;
186 193         558 $self->{itemcontextchain} = $itemcontextchain;
187 193         544 $self->{context_to_class} = $context_to_class;
188 193         534 $self->{raw_gang} = $raw_gang;
189 193         528 $self->{active_feats} = $active_feats;
190 193         442 $self->{context_size} = $context_size;
191 193         679 return;
192             }
193              
194             #pod =head2 C
195             #pod
196             #pod Returns a scalar reference (string) containing a statistical summary
197             #pod of the classification results. The summary includes all possible
198             #pod predicted classes with their scores and percentage scores and the
199             #pod total score for all classes. Whether the predicted class
200             #pod is correct/incorrect/a tie of some sort is also included, if the
201             #pod test item had a known class.
202             #pod
203             #pod =cut
204             sub statistical_summary {
205 4     4 1 4263 my ($self) = @_;
206 4         10 my %scores = %{$self->scores};
  4         132  
207 4         163 my $total_points = $self->total_points;
208              
209             # Make a table with information about predictions for different
210             # classes. Each row contains a class name, the score,
211             # and the percentage predicted.
212 4         26 my @rows;
213 4         21 for my $class (sort keys %scores){
214             push @rows, [ $class, $scores{$class},
215             sprintf($percentage_format,
216 8         84 100 * $scores{$class} / $total_points) ];
217             }
218             # add a Total row
219 4         55 push @rows, [ 'Total', $total_points ];
220              
221 4         26 my @table = _make_table(['Class', 'Score', 'Percentage'],
222             \@rows);
223             # copy the rule from the first row into the second to last row
224             # to separate the Total row
225 4         1436 splice(@table, $#table - 1, 0, $table[0]);
226              
227 4         15 my $info = "Statistical Summary\n";
228 4         38 $info .= join '', @table;
229             # the predicted class (the one with the highest score)
230             # and the result (correct/incorrect/tie).
231 4 100       235 if ( defined (my $expected = $self->test_item->class) ) {
232 3         10 $info .= "Expected class: $expected\n";
233 3         41 my $result = $self->result;
234 3 100       26 if ( $result eq 'correct') {
    100          
235 1         4 $info .= "Correct class predicted.\n";
236             }elsif($result eq 'tie'){
237 1         2 $info .= "Prediction is a tie.\n";
238             }else {
239 1         4 $info .= "Incorrect class predicted.\n";
240             }
241             }else{
242 1         4 $info .= "Expected class unknown\n";
243             }
244 4         46 return \$info;
245             }
246              
247             #TODO: the keys for this set don't seem to make any sense.
248             sub analogical_set {
249 3     3 1 6067 my ($self) = @_;
250 3 50       16 if(!exists $self->{_analogical_set}){
251 3         15 $self->_calculate_analogical_set;
252             }
253             # make a safe copy
254 3         7 my %set = %{$self->{_analogical_set}};
  3         14  
255 3         12 return \%set;
256             }
257              
258             #pod =head2 C
259             #pod
260             #pod Returns a scalar reference (string) containing the analogical set,
261             #pod meaning all items that contributed to the predicted class, along
262             #pod with the amount contributed by each item (score and
263             #pod percentage overall). Items are ordered by appearance in the data
264             #pod set.
265             #pod
266             #pod =cut
267             sub analogical_set_summary {
268 1     1 1 3093 my ($self) = @_;
269 1         7 my $set = $self->analogical_set;
270 1         35 my $total_points = $self->total_points;
271              
272             # Make a table for the analogical set. Each row contains an
273             # item with its class, comment, score, and the percentage
274             # of total score contributed.
275 1         9 my @rows;
276 1         7 foreach my $id (sort keys %$set){
277 4         9 my $entry = $set->{$id};
278 4         9 my $score = $entry->{score};
279             push @rows, [
280             $entry->{item}->class,
281             $entry->{item}->comment,
282 4         12 $score,
283             sprintf($percentage_format, 100 * $score / $total_points)
284             ];
285             }
286 1         8 my @table = _make_table(
287             ['Class', 'Item', 'Score', 'Percentage'], \@rows);
288 1         428 my $info = "Analogical Set\nTotal Frequency = $total_points\n";
289 1         10 $info .= join '', @table;
290 1         18 return \$info;
291             }
292              
293             # calculate and store analogical effects in $self->{_analogical_set}
294             sub _calculate_analogical_set {
295 3     3   8 my ($self) = @_;
296 3         106 my $train = $self->training_set;
297 3         49 my %set;
298 3         7 foreach my $context ( keys %{$self->{pointers}} ) {
  3         34  
299             next unless
300 11 100       36 exists $self->{itemcontextchainhead}->{$context};
301 8         29 for (
302             my $index = $self->{itemcontextchainhead}->{$context};
303             defined $index;
304             $index = $self->{itemcontextchain}->[$index]
305             )
306             {
307 10         33 my $item = $train->get_item($index);
308             $set{$item->id} = {
309             item => $item,
310 10         47 score => $self->{pointers}->{$context}
311             };
312             }
313             }
314 3         16 $self->{_analogical_set} = \%set;
315 3         9 return;
316             }
317              
318             sub gang_effects {
319 4     4 1 938 my ($self) = @_;
320 4 100       20 if(!$self->{_gang_effects}){
321 3         15 $self->_calculate_gangs;
322             }
323 4         16 return $self->{_gang_effects};
324             }
325              
326             #pod =head2 C
327             #pod
328             #pod Returns a scalar reference (string) containing the gang effects on the
329             #pod final class prediction.
330             #pod
331             #pod A single boolean parameter can be provided to turn on list printing,
332             #pod meaning gang items items are printed. This is false (off) by default.
333             #pod
334             #pod =cut
335             sub gang_summary {
336 3     3 1 4246 my ($self, $print_list) = @_;
337 3         124 my $test_item = $self->test_item;
338              
339 3         25 my $gangs = $self->gang_effects;
340              
341             # Make a table for the gangs with these rows:
342             # Percentage
343             # Score
344             # Num
345             # Class
346             # Features
347             # item comment
348 3         6 my @rows;
349             # first row is a header with test item for easy reference
350             push @rows, [
351             'Context',
352             undef,
353             undef,
354             undef,
355 3         9 @{$test_item->features},
  3         12  
356             ];
357              
358             # store the number of rows added for each gang
359             # will help with printing later
360 3         8 my @gang_rows;
361 3         6 my $current_row = -1;
362             # add information for each gang; sort by order of highest to
363             # lowest effect
364 3         10 foreach my $gang (@$gangs){
365 10         18 $current_row++;
366 10         22 $gang_rows[$current_row]++;
367 10         44 my $features = $gang->{features};
368             # add the gang supracontext, effect and score
369             push @rows, [
370             sprintf($percentage_format, 100 * $gang->{effect}),
371             $gang->{score},
372             undef,
373             undef,
374             # print undefined feature slots as asterisks
375 10 100       72 map {length($_) ? $_ : '*'} @$features
  58         158  
376             ];
377             # add each class in the gang, along with the total number
378             # and effect of the gang items supporting it
379 10         25 for my $class (sort keys %{ $gang->{class} }){
  10         37  
380 12         50 $gang_rows[$current_row]++;
381             push @rows, [
382             sprintf($percentage_format,
383             100 * $gang->{class}->{$class}->{effect}),
384             $gang->{class}->{$class}->{score},
385 12         67 scalar @{ $gang->{data}->{$class} },
  12         45  
386             $class,
387             undef
388             ];
389 12 100       38 if($print_list){
390             # add the list of items in the given context
391 4         7 for my $item (@{ $gang->{data}->{$class} }){
  4         11  
392 4         8 $gang_rows[$current_row]++;
393             push @rows, [
394             undef,
395             undef,
396             undef,
397             undef,
398 4         6 @{ $item->features },
  4         12  
399             $item->comment,
400             ];
401             }
402             }
403             }
404             }
405              
406             # construct the table from the rows
407             my @headers = (
408             \'| ',
409             'Percentage' => \' | ',
410             'Score' => \' | ',
411             'Num Items' => \' | ',
412             'Class' => \' | ',
413 3         9 ('' => \' ') x @{$test_item->features}
  3         10  
414             );
415 3         11 pop @headers;
416 3 100       8 if($print_list){
417 1         5 push @headers, \' | ', 'Item Comment';
418             }
419 3         6 push @headers, \' |';
420 3         20 my @rule = qw(- +);
421 3         24 my $table = Text::Table->new(@headers);
422 3         22384 $table->load(@rows);
423             # main header
424 3         4267 $current_row = 0;
425 3         19 my $return = $table->rule(@rule) .
426             $table->title .
427             $table->body($current_row) .
428             $table->rule(@rule);
429 3         230320 $current_row++;
430             # add info with a header for each gang
431 3         16 for my $num (@gang_rows){
432             # a row of '*' separates each gang
433 10         41 $return .= $table->rule('*','*') .
434             $table->body($current_row) .
435             $table->rule(@rule);
436 10         12240 $current_row++;
437 10         49 for(1 .. $num - 1){
438 16         53 $return .= $table->body($current_row);
439 16         1659 $current_row++;
440             }
441             }
442 3         15 $return .= $table->rule(@rule);
443 3         1192 return \$return;
444             }
445              
446             sub _calculate_gangs {
447 3     3   9 my ($self) = @_;
448 3         122 my $train = $self->training_set;
449 3         179 my $total_points = $self->total_points;
450 3         25 my $raw_gang = $self->{raw_gang};
451 3         8 my @gangs;
452              
453 3         8 foreach my $context (keys %{$raw_gang})
  3         16  
454             {
455 10         18 my $gang = {};
456 10         29 my @features = $self->_unpack_supracontext($context);
457             # for now, store gangs by the supracontext printout
458 10 100       26 my $key = join ' ', map {length($_) ? $_ : '-'} @features;
  58         155  
459 10         41 $gang->{score} = $raw_gang->{$context};
460 10         31 $gang->{effect} = $raw_gang->{$context} / $total_points;
461 10         23 $gang->{features} = \@features;
462              
463 10         31 my $num_class_pointers = $self->{pointers}->{$context};
464             # if the supracontext is homogenous
465 10 100       33 if ( my $class_index = $self->{context_to_class}->{$context} ) {
466             # store a 'homogenous' key that indicates this, besides
467             # indicating the unanimous class prediction.
468 8         60 my $class = $train->_class_for_index($class_index);
469 8         22 $gang->{homogenous} = $class;
470 8         16 my @data;
471 8         31 for (
472             my $index = $self->{itemcontextchainhead}->{$context};
473             defined $index;
474             $index = $self->{itemcontextchain}->[$index]
475             )
476             {
477 9         29 push @data, $train->get_item($index);
478             }
479 8         29 $gang->{data}->{$class} = \@data;
480 8         22 $gang->{size} = scalar @data;
481 8         33 $gang->{class}->{$class}->{score} = $num_class_pointers;
482             $gang->{class}->{$class}->{effect} =
483 8         22 $gang->{effect};
484             }
485             # for heterogenous supracontexts we have to store data for
486             # each class
487             else {
488 2         6 $gang->{homogenous} = 0;
489             # first loop through the data and sort by class, also
490             # finding the total gang size
491 2         5 my $size = 0;
492 2         5 my %data;
493 2         11 for (
494             my $index = $self->{itemcontextchainhead}->{$context};
495             defined $index;
496             $index = $self->{itemcontextchain}->[$index]
497             )
498             {
499 4         40 my $item = $train->get_item($index);
500 4         8 push @{ $data{$item->class} }, $item;
  4         12  
501 4         17 $size++;
502             }
503 2         6 $gang->{data} = \%data;
504 2         7 $gang->{size} = $size;
505              
506             # then store aggregate statistics for each class
507 2         8 for my $class (keys %data){
508 4         16 $gang->{class}->{$class}->{score} = $num_class_pointers;
509             $gang->{class}->{$class}->{effect} =
510             # score*num_data/total
511 4         9 @{ $data{$class} } * $num_class_pointers / $total_points;
  4         18  
512             }
513             }
514 10         29 push @gangs, $gang;
515             }
516              
517             # sort by score and then alphabetically by class labels
518             @gangs = sort{
519 3         23 bigcmp($b->{score}, $a->{score}) ||
520 0         0 (join '', sort keys %{ $b->{class} })
521             cmp
522 9 50       36 (join '', sort keys %{ $a->{class} })} @gangs;
  0         0  
523 3         12 $self->{_gang_effects} = \@gangs;
524 3         9 return;
525             }
526              
527             # Unpack and return the supracontext features.
528             # Blank entries mean the variable may be anything, e.g.
529             # ('a' 'b' '') means a supracontext containing items
530             # wich have ('a' 'b' whatever) as variable values.
531             sub _unpack_supracontext {
532 10     10   24 my ($self, $context) = @_;
533 10         46 my @context_list = unpack "S!4", $context;
534 10         18 my @alist = @{$self->{active_feats}};
  10         32  
535 10         18 my (@features) = @{ $self->test_item->features };
  10         293  
536 10         232 my $exclude_nulls = $self->exclude_nulls;
537 10         53 my $j = 1;
538 10         22 foreach my $a (reverse @alist) {
539 40         73 my $partial_context = pop @context_list;
540 40         119 for ( ; $a ; --$a ) {
541 46 50       95 if($exclude_nulls){
542 46         125 ++$j while !defined $features[ -$j ];
543             }
544 46 100       113 $features[ -$j ] = '' if $partial_context & 1;
545 46         75 $partial_context >>= 1;
546 46         107 ++$j;
547             }
548             }
549 10         71 return @features;
550             }
551              
552             # mostly by Ovid:
553             # http://use.perl.org/use.perl.org/_Ovid/journal/36762.html
554             # Return table rows with a nice header and column separators
555             sub _make_table {
556 7     7   20 my ( $headers, $rows ) = @_;
557              
558 7         22 my @rule = qw(- +);
559 7         18 my @headers = \'| ';
560 7         25 push @headers => map { $_ => \' | ' } @$headers;
  20         56  
561 7         17 pop @headers;
562 7         17 push @headers => \' |';
563              
564 7 50 33     82 unless ('ARRAY' eq ref $rows
      33        
565             && 'ARRAY' eq ref $rows->[0]
566 7         29 && @$headers == @{ $rows->[0] }) {
567 0         0 croak(
568             "make_table() rows must be an AoA with rows being same size as headers"
569             );
570             }
571 7         45 my $table = Text::Table->new(@headers);
572 7         19310 $table->rule(@rule);
573 7         23641 $table->body_rule(@rule);
574 7         1868 $table->load(@$rows);
575              
576             return $table->rule(@rule),
577             $table->title,
578             $table->rule(@rule),
579 7         3136 map({ $table->body($_) } 0 .. @$rows),
  37         87739  
580             $table->rule(@rule);
581             }
582              
583             1;
584              
585             __END__