File Coverage

blib/lib/Algorithm/AM/Result.pm
Criterion Covered Total %
statement 242 245 98.7
branch 46 50 92.0
condition 5 9 55.5
subroutine 17 17 100.0
pod 6 6 100.0
total 316 327 96.6


line stmt bran cond sub pod time code
1             # encapsulate information about a single classification result
2             package Algorithm::AM::Result;
3 10     10   59 use strict;
  10         20  
  10         255  
4 10     10   50 use warnings;
  10         19  
  10         383  
5             our $VERSION = '3.11';
6             # ABSTRACT: Store results of an AM classification
7 10     10   3490 use Text::Table;
  10         145735  
  10         1112  
8              
9             #pod =head2 SYNOPSIS
10             #pod
11             #pod use Algorithm::AM;
12             #pod
13             #pod my $am = Algorithm::AM->new('finnverb', -commas => 'no');
14             #pod my ($result) = $am->classify;
15             #pod print @{ $result->winners };
16             #pod print $result->statistical_summary;
17             #pod
18             #pod =head2 DESCRIPTION
19             #pod
20             #pod This package encapsulates all of the classification information
21             #pod generated by L, including the assigned class,
22             #pod score to each class, gang effects, analogical sets,
23             #pod and timing information. It also provides several methods for
24             #pod generating printable reports with this information.
25             #pod
26             #pod Note that the words 'score' and 'point' are used here to represent
27             #pod whatever count is assigned by analogical modeling during
28             #pod classification. This can be either pointers or occurrences. For an
29             #pod explanation of this, see L.
30             #pod
31             #pod All of the scores returned by the methods here are scalars with
32             #pod special PV and NV values. You should excercise caution when doing
33             #pod calculations with them. See L for more
34             #pod information.
35             #pod
36             #pod =cut
37              
38             ## TODO: variables consider exporting someday
39             ## @itemcontextchain
40             ## %itemcontextchainhead
41             ## %context_to_class
42             ## %contextsize
43             use Class::Tiny qw(
44             exclude_nulls
45             given_excluded
46             cardinality
47             test_in_train
48             test_item
49             count_method
50              
51             start_time
52             end_time
53              
54             training_set
55              
56             scores
57             high_score
58             total_points
59             winners
60             is_tie
61             result
62              
63             scores_normalized
64             ), {
65             'scores_normalized' => sub {
66 1         905 my ($self) = @_;
67 1         16 my $total_points = $self->total_points;
68 1         18 my $scores = $self->scores;
69 1         6 my $normalized = {};
70 1         4 for my $class (keys %$scores){
71 2         5 $normalized->{$class} = $scores->{$class} / $total_points
72             }
73 1         8 return $normalized;
74             },
75 10     10   92 };
  10         51  
  10         144  
76 10     10   16532 use Carp 'croak';
  10         27  
  10         494  
77 10     10   3448 use Algorithm::AM::BigInt 'bigcmp';
  10         54  
  10         19115  
78              
79             # For printing percentages in reports
80             my $percentage_format = '%.3f';
81              
82             #pod =head1 REPORT METHODS
83             #pod
84             #pod The methods below return human eye-friendly reports about the
85             #pod classification. The return value is a reference, so it must be
86             #pod dereferenced for printing like so:
87             #pod
88             #pod print ${ $result->statistical_summary };
89             #pod
90             #pod =head2 C
91             #pod
92             #pod Returns a scalar (string) ref containing information about the
93             #pod configuration at the time of classification. Information from the
94             #pod following accessors is included:
95             #pod
96             #pod exclude_nulls
97             #pod given_excluded
98             #pod cardinality
99             #pod test_in_train
100             #pod test_item
101             #pod count_method
102             #pod
103             #pod =cut
104             sub config_info {
105 2     2 1 1780 my ($self) = @_;
106 2         5 my @headers = ('Option', 'Setting');
107             my @rows = (
108 2 100       4 [ "Given context", (join ' ', @{$self->test_item->features}) .
  2 100       44  
    100          
109             ', ' . $self->test_item->comment],
110             [ "Nulls", ($self->exclude_nulls ? 'exclude' : 'include')],
111             [ "Gang", $self->count_method],
112             [ "Test item in training set", ($self->test_in_train ? 'yes' : 'no')],
113             [ "Test item excluded", ($self->given_excluded ? 'yes' : 'no')],
114             [ "Size of training set", $self->training_set->size ],
115             [ "Number of active features", $self->cardinality ],
116             );
117 2         16 my @table = _make_table(\@headers, \@rows);
118 2         458 my $info = join '', @table;
119 2         11 return \$info;
120             }
121              
122             # input several variables from AM's guts (sum, pointers,
123             # itemcontextchainhead and itemcontextchain). Calculate the
124             # prediction statistics, and
125             # store information needed for computing analogical sets.
126             # Set result to tie/correct/incorrect and also is_tie if
127             # expected class is provided, and high_score, scores, winners, and
128             # total_points.
129             sub _process_stats {
130 193     193   587 my ($self, $sum, $pointers,
131             $itemcontextchainhead, $itemcontextchain, $context_to_class,
132             $gang, $active_feats, $contextsize) = @_;
133 193         387 my $total_points = $pointers->{grandtotal};
134 193         324 my $max = '';
135 193         329 my @winners;
136             my %scores;
137              
138             # iterate all possible classes and store the ones that have a
139             # non-zero score. Store the high-scorers, as well.
140             # 1) find which one(s) has the highest score (the prediction) and
141             # 2) print out the ones with scores (probability of prediction)
142 193         2838 for my $class_index (1 .. $self->training_set->num_classes) {
143 559         672 my $class_score;
144             # skip classes with no score
145 559 100       1149 next unless $class_score = $sum->[$class_index];
146              
147 355         4870 my $class = $self->training_set->_class_for_index($class_index);
148 355         938 $scores{$class} = $class_score;
149              
150             # check if the class has the highest score, or ties for it
151 355         479 do {
152 355         892 my $cmp = bigcmp($class_score, $max);
153 355 100       891 if ($cmp > 0){
    100          
154 274         739 @winners = ($class);
155 274         510 $max = $class_score;
156             }elsif($cmp == 0){
157 2         7 push @winners, $class;
158             }
159             };
160             }
161              
162             # set result to tie/correct/incorrect after comparing
163             # expected/actual class labels. Only do this if the expected
164             # class label is known.
165 193 100       2846 if(my $expected = $self->test_item->class){
166 191 100 100     732 if(exists $scores{$expected} &&
167             bigcmp($scores{$expected}, $max) == 0){
168 178 100       408 if(@winners > 1){
169 2         33 $self->result('tie');
170             }else{
171 176         2550 $self->result('correct');
172             }
173             }else{
174 13         180 $self->result('incorrect');
175             }
176             }
177 193 100       1203 if(@winners > 1){
178 2         29 $self->is_tie(1);
179             }
180 193         3096 $self->high_score($max);
181 193         3406 $self->scores(\%scores);
182 193         3402 $self->winners(\@winners);
183 193         3169 $self->total_points($total_points);
184 193         1020 $self->{pointers} = $pointers;
185 193         600 $self->{itemcontextchainhead} = $itemcontextchainhead;
186 193         431 $self->{itemcontextchain} = $itemcontextchain;
187 193         395 $self->{context_to_class} = $context_to_class;
188 193         311 $self->{gang} = $gang;
189 193         401 $self->{active_feats} = $active_feats;
190 193         376 $self->{contextsize} = $contextsize;
191 193         494 return;
192             }
193              
194             #pod =head2 C
195             #pod
196             #pod Returns a scalar reference (string) containing a statistical summary
197             #pod of the classification results. The summary includes all possible
198             #pod predicted classes with their scores and percentage scores and the
199             #pod total score for all classes. Whether the predicted class
200             #pod is correct/incorrect/a tie of some sort is also included, if the
201             #pod test item had a known class.
202             #pod
203             #pod =cut
204             sub statistical_summary {
205 4     4 1 1809 my ($self) = @_;
206 4         5 my %scores = %{$self->scores};
  4         85  
207 4         82 my $total_points = $self->total_points;
208              
209             # Make a table with information about predictions for different
210             # classes. Each row contains a class name, the score,
211             # and the percentage predicted.
212 4         16 my @rows;
213 4         15 for my $class (sort keys %scores){
214             push @rows, [ $class, $scores{$class},
215             sprintf($percentage_format,
216 8         61 100 * $scores{$class} / $total_points) ];
217             }
218             # add a Total row
219 4         11 push @rows, [ 'Total', $total_points ];
220              
221 4         11 my @table = _make_table(['Class', 'Score', 'Percentage'],
222             \@rows);
223             # copy the rule from the first row into the second to last row
224             # to separate the Total row
225 4         817 splice(@table, $#table - 1, 0, $table[0]);
226              
227 4         9 my $info = "Statistical Summary\n";
228 4         16 $info .= join '', @table;
229             # the predicted class (the one with the highest score)
230             # and the result (correct/incorrect/tie).
231 4 100       93 if ( defined (my $expected = $self->test_item->class) ) {
232 3         8 $info .= "Expected class: $expected\n";
233 3         21 my $result = $self->result;
234 3 100       16 if ( $result eq 'correct') {
    100          
235 1         3 $info .= "Correct class predicted.\n";
236             }elsif($result eq 'tie'){
237 1         3 $info .= "Prediction is a tie.\n";
238             }else {
239 1         3 $info .= "Incorrect class predicted.\n";
240             }
241             }else{
242 1         4 $info .= "Expected class unknown\n";
243             }
244 4         21 return \$info;
245             }
246              
247             #TODO: the keys for this set don't seem to make any sense.
248             sub analogical_set {
249 3     3 1 4542 my ($self) = @_;
250 3 50       14 if(!exists $self->{_analogical_set}){
251 3         13 $self->_calculate_analogical_set;
252             }
253             # make a safe copy
254 3         6 my %set = %{$self->{_analogical_set}};
  3         13  
255 3         9 return \%set;
256             }
257              
258             #pod =head2 C
259             #pod
260             #pod Returns a scalar reference (string) containing the analogical set,
261             #pod meaning all items that contributed to the predicted class, along
262             #pod with the amount contributed by each item (score and
263             #pod percentage overall). Items are ordered by appearance in the data
264             #pod set.
265             #pod
266             #pod =cut
267             sub analogical_set_summary {
268 1     1 1 1164 my ($self) = @_;
269 1         5 my $set = $self->analogical_set;
270 1         17 my $total_points = $self->total_points;
271              
272             # Make a table for the analogical set. Each row contains an
273             # item with its class, comment, score, and the percentage
274             # of total score contributed.
275 1         4 my @rows;
276 1         6 foreach my $id (sort keys %$set){
277 4         7 my $entry = $set->{$id};
278 4         5 my $score = $entry->{score};
279             push @rows, [
280             $entry->{item}->class,
281             $entry->{item}->comment,
282 4         10 $score,
283             sprintf($percentage_format, 100 * $score / $total_points)
284             ];
285             }
286 1         6 my @table = _make_table(
287             ['Class', 'Item', 'Score', 'Percentage'], \@rows);
288 1         250 my $info = "Analogical Set\nTotal Frequency = $total_points\n";
289 1         5 $info .= join '', @table;
290 1         6 return \$info;
291             }
292              
293             # calculate and store analogical effects in $self->{_analogical_set}
294             sub _calculate_analogical_set {
295 3     3   6 my ($self) = @_;
296 3         73 my $train = $self->training_set;
297 3         15 my %set;
298 3         5 foreach my $context ( keys %{$self->{pointers}} ) {
  3         12  
299             next unless
300 11 100       23 exists $self->{itemcontextchainhead}->{$context};
301 8         20 for (
302             my $index = $self->{itemcontextchainhead}->{$context};
303             defined $index;
304             $index = $self->{itemcontextchain}->[$index]
305             )
306             {
307 10         22 my $item = $train->get_item($index);
308             $set{$item->id} = {
309             item => $item,
310 10         33 score => $self->{pointers}->{$context}
311             };
312             }
313             }
314 3         9 $self->{_analogical_set} = \%set;
315 3         6 return;
316             }
317              
318             sub gang_effects {
319 4     4 1 627 my ($self) = @_;
320 4 100       16 if(!$self->{_gang_effects}){
321 3         11 $self->_calculate_gangs;
322             }
323 4         11 return $self->{_gang_effects};
324             }
325              
326             #pod =head2 C
327             #pod
328             #pod Returns a scalar reference (string) containing the gang effects on the
329             #pod final class prediction.
330             #pod
331             #pod A single boolean parameter can be provided to turn on list printing,
332             #pod meaning gang items items are printed. This is false (off) by default.
333             #pod
334             #pod =cut
335             sub gang_summary {
336 3     3 1 2105 my ($self, $print_list) = @_;
337 3         71 my $test_item = $self->test_item;
338              
339 3         19 my $gangs = $self->gang_effects;
340              
341             # Make a table for the gangs with these rows:
342             # Percentage
343             # Score
344             # Num
345             # Class
346             # Features
347             # item comment
348 3         5 my @rows;
349             # first row is a header with test item for easy reference
350             push @rows, [
351             'Context',
352             undef,
353             undef,
354             undef,
355 3         6 @{$test_item->features},
  3         10  
356             ];
357              
358             # store the number of rows added for each gang
359             # will help with printing later
360 3         5 my @gang_rows;
361 3         5 my $current_row = -1;
362             # add information for each gang; sort by order of highest to
363             # lowest effect
364 3         7 foreach my $gang (@$gangs){
365 10         12 $current_row++;
366 10         14 $gang_rows[$current_row]++;
367 10         15 my $features = $gang->{features};
368             # add the gang supracontext, effect and score
369             push @rows, [
370             sprintf($percentage_format, 100 * $gang->{effect}),
371             $gang->{score},
372             undef,
373             undef,
374             # print undefined feature slots as asterisks
375 10 100       61 map {length($_) ? $_ : '*'} @$features
  58         110  
376             ];
377             # add each class in the gang, along with the total number
378             # and effect of the gang items supporting it
379 10         19 for my $class (sort keys %{ $gang->{class} }){
  10         30  
380 12         16 $gang_rows[$current_row]++;
381             push @rows, [
382             sprintf($percentage_format,
383             100 * $gang->{class}->{$class}->{effect}),
384             $gang->{class}->{$class}->{score},
385 12         42 scalar @{ $gang->{data}->{$class} },
  12         33  
386             $class,
387             undef
388             ];
389 12 100       26 if($print_list){
390             # add the list of items in the given context
391 4         6 for my $item (@{ $gang->{data}->{$class} }){
  4         7  
392 4         5 $gang_rows[$current_row]++;
393             push @rows, [
394             undef,
395             undef,
396             undef,
397             undef,
398 4         6 @{ $item->features },
  4         8  
399             $item->comment,
400             ];
401             }
402             }
403             }
404             }
405              
406             # construct the table from the rows
407             my @headers = (
408             \'| ',
409             'Percentage' => \' | ',
410             'Score' => \' | ',
411             'Num Items' => \' | ',
412             'Class' => \' | ',
413 3         8 ('' => \' ') x @{$test_item->features}
  3         7  
414             );
415 3         6 pop @headers;
416 3 100       8 if($print_list){
417 1         3 push @headers, \' | ', 'Item Comment';
418             }
419 3         3 push @headers, \' |';
420 3         5 my @rule = qw(- +);
421 3         17 my $table = Text::Table->new(@headers);
422 3         14139 $table->load(@rows);
423             # main header
424 3         2636 $current_row = 0;
425 3         8 my $return = $table->rule(@rule) .
426             $table->title .
427             $table->body($current_row) .
428             $table->rule(@rule);
429 3         140404 $current_row++;
430             # add info with a header for each gang
431 3         11 for my $num (@gang_rows){
432             # a row of '*' separates each gang
433 10         23 $return .= $table->rule('*','*') .
434             $table->body($current_row) .
435             $table->rule(@rule);
436 10         4706 $current_row++;
437 10         48 for(1 .. $num - 1){
438 16         34 $return .= $table->body($current_row);
439 16         1042 $current_row++;
440             }
441             }
442 3         11 $return .= $table->rule(@rule);
443 3         749 return \$return;
444             }
445              
446             sub _calculate_gangs {
447 3     3   6 my ($self) = @_;
448 3         85 my $train = $self->training_set;
449 3         55 my $total_points = $self->total_points;
450 3         18 my $raw_gang = $self->{gang};
451 3         6 my @gangs;
452              
453 3         5 foreach my $context (keys %{$raw_gang})
  3         14  
454             {
455 10         15 my $gang = {};
456 10         24 my @features = $self->_unpack_supracontext($context);
457             # for now, store gangs by the supracontext printout
458 10 100       23 my $key = join ' ', map {length($_) ? $_ : '-'} @features;
  58         129  
459 10         29 $gang->{score} = $raw_gang->{$context};
460 10         26 $gang->{effect} = $raw_gang->{$context} / $total_points;
461 10         15 $gang->{features} = \@features;
462              
463 10         20 my $p = $self->{pointers}->{$context};
464             # if the supracontext is homogenous
465 10 100       24 if ( my $class_index = $self->{context_to_class}->{$context} ) {
466             # store a 'homogenous' key that indicates this, besides
467             # indicating the unanimous class prediction.
468 8         25 my $class = $train->_class_for_index($class_index);
469 8         13 $gang->{homogenous} = $class;
470 8         13 my @data;
471 8         23 for (
472             my $index = $self->{itemcontextchainhead}->{$context};
473             defined $index;
474             $index = $self->{itemcontextchain}->[$index]
475             )
476             {
477 9         24 push @data, $train->get_item($index);
478             }
479 8         19 $gang->{data}->{$class} = \@data;
480 8         15 $gang->{size} = scalar @data;
481 8         19 $gang->{class}->{$class}->{score} = $p;
482             $gang->{class}->{$class}->{effect} =
483 8         15 $gang->{effect};
484             }
485             # for heterogenous supracontexts we have to store data for
486             # each class
487             else {
488 2         5 $gang->{homogenous} = 0;
489             # first loop through the data and sort by class, also
490             # finding the total gang size
491 2         4 my $size = 0;
492 2         2 my %data;
493 2         9 for (
494             my $index = $self->{itemcontextchainhead}->{$context};
495             defined $index;
496             $index = $self->{itemcontextchain}->[$index]
497             )
498             {
499 4         13 my $item = $train->get_item($index);
500 4         6 push @{ $data{$item->class} }, $item;
  4         8  
501 4         13 $size++;
502             }
503 2         5 $gang->{data} = \%data;
504 2         3 $gang->{size} = $size;
505              
506             # then store aggregate statistics for each class
507 2         7 for my $class (keys %data){
508 4         9 $gang->{class}->{$class}->{score} = $p;
509             $gang->{class}->{$class}->{effect} =
510             # score*num_data/total
511 4         6 @{ $data{$class} } * $p / $total_points;
  4         12  
512             }
513             }
514 10         20 push @gangs, $gang;
515             }
516              
517             # sort by score and then alphabetically by class labels
518             @gangs = sort{
519 3         17 bigcmp($b->{score}, $a->{score}) ||
520 0         0 (join '', sort keys %{ $b->{class} })
521             cmp
522 11 50       29 (join '', sort keys %{ $a->{class} })} @gangs;
  0         0  
523 3         9 $self->{_gang_effects} = \@gangs;
524 3         6 return;
525             }
526              
527             # Unpack and return the supracontext features.
528             # Blank entries mean the variable may be anything, e.g.
529             # ('a' 'b' '') means a supracontext containing items
530             # wich have ('a' 'b' whatever) as variable values.
531             sub _unpack_supracontext {
532 10     10   22 my ($self, $context) = @_;
533 10         36 my @context_list = unpack "S!4", $context;
534 10         16 my @alist = @{$self->{active_feats}};
  10         23  
535 10         13 my (@features) = @{ $self->test_item->features };
  10         183  
536 10         146 my $exclude_nulls = $self->exclude_nulls;
537 10         44 my $j = 1;
538 10         21 foreach my $a (reverse @alist) {
539 40         51 my $partial_context = pop @context_list;
540 40         868 for ( ; $a ; --$a ) {
541 46 50       64 if($exclude_nulls){
542 46         83 ++$j while !defined $features[ -$j ];
543             }
544 46 100       79 $features[ -$j ] = '' if $partial_context & 1;
545 46         71 $partial_context >>= 1;
546 46         73 ++$j;
547             }
548             }
549 10         39 return @features;
550             }
551              
552             # mostly by Ovid:
553             # http://use.perl.org/use.perl.org/_Ovid/journal/36762.html
554             # Return table rows with a nice header and column separators
555             sub _make_table {
556 7     7   12 my ( $headers, $rows ) = @_;
557              
558 7         17 my @rule = qw(- +);
559 7         9 my @headers = \'| ';
560 7         15 push @headers => map { $_ => \' | ' } @$headers;
  20         36  
561 7         11 pop @headers;
562 7         9 push @headers => \' |';
563              
564 7 50 33     36 unless ('ARRAY' eq ref $rows
      33        
565             && 'ARRAY' eq ref $rows->[0]
566 7         17 && @$headers == @{ $rows->[0] }) {
567 0         0 croak(
568             "make_table() rows must be an AoA with rows being same size as headers"
569             );
570             }
571 7         23 my $table = Text::Table->new(@headers);
572 7         11763 $table->rule(@rule);
573 7         14279 $table->body_rule(@rule);
574 7         1124 $table->load(@$rows);
575              
576             return $table->rule(@rule),
577             $table->title,
578             $table->rule(@rule),
579 7         1907 map({ $table->body($_) } 0 .. @$rows),
  37         53114  
580             $table->rule(@rule);
581             }
582              
583             1;
584              
585             __END__