File Coverage

blib/lib/Algorithm/AM/Result.pm

Criterion	Covered	Total	%
statement	242	245	98.7
branch	46	50	92.0
condition	5	9	55.5
subroutine	17	17	100.0
pod	6	6	100.0
total	316	327	96.6

line	stmt	bran	cond	sub	pod	time	code
1							# encapsulate information about a single classification result
2							package Algorithm::AM::Result;
3	10			10		59	use strict;
	10					20
	10					255
4	10			10		50	use warnings;
	10					19
	10					383
5							our $VERSION = '3.11';
6							# ABSTRACT: Store results of an AM classification
7	10			10		3490	use Text::Table;
	10					145735
	10					1112
8
9							#pod =head2 SYNOPSIS
10							#pod
11							#pod use Algorithm::AM;
12							#pod
13							#pod my $am = Algorithm::AM->new('finnverb', -commas => 'no');
14							#pod my ($result) = $am->classify;
15							#pod print @{ $result->winners };
16							#pod print $result->statistical_summary;
17							#pod
18							#pod =head2 DESCRIPTION
19							#pod
20							#pod This package encapsulates all of the classification information
21							#pod generated by L, including the assigned class,
22							#pod score to each class, gang effects, analogical sets,
23							#pod and timing information. It also provides several methods for
24							#pod generating printable reports with this information.
25							#pod
26							#pod Note that the words 'score' and 'point' are used here to represent
27							#pod whatever count is assigned by analogical modeling during
28							#pod classification. This can be either pointers or occurrences. For an
29							#pod explanation of this, see L.
30							#pod
31							#pod All of the scores returned by the methods here are scalars with
32							#pod special PV and NV values. You should excercise caution when doing
33							#pod calculations with them. See L for more
34							#pod information.
35							#pod
36							#pod =cut
37
38							## TODO: variables consider exporting someday
39							## @itemcontextchain
40							## %itemcontextchainhead
41							## %context_to_class
42							## %contextsize
43							use Class::Tiny qw(
44							exclude_nulls
45							given_excluded
46							cardinality
47							test_in_train
48							test_item
49							count_method
50
51							start_time
52							end_time
53
54							training_set
55
56							scores
57							high_score
58							total_points
59							winners
60							is_tie
61							result
62
63							scores_normalized
64							), {
65							'scores_normalized' => sub {
66	1					905	my ($self) = @_;
67	1					16	my $total_points = $self->total_points;
68	1					18	my $scores = $self->scores;
69	1					6	my $normalized = {};
70	1					4	for my $class (keys %$scores){
71	2					5	$normalized->{$class} = $scores->{$class} / $total_points
72							}
73	1					8	return $normalized;
74							},
75	10			10		92	};
	10					51
	10					144
76	10			10		16532	use Carp 'croak';
	10					27
	10					494
77	10			10		3448	use Algorithm::AM::BigInt 'bigcmp';
	10					54
	10					19115
78
79							# For printing percentages in reports
80							my $percentage_format = '%.3f';
81
82							#pod =head1 REPORT METHODS
83							#pod
84							#pod The methods below return human eye-friendly reports about the
85							#pod classification. The return value is a reference, so it must be
86							#pod dereferenced for printing like so:
87							#pod
88							#pod print ${ $result->statistical_summary };
89							#pod
90							#pod =head2 C
91							#pod
92							#pod Returns a scalar (string) ref containing information about the
93							#pod configuration at the time of classification. Information from the
94							#pod following accessors is included:
95							#pod
96							#pod exclude_nulls
97							#pod given_excluded
98							#pod cardinality
99							#pod test_in_train
100							#pod test_item
101							#pod count_method
102							#pod
103							#pod =cut
104							sub config_info {
105	2			2	1	1780	my ($self) = @_;
106	2					5	my @headers = ('Option', 'Setting');
107							my @rows = (
108	2	100				4	[ "Given context", (join ' ', @{$self->test_item->features}) .
	2	100				44
		100
109							', ' . $self->test_item->comment],
110							[ "Nulls", ($self->exclude_nulls ? 'exclude' : 'include')],
111							[ "Gang", $self->count_method],
112							[ "Test item in training set", ($self->test_in_train ? 'yes' : 'no')],
113							[ "Test item excluded", ($self->given_excluded ? 'yes' : 'no')],
114							[ "Size of training set", $self->training_set->size ],
115							[ "Number of active features", $self->cardinality ],
116							);
117	2					16	my @table = _make_table(\@headers, \@rows);
118	2					458	my $info = join '', @table;
119	2					11	return \$info;
120							}
121
122							# input several variables from AM's guts (sum, pointers,
123							# itemcontextchainhead and itemcontextchain). Calculate the
124							# prediction statistics, and
125							# store information needed for computing analogical sets.
126							# Set result to tie/correct/incorrect and also is_tie if
127							# expected class is provided, and high_score, scores, winners, and
128							# total_points.
129							sub _process_stats {
130	193			193		587	my ($self, $sum, $pointers,
131							$itemcontextchainhead, $itemcontextchain, $context_to_class,
132							$gang, $active_feats, $contextsize) = @_;
133	193					387	my $total_points = $pointers->{grandtotal};
134	193					324	my $max = '';
135	193					329	my @winners;
136							my %scores;
137
138							# iterate all possible classes and store the ones that have a
139							# non-zero score. Store the high-scorers, as well.
140							# 1) find which one(s) has the highest score (the prediction) and
141							# 2) print out the ones with scores (probability of prediction)
142	193					2838	for my $class_index (1 .. $self->training_set->num_classes) {
143	559					672	my $class_score;
144							# skip classes with no score
145	559	100				1149	next unless $class_score = $sum->[$class_index];
146
147	355					4870	my $class = $self->training_set->_class_for_index($class_index);
148	355					938	$scores{$class} = $class_score;
149
150							# check if the class has the highest score, or ties for it
151	355					479	do {
152	355					892	my $cmp = bigcmp($class_score, $max);
153	355	100				891	if ($cmp > 0){
		100
154	274					739	@winners = ($class);
155	274					510	$max = $class_score;
156							}elsif($cmp == 0){
157	2					7	push @winners, $class;
158							}
159							};
160							}
161
162							# set result to tie/correct/incorrect after comparing
163							# expected/actual class labels. Only do this if the expected
164							# class label is known.
165	193	100				2846	if(my $expected = $self->test_item->class){
166	191	100	100			732	if(exists $scores{$expected} &&
167							bigcmp($scores{$expected}, $max) == 0){
168	178	100				408	if(@winners > 1){
169	2					33	$self->result('tie');
170							}else{
171	176					2550	$self->result('correct');
172							}
173							}else{
174	13					180	$self->result('incorrect');
175							}
176							}
177	193	100				1203	if(@winners > 1){
178	2					29	$self->is_tie(1);
179							}
180	193					3096	$self->high_score($max);
181	193					3406	$self->scores(\%scores);
182	193					3402	$self->winners(\@winners);
183	193					3169	$self->total_points($total_points);
184	193					1020	$self->{pointers} = $pointers;
185	193					600	$self->{itemcontextchainhead} = $itemcontextchainhead;
186	193					431	$self->{itemcontextchain} = $itemcontextchain;
187	193					395	$self->{context_to_class} = $context_to_class;
188	193					311	$self->{gang} = $gang;
189	193					401	$self->{active_feats} = $active_feats;
190	193					376	$self->{contextsize} = $contextsize;
191	193					494	return;
192							}
193
194							#pod =head2 C
195							#pod
196							#pod Returns a scalar reference (string) containing a statistical summary
197							#pod of the classification results. The summary includes all possible
198							#pod predicted classes with their scores and percentage scores and the
199							#pod total score for all classes. Whether the predicted class
200							#pod is correct/incorrect/a tie of some sort is also included, if the
201							#pod test item had a known class.
202							#pod
203							#pod =cut
204							sub statistical_summary {
205	4			4	1	1809	my ($self) = @_;
206	4					5	my %scores = %{$self->scores};
	4					85
207	4					82	my $total_points = $self->total_points;
208
209							# Make a table with information about predictions for different
210							# classes. Each row contains a class name, the score,
211							# and the percentage predicted.
212	4					16	my @rows;
213	4					15	for my $class (sort keys %scores){
214							push @rows, [ $class, $scores{$class},
215							sprintf($percentage_format,
216	8					61	100 * $scores{$class} / $total_points) ];
217							}
218							# add a Total row
219	4					11	push @rows, [ 'Total', $total_points ];
220
221	4					11	my @table = _make_table(['Class', 'Score', 'Percentage'],
222							\@rows);
223							# copy the rule from the first row into the second to last row
224							# to separate the Total row
225	4					817	splice(@table, $#table - 1, 0, $table[0]);
226
227	4					9	my $info = "Statistical Summary\n";
228	4					16	$info .= join '', @table;
229							# the predicted class (the one with the highest score)
230							# and the result (correct/incorrect/tie).
231	4	100				93	if ( defined (my $expected = $self->test_item->class) ) {
232	3					8	$info .= "Expected class: $expected\n";
233	3					21	my $result = $self->result;
234	3	100				16	if ( $result eq 'correct') {
		100
235	1					3	$info .= "Correct class predicted.\n";
236							}elsif($result eq 'tie'){
237	1					3	$info .= "Prediction is a tie.\n";
238							}else {
239	1					3	$info .= "Incorrect class predicted.\n";
240							}
241							}else{
242	1					4	$info .= "Expected class unknown\n";
243							}
244	4					21	return \$info;
245							}
246
247							#TODO: the keys for this set don't seem to make any sense.
248							sub analogical_set {
249	3			3	1	4542	my ($self) = @_;
250	3	50				14	if(!exists $self->{_analogical_set}){
251	3					13	$self->_calculate_analogical_set;
252							}
253							# make a safe copy
254	3					6	my %set = %{$self->{_analogical_set}};
	3					13
255	3					9	return \%set;
256							}
257
258							#pod =head2 C
259							#pod
260							#pod Returns a scalar reference (string) containing the analogical set,
261							#pod meaning all items that contributed to the predicted class, along
262							#pod with the amount contributed by each item (score and
263							#pod percentage overall). Items are ordered by appearance in the data
264							#pod set.
265							#pod
266							#pod =cut
267							sub analogical_set_summary {
268	1			1	1	1164	my ($self) = @_;
269	1					5	my $set = $self->analogical_set;
270	1					17	my $total_points = $self->total_points;
271
272							# Make a table for the analogical set. Each row contains an
273							# item with its class, comment, score, and the percentage
274							# of total score contributed.
275	1					4	my @rows;
276	1					6	foreach my $id (sort keys %$set){
277	4					7	my $entry = $set->{$id};
278	4					5	my $score = $entry->{score};
279							push @rows, [
280							$entry->{item}->class,
281							$entry->{item}->comment,
282	4					10	$score,
283							sprintf($percentage_format, 100 * $score / $total_points)
284							];
285							}
286	1					6	my @table = _make_table(
287							['Class', 'Item', 'Score', 'Percentage'], \@rows);
288	1					250	my $info = "Analogical Set\nTotal Frequency = $total_points\n";
289	1					5	$info .= join '', @table;
290	1					6	return \$info;
291							}
292
293							# calculate and store analogical effects in $self->{_analogical_set}
294							sub _calculate_analogical_set {
295	3			3		6	my ($self) = @_;
296	3					73	my $train = $self->training_set;
297	3					15	my %set;
298	3					5	foreach my $context ( keys %{$self->{pointers}} ) {
	3					12
299							next unless
300	11	100				23	exists $self->{itemcontextchainhead}->{$context};
301	8					20	for (
302							my $index = $self->{itemcontextchainhead}->{$context};
303							defined $index;
304							$index = $self->{itemcontextchain}->[$index]
305							)
306							{
307	10					22	my $item = $train->get_item($index);
308							$set{$item->id} = {
309							item => $item,
310	10					33	score => $self->{pointers}->{$context}
311							};
312							}
313							}
314	3					9	$self->{_analogical_set} = \%set;
315	3					6	return;
316							}
317
318							sub gang_effects {
319	4			4	1	627	my ($self) = @_;
320	4	100				16	if(!$self->{_gang_effects}){
321	3					11	$self->_calculate_gangs;
322							}
323	4					11	return $self->{_gang_effects};
324							}
325
326							#pod =head2 C
327							#pod
328							#pod Returns a scalar reference (string) containing the gang effects on the
329							#pod final class prediction.
330							#pod
331							#pod A single boolean parameter can be provided to turn on list printing,
332							#pod meaning gang items items are printed. This is false (off) by default.
333							#pod
334							#pod =cut
335							sub gang_summary {
336	3			3	1	2105	my ($self, $print_list) = @_;
337	3					71	my $test_item = $self->test_item;
338
339	3					19	my $gangs = $self->gang_effects;
340
341							# Make a table for the gangs with these rows:
342							# Percentage
343							# Score
344							# Num
345							# Class
346							# Features
347							# item comment
348	3					5	my @rows;
349							# first row is a header with test item for easy reference
350							push @rows, [
351							'Context',
352							undef,
353							undef,
354							undef,
355	3					6	@{$test_item->features},
	3					10
356							];
357
358							# store the number of rows added for each gang
359							# will help with printing later
360	3					5	my @gang_rows;
361	3					5	my $current_row = -1;
362							# add information for each gang; sort by order of highest to
363							# lowest effect
364	3					7	foreach my $gang (@$gangs){
365	10					12	$current_row++;
366	10					14	$gang_rows[$current_row]++;
367	10					15	my $features = $gang->{features};
368							# add the gang supracontext, effect and score
369							push @rows, [
370							sprintf($percentage_format, 100 * $gang->{effect}),
371							$gang->{score},
372							undef,
373							undef,
374							# print undefined feature slots as asterisks
375	10	100				61	map {length($_) ? $_ : '*'} @$features
	58					110
376							];
377							# add each class in the gang, along with the total number
378							# and effect of the gang items supporting it
379	10					19	for my $class (sort keys %{ $gang->{class} }){
	10					30
380	12					16	$gang_rows[$current_row]++;
381							push @rows, [
382							sprintf($percentage_format,
383							100 * $gang->{class}->{$class}->{effect}),
384							$gang->{class}->{$class}->{score},
385	12					42	scalar @{ $gang->{data}->{$class} },
	12					33
386							$class,
387							undef
388							];
389	12	100				26	if($print_list){
390							# add the list of items in the given context
391	4					6	for my $item (@{ $gang->{data}->{$class} }){
	4					7
392	4					5	$gang_rows[$current_row]++;
393							push @rows, [
394							undef,
395							undef,
396							undef,
397							undef,
398	4					6	@{ $item->features },
	4					8
399							$item->comment,
400							];
401							}
402							}
403							}
404							}
405
406							# construct the table from the rows
407							my @headers = (
408							\'\| ',
409							'Percentage' => \' \| ',
410							'Score' => \' \| ',
411							'Num Items' => \' \| ',
412							'Class' => \' \| ',
413	3					8	('' => \' ') x @{$test_item->features}
	3					7
414							);
415	3					6	pop @headers;
416	3	100				8	if($print_list){
417	1					3	push @headers, \' \| ', 'Item Comment';
418							}
419	3					3	push @headers, \' \|';
420	3					5	my @rule = qw(- +);
421	3					17	my $table = Text::Table->new(@headers);
422	3					14139	$table->load(@rows);
423							# main header
424	3					2636	$current_row = 0;
425	3					8	my $return = $table->rule(@rule) .
426							$table->title .
427							$table->body($current_row) .
428							$table->rule(@rule);
429	3					140404	$current_row++;
430							# add info with a header for each gang
431	3					11	for my $num (@gang_rows){
432							# a row of '*' separates each gang
433	10					23	$return .= $table->rule('','') .
434							$table->body($current_row) .
435							$table->rule(@rule);
436	10					4706	$current_row++;
437	10					48	for(1 .. $num - 1){
438	16					34	$return .= $table->body($current_row);
439	16					1042	$current_row++;
440							}
441							}
442	3					11	$return .= $table->rule(@rule);
443	3					749	return \$return;
444							}
445
446							sub _calculate_gangs {
447	3			3		6	my ($self) = @_;
448	3					85	my $train = $self->training_set;
449	3					55	my $total_points = $self->total_points;
450	3					18	my $raw_gang = $self->{gang};
451	3					6	my @gangs;
452
453	3					5	foreach my $context (keys %{$raw_gang})
	3					14
454							{
455	10					15	my $gang = {};
456	10					24	my @features = $self->_unpack_supracontext($context);
457							# for now, store gangs by the supracontext printout
458	10	100				23	my $key = join ' ', map {length($_) ? $_ : '-'} @features;
	58					129
459	10					29	$gang->{score} = $raw_gang->{$context};
460	10					26	$gang->{effect} = $raw_gang->{$context} / $total_points;
461	10					15	$gang->{features} = \@features;
462
463	10					20	my $p = $self->{pointers}->{$context};
464							# if the supracontext is homogenous
465	10	100				24	if ( my $class_index = $self->{context_to_class}->{$context} ) {
466							# store a 'homogenous' key that indicates this, besides
467							# indicating the unanimous class prediction.
468	8					25	my $class = $train->_class_for_index($class_index);
469	8					13	$gang->{homogenous} = $class;
470	8					13	my @data;
471	8					23	for (
472							my $index = $self->{itemcontextchainhead}->{$context};
473							defined $index;
474							$index = $self->{itemcontextchain}->[$index]
475							)
476							{
477	9					24	push @data, $train->get_item($index);
478							}
479	8					19	$gang->{data}->{$class} = \@data;
480	8					15	$gang->{size} = scalar @data;
481	8					19	$gang->{class}->{$class}->{score} = $p;
482							$gang->{class}->{$class}->{effect} =
483	8					15	$gang->{effect};
484							}
485							# for heterogenous supracontexts we have to store data for
486							# each class
487							else {
488	2					5	$gang->{homogenous} = 0;
489							# first loop through the data and sort by class, also
490							# finding the total gang size
491	2					4	my $size = 0;
492	2					2	my %data;
493	2					9	for (
494							my $index = $self->{itemcontextchainhead}->{$context};
495							defined $index;
496							$index = $self->{itemcontextchain}->[$index]
497							)
498							{
499	4					13	my $item = $train->get_item($index);
500	4					6	push @{ $data{$item->class} }, $item;
	4					8
501	4					13	$size++;
502							}
503	2					5	$gang->{data} = \%data;
504	2					3	$gang->{size} = $size;
505
506							# then store aggregate statistics for each class
507	2					7	for my $class (keys %data){
508	4					9	$gang->{class}->{$class}->{score} = $p;
509							$gang->{class}->{$class}->{effect} =
510							# score*num_data/total
511	4					6	@{ $data{$class} } * $p / $total_points;
	4					12
512							}
513							}
514	10					20	push @gangs, $gang;
515							}
516
517							# sort by score and then alphabetically by class labels
518							@gangs = sort{
519	3					17	bigcmp($b->{score}, $a->{score}) \|\|
520	0					0	(join '', sort keys %{ $b->{class} })
521							cmp
522	11	50				29	(join '', sort keys %{ $a->{class} })} @gangs;
	0					0
523	3					9	$self->{_gang_effects} = \@gangs;
524	3					6	return;
525							}
526
527							# Unpack and return the supracontext features.
528							# Blank entries mean the variable may be anything, e.g.
529							# ('a' 'b' '') means a supracontext containing items
530							# wich have ('a' 'b' whatever) as variable values.
531							sub _unpack_supracontext {
532	10			10		22	my ($self, $context) = @_;
533	10					36	my @context_list = unpack "S!4", $context;
534	10					16	my @alist = @{$self->{active_feats}};
	10					23
535	10					13	my (@features) = @{ $self->test_item->features };
	10					183
536	10					146	my $exclude_nulls = $self->exclude_nulls;
537	10					44	my $j = 1;
538	10					21	foreach my $a (reverse @alist) {
539	40					51	my $partial_context = pop @context_list;
540	40					868	for ( ; $a ; --$a ) {
541	46	50				64	if($exclude_nulls){
542	46					83	++$j while !defined $features[ -$j ];
543							}
544	46	100				79	$features[ -$j ] = '' if $partial_context & 1;
545	46					71	$partial_context >>= 1;
546	46					73	++$j;
547							}
548							}
549	10					39	return @features;
550							}
551
552							# mostly by Ovid:
553							# http://use.perl.org/use.perl.org/_Ovid/journal/36762.html
554							# Return table rows with a nice header and column separators
555							sub _make_table {
556	7			7		12	my ( $headers, $rows ) = @_;
557
558	7					17	my @rule = qw(- +);
559	7					9	my @headers = \'\| ';
560	7					15	push @headers => map { $_ => \' \| ' } @$headers;
	20					36
561	7					11	pop @headers;
562	7					9	push @headers => \' \|';
563
564	7	50	33			36	unless ('ARRAY' eq ref $rows
			33
565							&& 'ARRAY' eq ref $rows->[0]
566	7					17	&& @$headers == @{ $rows->[0] }) {
567	0					0	croak(
568							"make_table() rows must be an AoA with rows being same size as headers"
569							);
570							}
571	7					23	my $table = Text::Table->new(@headers);
572	7					11763	$table->rule(@rule);
573	7					14279	$table->body_rule(@rule);
574	7					1124	$table->load(@$rows);
575
576							return $table->rule(@rule),
577							$table->title,
578							$table->rule(@rule),
579	7					1907	map({ $table->body($_) } 0 .. @$rows),
	37					53114
580							$table->rule(@rule);
581							}
582
583							1;
584
585							__END__