File Coverage

blib/lib/AI/Categorizer/KnowledgeSet.pm
Criterion    Covered   Total      %
statement         98     220   44.5
branch            30      76   39.4
condition          0       9    0.0
subroutine        21      38   55.2
pod               18      23   78.2
total            167     366   45.6


line stmt bran cond sub pod time code
1             package AI::Categorizer::KnowledgeSet;
2              
3 11     11   77 use strict;
  11         21  
  11         388  
4 11     11   52 use Class::Container;
  11         17  
  11         183  
5 11     11   51 use AI::Categorizer::Storable;
  11         18  
  11         238  
6 11     11   45 use base qw(Class::Container AI::Categorizer::Storable);
  11         19  
  11         972  
7              
8 11     11   50 use Params::Validate qw(:types);
  11         19  
  11         1547  
9 11     11   52 use AI::Categorizer::ObjectSet;
  11         17  
  11         215  
10 11     11   46 use AI::Categorizer::Document;
  11         21  
  11         249  
11 11     11   48 use AI::Categorizer::Category;
  11         22  
  11         285  
12 11     11   50 use AI::Categorizer::FeatureVector;
  11         23  
  11         199  
13 11     11   5363 use AI::Categorizer::Util;
  11         26  
  11         530  
14 11     11   59 use Carp qw(croak);
  11         23  
  11         32497  
15              
16             __PACKAGE__->valid_params
17             (
18             categories => {
19             type => ARRAYREF,
20             default => [],
21             callbacks => { 'all are Category objects' =>
22             sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'),
23             @{$_[0]} },
24             },
25             },
26             documents => {
27             type => ARRAYREF,
28             default => [],
29             callbacks => { 'all are Document objects' =>
30             sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'),
31             @{$_[0]} },
32             },
33             },
34             scan_first => {
35             type => BOOLEAN,
36             default => 1,
37             },
38             feature_selector => {
39             isa => 'AI::Categorizer::FeatureSelector',
40             },
41             tfidf_weighting => {
42             type => SCALAR,
43             optional => 1,
44             },
45             term_weighting => {
46             type => SCALAR,
47             default => 'x',
48             },
49             collection_weighting => {
50             type => SCALAR,
51             default => 'x',
52             },
53             normalize_weighting => {
54             type => SCALAR,
55             default => 'x',
56             },
57             verbose => {
58             type => SCALAR,
59             default => 0,
60             },
61             );
62              
63             __PACKAGE__->contained_objects
64             (
65             document => { delayed => 1,
66             class => 'AI::Categorizer::Document' },
67             category => { delayed => 1,
68             class => 'AI::Categorizer::Category' },
69             collection => { delayed => 1,
70             class => 'AI::Categorizer::Collection::Files' },
71             features => { delayed => 1,
72             class => 'AI::Categorizer::FeatureVector' },
73             feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
74             );
75              
76             sub new {
77 12     12 1 6934 my ($pkg, %args) = @_;
78            
79             # Shortcuts
80 12 100       52 if ($args{tfidf_weighting}) {
81 1         5 @args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
82 1         4 delete $args{tfidf_weighting};
83             }
84              
85 12         133 my $self = $pkg->SUPER::new(%args);
86              
87             # Convert to AI::Categorizer::ObjectSet sets
88 12         8460 $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
  12         94  
89 12         24 $self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
  12         47  
90              
91 12 50       58 if ($self->{load}) {
92 0 0       0 my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
93 0         0 $self->load(%$args);
94 0         0 delete $self->{load};
95             }
96 12         56 return $self;
97             }
98              
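A minimal construction sketch based on the parameters declared above; the three-letter
tfidf_weighting shortcut is split character by character into the term, collection, and
normalize settings, so 'nfc' below is equivalent to passing the three individual options.

  use AI::Categorizer::KnowledgeSet;

  # 'nfc' == term_weighting => 'n', collection_weighting => 'f',
  #          normalize_weighting => 'c'
  my $ks = AI::Categorizer::KnowledgeSet->new(
    tfidf_weighting => 'nfc',
    verbose         => 1,
  );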
99             sub features {
100 19     19 1 37 my $self = shift;
101              
102 19 100       41 if (@_) {
103 1         2 $self->{features} = shift;
104 1 50       5 $self->trim_doc_features if $self->{features};
105             }
106 19 100       88 return $self->{features} if $self->{features};
107              
108             # Create a feature vector encompassing the whole set of documents
109 3         12 my $v = $self->create_delayed_object('features');
110 3         10 foreach my $document ($self->documents) {
111 12         35 $v->add( $document->features );
112             }
113 3         15 return $self->{features} = $v;
114             }
115              
116             sub categories {
117 24     24 1 43 my $c = $_[0]->{categories};
118 24 50       97 return wantarray ? $c->members : $c->size;
119             }
120              
121             sub documents {
122 35     35 1 67 my $d = $_[0]->{documents};
123 35 100       127 return wantarray ? $d->members : $d->size;
124             }
125              
126             sub document {
127 7     7 1 14 my ($self, $name) = @_;
128 7         24 return $self->{documents}->retrieve($name);
129             }
130              
131 0     0 0 0 sub feature_selector { $_[0]->{feature_selector} }
132 0     0 0 0 sub scan_first { $_[0]->{scan_first} }
133              
134             sub verbose {
135 0     0 1 0 my $self = shift;
136 0 0       0 $self->{verbose} = shift if @_;
137 0         0 return $self->{verbose};
138             }
139              
140             sub trim_doc_features {
141 0     0 0 0 my ($self) = @_;
142            
143 0         0 foreach my $doc ($self->documents) {
144 0         0 $doc->features( $doc->features->intersection($self->features) );
145             }
146             }
147              
148              
149             sub prog_bar {
150 0     0 0 0 my ($self, $collection) = @_;
151              
152 0 0   0   0 return sub {} unless $self->verbose;
  0         0  
153 0 0   0   0 return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
  0         0  
154              
155 0 0       0 my $count = $collection->can('count_documents') ? $collection->count_documents : 0;
156            
157 0         0 my $pb = 'Time::Progress'->new;
158 0         0 $pb->attr(max => $count);
159 0         0 my $i = 0;
160             return sub {
161 0     0   0 $i++;
162 0 0       0 return if $i % 25;
163 0         0 print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
164 0         0 };
165             }
166              
167             # A little utility method for several other methods like scan_stats(),
168             # load(), read(), etc.
169             sub _make_collection {
170 0     0   0 my ($self, $args) = @_;
171 0   0     0 return $args->{collection} || $self->create_delayed_object('collection', %$args);
172             }
173              
174             sub scan_stats {
175             # Should determine:
176             # - number of documents
177             # - number of categories
178             # - avg. number of categories per document (whole corpus)
179             # - avg. number of tokens per document (whole corpus)
180             # - avg. number of types per document (whole corpus)
181             # - number of documents, tokens, & types for each category
182             # - "category skew index" (% variance?) by num. documents, tokens, and types
183              
184 0     0 1 0 my ($self, %args) = @_;
185 0         0 my $collection = $self->_make_collection(\%args);
186 0         0 my $pb = $self->prog_bar($collection);
187              
188 0         0 my %stats;
189              
190              
191 0         0 while (my $doc = $collection->next) {
192 0         0 $pb->();
193 0         0 $stats{category_count_with_duplicates} += $doc->categories;
194              
195 0         0 my ($sum, $length) = ($doc->features->sum, $doc->features->length);
196 0         0 $stats{document_count}++;
197 0         0 $stats{token_count} += $sum;
198 0         0 $stats{type_count} += $length;
199            
200 0         0 foreach my $cat ($doc->categories) {
201             #warn $doc->name, ": ", $cat->name, "\n";
202 0         0 $stats{categories}{$cat->name}{document_count}++;
203 0         0 $stats{categories}{$cat->name}{token_count} += $sum;
204 0         0 $stats{categories}{$cat->name}{type_count} += $length;
205             }
206             }
207 0 0       0 print "\n" if $self->verbose;
208              
209 0         0 my @cats = keys %{ $stats{categories} };
  0         0  
210              
211 0         0 $stats{category_count} = @cats;
212 0         0 $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
213 0         0 $stats{tokens_per_document} = $stats{token_count} / $stats{document_count};
214 0         0 $stats{types_per_document} = $stats{type_count} / $stats{document_count};
215              
216 0         0 foreach my $thing ('type', 'token', 'document') {
217 0         0 $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
218 0         0 ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );
219              
220 0 0       0 next unless @cats;
221              
222             # Compute the skews
223 0         0 my $ssum;
224 0         0 foreach my $cat (@cats) {
225 0         0 $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
226             }
227 0         0 $stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
228             }
229              
230 0         0 return \%stats;
231             }
232              
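A usage sketch for scan_stats(); the 'path' argument assumes the default
AI::Categorizer::Collection::Files collection class, which reads documents from a directory.

  my $stats = $ks->scan_stats(path => 'corpus/training');
  printf "%d docs, %d cats, %.1f tokens/doc\n",
         $stats->{document_count},
         $stats->{category_count},
         $stats->{tokens_per_document};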
233             sub load {
234 0     0 1 0 my ($self, %args) = @_;
235 0         0 my $c = $self->_make_collection(\%args);
236              
237 0 0       0 if ($self->{features_kept}) {
    0          
238             # Read the whole thing in, then reduce
239 0         0 $self->read( collection => $c );
240 0         0 $self->select_features;
241              
242             } elsif ($self->{scan_first}) {
243             # Figure out the feature set first, then read data in
244 0         0 $self->scan_features( collection => $c );
245 0         0 $c->rewind;
246 0         0 $self->read( collection => $c );
247              
248             } else {
249             # Don't do any feature reduction, just read the data
250 0         0 $self->read( collection => $c );
251             }
252             }
253              
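As the branches above show, load() either reads everything and then reduces the feature set
(features_kept), scans for features before reading (scan_first, the default), or reads with no
feature reduction at all. A sketch, again assuming the Files collection:

  $ks->load(path => 'corpus/training');   # takes the scan_first path by default
  $ks->finish;                             # applies weigh_features()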
254             sub read {
255 0     0 1 0 my ($self, %args) = @_;
256 0         0 my $collection = $self->_make_collection(\%args);
257 0         0 my $pb = $self->prog_bar($collection);
258            
259 0         0 while (my $doc = $collection->next) {
260 0         0 $pb->();
261 0         0 $self->add_document($doc);
262             }
263 0 0       0 print "\n" if $self->verbose;
264             }
265              
266             sub finish {
267 11     11 1 29 my $self = shift;
268 11 100       51 return if $self->{finished}++;
269 10         43 $self->weigh_features;
270             }
271              
272             sub weigh_features {
273             # This could be made more efficient by figuring out an execution
274             # plan in advance
275              
276 10     10 1 16 my $self = shift;
277            
278 10 100       92 if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
    50          
    100          
    50          
279             # Nothing to do
280             } elsif ( $self->{term_weighting} eq 'l' ) {
281 0         0 foreach my $doc ($self->documents) {
282 0         0 my $f = $doc->features->as_hash;
283 0         0 $_ = 1 + log($_) foreach values %$f;
284             }
285             } elsif ( $self->{term_weighting} eq 'n' ) {
286 1         6 foreach my $doc ($self->documents) {
287 4         13 my $f = $doc->features->as_hash;
288 4         19 my $max_tf = AI::Categorizer::Util::max values %$f;
289 4         35 $_ = 0.5 + 0.5 * $_ / $max_tf foreach values %$f;
290             }
291             } elsif ( $self->{term_weighting} eq 'b' ) {
292 1         6 foreach my $doc ($self->documents) {
293 4         10 my $f = $doc->features->as_hash;
294 4 50       37 $_ = $_ ? 1 : 0 foreach values %$f;
295             }
296             } else {
297 0         0 die "term_weighting must be one of 'x', 't', 'l', 'b', or 'n'";
298             }
299            
300 10 100       53 if ($self->{collection_weighting} eq 'x') {
    50          
301             # Nothing to do
302             } elsif ($self->{collection_weighting} =~ /^(f|p)$/) {
303 1 50       6 my $subtrahend = ($1 eq 'f' ? 0 : 1);
304 1         5 my $num_docs = $self->documents;
305 1         5 $self->document_frequency('foo'); # Initialize
306 1         3 foreach my $doc ($self->documents) {
307 4         9 my $f = $doc->features->as_hash;
308 4         52 $f->{$_} *= log($num_docs / $self->{doc_freq_vector}{$_} - $subtrahend) foreach keys %$f;
309             }
310             } else {
311 0         0 die "collection_weighting must be one of 'x', 'f', or 'p'";
312             }
313              
314 10 50       49 if ( $self->{normalize_weighting} eq 'x' ) {
    0          
315             # Nothing to do
316             } elsif ( $self->{normalize_weighting} eq 'c' ) {
317 0         0 $_->features->normalize foreach $self->documents;
318             } else {
319 0         0 die "normalize_weighting must be one of 'x' or 'c'";
320             }
321             }
322              
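The weighting letters handled above resemble SMART-style codes: term weighting 'x'/'t' keeps raw
counts, 'l' is 1 + log(tf), 'n' is 0.5 + 0.5*tf/max_tf, and 'b' is binary; collection weighting
'f' multiplies by log(N/df) and 'p' by log(N/df - 1); normalize weighting 'c' calls normalize()
on each document's feature vector. A worked example under assumed counts:

  # tfidf_weighting => 'bfx', a term with tf = 3 occurring in 2 of 8 documents:
  #   binary term weight            -> 1
  #   collection weight 'f'         -> log(8/2)
  #   resulting feature weight      -> 1 * log(4) ~= 1.39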
323             sub document_frequency {
324 4     4 1 7 my ($self, $term) = @_;
325            
326 4 100       12 unless (exists $self->{doc_freq_vector}) {
327 1 50       5 die "No corpus has been scanned for features" unless $self->documents;
328              
329 1         4 my $doc_freq = $self->create_delayed_object('features', features => {});
330 1         4 foreach my $doc ($self->documents) {
331 4         11 $doc_freq->add( $doc->features->as_boolean_hash );
332             }
333 1         5 $self->{doc_freq_vector} = $doc_freq->as_hash;
334             }
335            
336 4 100       22 return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
337             }
338              
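document_frequency() lazily builds the document-frequency vector from boolean per-document
hashes on first use; a quick sketch (the term is an arbitrary placeholder, and documents must
already have been loaded):

  my $df = $ks->document_frequency('market');   # 0 if the term never occurs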
339             sub scan_features {
340 0     0 1 0 my ($self, %args) = @_;
341 0         0 my $c = $self->_make_collection(\%args);
342              
343 0         0 my $pb = $self->prog_bar($c);
344 0         0 my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );
345              
346 0         0 $self->delayed_object_params('document', use_features => $ranked_features);
347 0         0 $self->delayed_object_params('collection', use_features => $ranked_features);
348 0         0 return $ranked_features;
349             }
350              
351             sub select_features {
352 0     0 0 0 my $self = shift;
353            
354 0         0 my $f = $self->feature_selector->select_features(knowledge_set => $self);
355 0         0 $self->features($f);
356             }
357              
358             sub partition {
359 0     0 1 0 my ($self, @sizes) = @_;
360 0         0 my $num_docs = my @docs = $self->documents;
361 0         0 my @groups;
362              
363 0         0 while (@sizes > 1) {
364 0         0 my $size = int ($num_docs * shift @sizes);
365 0         0 push @groups, [];
366 0         0 for (0..$size) {
367 0         0 push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
  0         0  
368             }
369             }
370 0         0 push @groups, \@docs;
371              
372 0         0 return map { ref($self)->new( documents => $_ ) } @groups;
  0         0  
373             }
374              
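A sketch of a train/test split with partition(); the fractional sizes are consumed left to
right and whatever documents remain fall into the last group, so the sizes should sum to 1.

  my ($train, $test) = $ks->partition(0.8, 0.2);   # roughly 80% / 20%
  printf "train: %d docs, test: %d docs\n",
         scalar($train->documents), scalar($test->documents);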
375             sub make_document {
376 40     40 1 134 my ($self, %args) = @_;
377 40         73 my $cats = delete $args{categories};
378 40         74 my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
  40         176  
379 40         168 my $d = $self->create_delayed_object('document', %args, categories => \@cats);
380 40         126 $self->add_document($d);
381             }
382              
383             sub add_document {
384 40     40 1 55 my ($self, $doc) = @_;
385              
386 40         104 foreach ($doc->categories) {
387 40         115 $_->add_document($doc);
388             }
389 40         133 $self->{documents}->insert($doc);
390 40         117 $self->{categories}->insert($doc->categories);
391             }
392              
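A sketch of adding a document by hand: make_document() resolves category names to Category
objects and passes the remaining arguments to the contained Document class, assumed here to
accept 'name' and 'content' parameters.

  $ks->make_document(
    name       => 'article-42',
    content    => 'Stocks rallied on Tuesday ...',
    categories => ['finance'],
  );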
393             sub save_features {
394 0     0 1   my ($self, $file) = @_;
395            
396 0 0 0       my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
397             or croak "No features to save";
398            
399 0 0         open my($fh), "> $file" or croak "Can't create $file: $!";
400 0           my $h = $f->as_hash;
401 0           print $fh "# Total: ", $f->length, "\n";
402            
403 0           foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
  0            
404 0           print $fh "$k\t$h->{$k}\n";
405             }
406 0           close $fh;
407             }
408              
409             sub restore_features {
410 0     0 1   my ($self, $file, $n) = @_;
411            
412 0 0         open my($fh), "< $file" or croak "Can't open $file: $!";
413              
414 0           my %hash;
415 0           while (<$fh>) {
416 0 0         next if /^#/;
417 0 0         /^(.*)\t([\d.]+)$/ or croak "Malformed line: $_";
418 0           $hash{$1} = $2;
419 0 0 0       last if defined $n and $. >= $n;
420             }
421 0           my $features = $self->create_delayed_object('features', features => \%hash);
422            
423 0           $self->delayed_object_params('document', use_features => $features);
424 0           $self->delayed_object_params('collection', use_features => $features);
425             }
426              
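save_features() writes one tab-separated "term<TAB>weight" line per feature, sorted by
descending weight, so restore_features() can reload only the strongest features later. A
sketch, where $new_ks is a hypothetical fresh knowledge set:

  $ks->save_features('features.txt');
  $new_ks->restore_features('features.txt', 500);   # keep roughly the top 500 terms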
427             1;
428              
429             __END__