File Coverage

blib/lib/AI/Categorizer/KnowledgeSet.pm
Criterion    Covered   Total      %
statement         98     220   44.5
branch            30      76   39.4
condition          0       9    0.0
subroutine        21      38   55.2
pod               18      23   78.2
total            167     366   45.6


line stmt bran cond sub pod time code
1             package AI::Categorizer::KnowledgeSet;
2              
3 11     11   77 use strict;
  11         21  
  11         388  
4 11     11   52 use Class::Container;
  11         17  
  11         183  
5 11     11   51 use AI::Categorizer::Storable;
  11         18  
  11         238  
6 11     11   45 use base qw(Class::Container AI::Categorizer::Storable);
  11         19  
  11         972  
7              
8 11     11   50 use Params::Validate qw(:types);
  11         19  
  11         1547  
9 11     11   52 use AI::Categorizer::ObjectSet;
  11         17  
  11         215  
10 11     11   46 use AI::Categorizer::Document;
  11         21  
  11         249  
11 11     11   48 use AI::Categorizer::Category;
  11         22  
  11         285  
12 11     11   50 use AI::Categorizer::FeatureVector;
  11         23  
  11         199  
13 11     11   5363 use AI::Categorizer::Util;
  11         26  
  11         530  
14 11     11   59 use Carp qw(croak);
  11         23  
  11         32497  
15              
16             __PACKAGE__->valid_params
17             (
18             categories => {
19             type => ARRAYREF,
20             default => [],
21             callbacks => { 'all are Category objects' =>
22             sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'),
23             @{$_[0]} },
24             },
25             },
26             documents => {
27             type => ARRAYREF,
28             default => [],
29             callbacks => { 'all are Document objects' =>
30             sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'),
31             @{$_[0]} },
32             },
33             },
34             scan_first => {
35             type => BOOLEAN,
36             default => 1,
37             },
38             feature_selector => {
39             isa => 'AI::Categorizer::FeatureSelector',
40             },
41             tfidf_weighting => {
42             type => SCALAR,
43             optional => 1,
44             },
45             term_weighting => {
46             type => SCALAR,
47             default => 'x',
48             },
49             collection_weighting => {
50             type => SCALAR,
51             default => 'x',
52             },
53             normalize_weighting => {
54             type => SCALAR,
55             default => 'x',
56             },
57             verbose => {
58             type => SCALAR,
59             default => 0,
60             },
61             );
62              
63             __PACKAGE__->contained_objects
64             (
65             document => { delayed => 1,
66             class => 'AI::Categorizer::Document' },
67             category => { delayed => 1,
68             class => 'AI::Categorizer::Category' },
69             collection => { delayed => 1,
70             class => 'AI::Categorizer::Collection::Files' },
71             features => { delayed => 1,
72             class => 'AI::Categorizer::FeatureVector' },
73             feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
74             );
75              
76             sub new {
77 12     12 1 6934 my ($pkg, %args) = @_;
78            
79             # Shortcuts
80 12 100       52 if ($args{tfidf_weighting}) {
81 1         5 @args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
82 1         4 delete $args{tfidf_weighting};
83             }
84              
85 12         133 my $self = $pkg->SUPER::new(%args);
86              
87             # Convert to AI::Categorizer::ObjectSet sets
88 12         8460 $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
  12         94  
89 12         24 $self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
  12         47  
90              
91 12 50       58 if ($self->{load}) {
92 0 0       0 my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
93 0         0 $self->load(%$args);
94 0         0 delete $self->{load};
95             }
96 12         56 return $self;
97             }
98              
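A minimal construction sketch based on the parameters declared above; the three-letter
tfidf_weighting shortcut is split character by character into the term, collection, and
normalize settings, so 'nfc' below is equivalent to passing the three individual options.

  use AI::Categorizer::KnowledgeSet;

  # 'nfc' == term_weighting => 'n', collection_weighting => 'f',
  #          normalize_weighting => 'c'
  my $ks = AI::Categorizer::KnowledgeSet->new(
    tfidf_weighting => 'nfc',
    verbose         => 1,
  );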
99             sub features {
100 19     19 1 37 my $self = shift;
101              
102 19 100       41 if (@_) {
103 1         2 $self->{features} = shift;
104 1 50       5 $self->trim_doc_features if $self->{features};
105             }
106 19 100       88 return $self->{features} if $self->{features};
107              
108             # Create a feature vector encompassing the whole set of documents
109 3         12 my $v = $self->create_delayed_object('features');
110 3         10 foreach my $document ($self->documents) {
111 12         35 $v->add( $document->features );
112             }
113 3         15 return $self->{features} = $v;
114             }
115              
116             sub categories {
117 24     24 1 43 my $c = $_[0]->{categories};
118 24 50       97 return wantarray ? $c->members : $c->size;
119             }
120              
121             sub documents {
122 35     35 1 67 my $d = $_[0]->{documents};
123 35 100       127 return wantarray ? $d->members : $d->size;
124             }
125              
126             sub document {
127 7     7 1 14 my ($self, $name) = @_;
128 7         24 return $self->{documents}->retrieve($name);
129             }
130              
131 0     0 0 0 sub feature_selector { $_[0]->{feature_selector} }
132 0     0 0 0 sub scan_first { $_[0]->{scan_first} }
133              
134             sub verbose {
135 0     0 1 0 my $self = shift;
136 0 0       0 $self->{verbose} = shift if @_;
137 0         0 return $self->{verbose};
138             }
139              
140             sub trim_doc_features {
141 0     0 0 0 my ($self) = @_;
142            
143 0         0 foreach my $doc ($self->documents) {
144 0         0 $doc->features( $doc->features->intersection($self->features) );
145             }
146             }
147              
148              
149             sub prog_bar {
150 0     0 0 0 my ($self, $collection) = @_;
151              
152 0 0   0   0 return sub {} unless $self->verbose;
  0         0  
153 0 0   0   0 return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
  0         0  
154              
155 0 0       0 my $count = $collection->can('count_documents') ? $collection->count_documents : 0;
156            
157 0         0 my $pb = 'Time::Progress'->new;
158 0         0 $pb->attr(max => $count);
159 0         0 my $i = 0;
160             return sub {
161 0     0   0 $i++;
162 0 0       0 return if $i % 25;
163 0         0 print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
164 0         0 };
165             }
166              
167             # A little utility method for several other methods like scan_stats(),
168             # load(), read(), etc.
169             sub _make_collection {
170 0     0   0 my ($self, $args) = @_;
171 0   0     0 return $args->{collection} || $self->create_delayed_object('collection', %$args);
172             }
173              
174             sub scan_stats {
175             # Should determine:
176             # - number of documents
177             # - number of categories
178             # - avg. number of categories per document (whole corpus)
179             # - avg. number of tokens per document (whole corpus)
180             # - avg. number of types per document (whole corpus)
181             # - number of documents, tokens, & types for each category
182             # - "category skew index" (% variance?) by num. documents, tokens, and types
183              
184 0     0 1 0 my ($self, %args) = @_;
185 0         0 my $collection = $self->_make_collection(\%args);
186 0         0 my $pb = $self->prog_bar($collection);
187              
188 0         0 my %stats;
189              
190              
191 0         0 while (my $doc = $collection->next) {
192 0         0 $pb->();
193 0         0 $stats{category_count_with_duplicates} += $doc->categories;
194              
195 0         0 my ($sum, $length) = ($doc->features->sum, $doc->features->length);
196 0         0 $stats{document_count}++;
197 0         0 $stats{token_count} += $sum;
198 0         0 $stats{type_count} += $length;
199            
200 0         0 foreach my $cat ($doc->categories) {
201             #warn $doc->name, ": ", $cat->name, "\n";
202 0         0 $stats{categories}{$cat->name}{document_count}++;
203 0         0 $stats{categories}{$cat->name}{token_count} += $sum;
204 0         0 $stats{categories}{$cat->name}{type_count} += $length;
205             }
206             }
207 0 0       0 print "\n" if $self->verbose;
208              
209 0         0 my @cats = keys %{ $stats{categories} };
  0         0  
210              
211 0         0 $stats{category_count} = @cats;
212 0         0 $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
213 0         0 $stats{tokens_per_document} = $stats{token_count} / $stats{document_count};
214 0         0 $stats{types_per_document} = $stats{type_count} / $stats{document_count};
215              
216 0         0 foreach my $thing ('type', 'token', 'document') {
217 0         0 $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
218 0         0 ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );
219              
220 0 0       0 next unless @cats;
221              
222             # Compute the skews
223 0         0 my $ssum;
224 0         0 foreach my $cat (@cats) {
225 0         0 $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
226             }
227 0         0 $stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
228             }
229              
230 0         0 return \%stats;
231             }
232              
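A usage sketch for scan_stats(); the 'path' argument assumes the default
AI::Categorizer::Collection::Files collection class, which reads documents from a directory.

  my $stats = $ks->scan_stats(path => 'corpus/training');
  printf "%d docs, %d cats, %.1f tokens/doc\n",
         $stats->{document_count},
         $stats->{category_count},
         $stats->{tokens_per_document};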
233             sub load {
234 0     0 1 0 my ($self, %args) = @_;
235 0         0 my $c = $self->_make_collection(\%args);
236              
237 0 0       0 if ($self->{features_kept}) {
    0          
238             # Read the whole thing in, then reduce
239 0         0 $self->read( collection => $c );
240 0         0 $self->select_features;
241              
242             } elsif ($self->{scan_first}) {
243             # Figure out the feature set first, then read data in
244 0         0 $self->scan_features( collection => $c );
245 0         0 $c->rewind;
246 0         0 $self->read( collection => $c );
247              
248             } else {
249             # Don't do any feature reduction, just read the data
250 0         0 $self->read( collection => $c );
251             }
252             }
253              
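As the branches above show, load() either reads everything and then reduces the feature set
(features_kept), scans for features before reading (scan_first, the default), or reads with no
feature reduction at all. A sketch, again assuming the Files collection:

  $ks->load(path => 'corpus/training');   # takes the scan_first path by default
  $ks->finish;                             # applies weigh_features()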
254             sub read {
255 0     0 1 0 my ($self, %args) = @_;
256 0         0 my $collection = $self->_make_collection(\%args);
257 0         0 my $pb = $self->prog_bar($collection);
258            
259 0         0 while (my $doc = $collection->next) {
260 0         0 $pb->();
261 0         0 $self->add_document($doc);
262             }
263 0 0       0 print "\n" if $self->verbose;
264             }
265              
266             sub finish {
267 11     11 1 29 my $self = shift;
268 11 100       51 return if $self->{finished}++;
269 10         43 $self->weigh_features;
270             }
271              
272             sub weigh_features {
273             # This could be made more efficient by figuring out an execution
274             # plan in advance
275              
276 10     10 1 16 my $self = shift;
277            
278 10 100       92 if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
    50          
    100          
    50          
279             # Nothing to do
280             } elsif ( $self->{term_weighting} eq 'l' ) {
281 0         0 foreach my $doc ($self->documents) {
282 0         0 my $f = $doc->features->as_hash;
283 0         0 $_ = 1 + log($_) foreach values %$f;
284             }
285             } elsif ( $self->{term_weighting} eq 'n' ) {
286 1         6 foreach my $doc ($self->documents) {
287 4         13 my $f = $doc->features->as_hash;
288 4         19 my $max_tf = AI::Categorizer::Util::max values %$f;
289 4         35 $_ = 0.5 + 0.5 * $_ / $max_tf foreach values %$f;
290             }
291             } elsif ( $self->{term_weighting} eq 'b' ) {
292 1         6 foreach my $doc ($self->documents) {
293 4         10 my $f = $doc->features->as_hash;
294 4 50       37 $_ = $_ ? 1 : 0 foreach values %$f;
295             }
296             } else {
297 0         0 die "term_weighting must be one of 'x', 't', 'l', 'b', or 'n'";
298             }
299            
300 10 100       53 if ($self->{collection_weighting} eq 'x') {
    50          
301             # Nothing to do
302             } elsif ($self->{collection_weighting} =~ /^(f|p)$/) {
303 1 50       6 my $subtrahend = ($1 eq 'f' ? 0 : 1);
304 1         5 my $num_docs = $self->documents;
305 1         5 $self->document_frequency('foo'); # Initialize
306 1         3 foreach my $doc ($self->documents) {
307 4         9 my $f = $doc->features->as_hash;
308 4         52 $f->{$_} *= log($num_docs / $self->{doc_freq_vector}{$_} - $subtrahend) foreach keys %$f;
309             }
310             } else {
311 0         0 die "collection_weighting must be one of 'x', 'f', or 'p'";
312             }
313              
314 10 50       49 if ( $self->{normalize_weighting} eq 'x' ) {
    0          
315             # Nothing to do
316             } elsif ( $self->{normalize_weighting} eq 'c' ) {
317 0         0 $_->features->normalize foreach $self->documents;
318             } else {
319 0         0 die "normalize_weighting must be one of 'x' or 'c'";
320             }
321             }
322              
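The weighting letters handled above resemble SMART-style codes: term weighting 'x'/'t' keeps raw
counts, 'l' is 1 + log(tf), 'n' is 0.5 + 0.5*tf/max_tf, and 'b' is binary; collection weighting
'f' multiplies by log(N/df) and 'p' by log(N/df - 1); normalize weighting 'c' calls normalize()
on each document's feature vector. A worked example under assumed counts:

  # tfidf_weighting => 'bfx', a term with tf = 3 occurring in 2 of 8 documents:
  #   binary term weight            -> 1
  #   collection weight 'f'         -> log(8/2)
  #   resulting feature weight      -> 1 * log(4) ~= 1.39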
323             sub document_frequency {
324 4     4 1 7 my ($self, $term) = @_;
325            
326 4 100       12 unless (exists $self->{doc_freq_vector}) {
327 1 50       5 die "No corpus has been scanned for features" unless $self->documents;
328              
329 1         4 my $doc_freq = $self->create_delayed_object('features', features => {});
330 1         4 foreach my $doc ($self->documents) {
331 4         11 $doc_freq->add( $doc->features->as_boolean_hash );
332             }
333 1         5 $self->{doc_freq_vector} = $doc_freq->as_hash;
334             }
335            
336 4 100       22 return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
337             }
338              
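document_frequency() lazily builds the document-frequency vector from boolean per-document
hashes on first use; a quick sketch (the term is an arbitrary placeholder, and documents must
already have been loaded):

  my $df = $ks->document_frequency('market');   # 0 if the term never occurs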
339             sub scan_features {
340 0     0 1 0 my ($self, %args) = @_;
341 0         0 my $c = $self->_make_collection(\%args);
342              
343 0         0 my $pb = $self->prog_bar($c);
344 0         0 my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );
345              
346 0         0 $self->delayed_object_params('document', use_features => $ranked_features);
347 0         0 $self->delayed_object_params('collection', use_features => $ranked_features);
348 0         0 return $ranked_features;
349             }
350              
351             sub select_features {
352 0     0 0 0 my $self = shift;
353            
354 0         0 my $f = $self->feature_selector->select_features(knowledge_set => $self);
355 0         0 $self->features($f);
356             }
357              
358             sub partition {
359 0     0 1 0 my ($self, @sizes) = @_;
360 0         0 my $num_docs = my @docs = $self->documents;
361 0         0 my @groups;
362              
363 0         0 while (@sizes > 1) {
364 0         0 my $size = int ($num_docs * shift @sizes);
365 0         0 push @groups, [];
366 0         0 for (0..$size) {
367 0         0 push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
  0         0  
368             }
369             }
370 0         0 push @groups, \@docs;
371              
372 0         0 return map { ref($self)->new( documents => $_ ) } @groups;
  0         0  
373             }
374              
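A sketch of a train/test split with partition(); the fractional sizes are consumed left to
right and whatever documents remain fall into the last group, so the sizes should sum to 1.

  my ($train, $test) = $ks->partition(0.8, 0.2);   # roughly 80% / 20%
  printf "train: %d docs, test: %d docs\n",
         scalar($train->documents), scalar($test->documents);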
375             sub make_document {
376 40     40 1 134 my ($self, %args) = @_;
377 40         73 my $cats = delete $args{categories};
378 40         74 my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
  40         176  
379 40         168 my $d = $self->create_delayed_object('document', %args, categories => \@cats);
380 40         126 $self->add_document($d);
381             }
382              
383             sub add_document {
384 40     40 1 55 my ($self, $doc) = @_;
385              
386 40         104 foreach ($doc->categories) {
387 40         115 $_->add_document($doc);
388             }
389 40         133 $self->{documents}->insert($doc);
390 40         117 $self->{categories}->insert($doc->categories);
391             }
392              
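A sketch of adding a document by hand: make_document() resolves category names to Category
objects and passes the remaining arguments to the contained Document class, assumed here to
accept 'name' and 'content' parameters.

  $ks->make_document(
    name       => 'article-42',
    content    => 'Stocks rallied on Tuesday ...',
    categories => ['finance'],
  );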
393             sub save_features {
394 0     0 1   my ($self, $file) = @_;
395            
396 0 0 0       my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
397             or croak "No features to save";
398            
399 0 0         open my($fh), "> $file" or croak "Can't create $file: $!";
400 0           my $h = $f->as_hash;
401 0           print $fh "# Total: ", $f->length, "\n";
402            
403 0           foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
  0            
404 0           print $fh "$k\t$h->{$k}\n";
405             }
406 0           close $fh;
407             }
408              
409             sub restore_features {
410 0     0 1   my ($self, $file, $n) = @_;
411            
412 0 0         open my($fh), "< $file" or croak "Can't open $file: $!";
413              
414 0           my %hash;
415 0           while (<$fh>) {
416 0 0         next if /^#/;
417 0 0         /^(.*)\t([\d.]+)$/ or croak "Malformed line: $_";
418 0           $hash{$1} = $2;
419 0 0 0       last if defined $n and $. >= $n;
420             }
421 0           my $features = $self->create_delayed_object('features', features => \%hash);
422            
423 0           $self->delayed_object_params('document', use_features => $features);
424 0           $self->delayed_object_params('collection', use_features => $features);
425             }
426              
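save_features() writes one tab-separated "term<TAB>weight" line per feature, sorted by
descending weight, so restore_features() can reload only the strongest features later. A
sketch, where $new_ks is a hypothetical fresh knowledge set:

  $ks->save_features('features.txt');
  $new_ks->restore_features('features.txt', 500);   # keep roughly the top 500 terms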
427             1;
428              
429             __END__