File Coverage

blib/lib/AI/Categorizer/Document.pm

Criterion	Covered	Total	%
statement	109	132	82.5
branch	40	64	62.5
condition	4	9	44.4
subroutine	22	23	95.6
pod	7	14	50.0
total	182	242	75.2

line	stmt	bran	cond	sub	pod	time	code
1							package AI::Categorizer::Document;
2
3	11			11		48	use strict;
	11					33
	11					301
4	11			11		48	use Class::Container;
	11					18
	11					243
5	11			11		47	use base qw(Class::Container);
	11					15
	11					677
6
7	11			11		48	use Params::Validate qw(:types);
	11					18
	11					1493
8	11			11		56	use AI::Categorizer::ObjectSet;
	11					19
	11					185
9	11			11		12959	use AI::Categorizer::FeatureVector;
	11					24
	11					20555
10
11							__PACKAGE__->valid_params
12							(
13							name => {
14							type => SCALAR,
15							},
16							categories => {
17							type => ARRAYREF,
18							default => [],
19							callbacks => { 'all are Category objects' =>
20							sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
21							},
22							public => 0,
23							},
24							stopwords => {
25							type => ARRAYREF\|HASHREF,
26							default => {},
27							},
28							content => {
29							type => HASHREF\|SCALAR,
30							default => undef,
31							},
32							parse => {
33							type => SCALAR,
34							optional => 1,
35							},
36							parse_handle => {
37							type => HANDLE,
38							optional => 1,
39							},
40							features => {
41							isa => 'AI::Categorizer::FeatureVector',
42							optional => 1,
43							},
44							content_weights => {
45							type => HASHREF,
46							default => {},
47							},
48							front_bias => {
49							type => SCALAR,
50							default => 0,
51							},
52							use_features => {
53							type => HASHREF\|UNDEF,
54							default => undef,
55							},
56							stemming => {
57							type => SCALAR\|UNDEF,
58							optional => 1,
59							},
60							stopword_behavior => {
61							type => SCALAR,
62							default => "stem",
63							},
64							);
65
66							__PACKAGE__->contained_objects
67							(
68							features => { delayed => 1,
69							class => 'AI::Categorizer::FeatureVector' },
70							);
71
72							### Constructors
73
74							my $NAME = 'a';
75
76							sub new {
77	94			94	1	4735	my $pkg = shift;
78	94					489	my $self = $pkg->SUPER::new(name => $NAME++, # Use a default name
79							@_);
80
81							# Get efficient internal data structures
82	94					7583	$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
	94					463
83
84	94					257	$self->_fix_stopwords;
85
86							# A few different ways for the caller to initialize the content
87	94	100				441	if (exists $self->{parse}) {
		50
		100
88	1					6	$self->parse(content => delete $self->{parse});
89
90							} elsif (exists $self->{parse_handle}) {
91	0					0	$self->parse_handle(handle => delete $self->{parse_handle});
92
93							} elsif (defined $self->{content}) {
94							# Allow a simple string as the content
95	88	50				372	$self->{content} = { body => $self->{content} } unless ref $self->{content};
96							}
97
98	94	100				406	$self->finish if $self->{content};
99	94					420	return $self;
100							}
101
102							sub _fix_stopwords {
103	94			94		125	my $self = shift;
104
105							# Convert to hash
106	94	100				331	$self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
	123					282
	27					54
107							if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
108
109	94					167	my $s = $self->{stopwords};
110
111							# May need to perform stemming on the stopwords
112	94	100				264	return unless keys %$s; # No point in doing anything if there are no stopwords
113	27	100				99	return unless $self->{stopword_behavior} eq 'stem';
114	25	100	66			115	return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
115	1	50				5	return if $s->{___stemmed};
116
117	1					3	my @keys = keys %$s;
118	1					3	%$s = ();
119	1					4	$self->stem_words(\@keys);
120	1					4282	$s->{$_} = 1 foreach @keys;
121
122							# This flag is attached to the stopword structure itself so that
123							# other documents will notice it.
124	1					6	$s->{___stemmed} = 1;
125							}
126
127							sub finish {
128	92			92	0	127	my $self = shift;
129	92					197	$self->create_feature_vector;
130
131							# Now we're done with all the content stuff
132	92					126	delete @{$self}{'content', 'content_weights', 'stopwords', 'use_features'};
	92					384
133							}
134
135
136							# Parse a document format - a virtual method
137							sub parse;
138
139							sub parse_handle {
140	3			3	0	9	my ($self, %args) = @_;
141	3	50				8	my $fh = $args{handle} or die "No 'handle' argument given to parse_handle()";
142	3					90	return $self->parse( content => join '', <$fh> );
143							}
144
145							### Accessors
146
147	157			157	1	857	sub name { $_[0]->{name} }
148	3			3	1	39	sub stopword_behavior { $_[0]->{stopword_behavior} }
149
150							sub features {
151	198			198	1	1168	my $self = shift;
152	198	50				477	if (@_) {
153	0					0	$self->{features} = shift;
154							}
155	198					698	return $self->{features};
156							}
157
158							sub categories {
159	140			140	1	234	my $c = $_[0]->{categories};
160	140	50				454	return wantarray ? $c->members : $c->size;
161							}
162
163
164							### Workers
165
166							sub create_feature_vector {
167	92			92	1	104	my $self = shift;
168	92					134	my $content = $self->{content};
169	92					126	my $weights = $self->{content_weights};
170
171	92	50				379	die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
172							unless $self->{stopword_behavior} =~ /^stem\|no_stem\|pre_stemmed$/;
173
174	92					334	$self->{features} = $self->create_delayed_object('features');
175	92					370	while (my ($name, $data) = each %$content) {
176	92					227	my $t = $self->tokenize($data);
177	92	100				280	$t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
178	92					235	$self->stem_words($t);
179	92	100				1272	$t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^stem\|pre_stemmed$/;
180	92	50				453	my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
181	92					353	$self->{features}->add($h);
182							}
183							}
184
185							sub is_in_category {
186	16	50		16	0	100	return (ref $_[1]
187							? $_[0]->{categories}->includes( $_[1] )
188							: $_[0]->{categories}->includes_name( $_[1] ));
189
190							}
191
192							sub tokenize {
193	92			92	0	114	my $self = shift;
194	92					111	my @tokens;
195	92					444	while ($_[0] =~ /([-\w]+)/g) {
196	670					1291	my $word = lc $1;
197	670	50				1553	next unless $word =~ /[a-z]/;
198	670					990	$word =~ s/^[^a-z]+//; # Trim leading non-alpha characters (helps with ordinals)
199	670					2268	push @tokens, $word;
200							}
201	92					207	return \@tokens;
202							}
203
204							sub stem_words {
205	93			93	0	137	my ($self, $tokens) = @_;
206	93	100				259	return unless $self->{stemming};
207	4	50				12	return if $self->{stemming} eq 'none';
208	4	50				10	die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
209							unless $self->{stemming} eq 'porter';
210
211	4	50				6	eval {require Lingua::Stem; 1}
	4					1034
	4					7084
212							or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";
213
214	4					7	@$tokens = @{ Lingua::Stem::stem(@$tokens) };
	4					17
215							}
216
217							sub _filter_tokens {
218	92			92		140	my ($self, $tokens_in) = @_;
219
220	92	50	66			392	if ($self->{use_features}) {
	92	100				363
221	0					0	my $f = $self->{use_features}->as_hash;
222	0					0	return [ grep exists($f->{$_}), @$tokens_in ];
223							} elsif ($self->{stopwords} and keys %{$self->{stopwords}}) {
224	27					40	my $s = $self->{stopwords};
225	27					170	return [ grep !exists($s->{$_}), @$tokens_in ];
226							}
227	65					128	return $tokens_in;
228							}
229
230							sub _weigh_tokens {
231	92			92		201	my ($self, $tokens, $weight) = @_;
232
233	92					114	my %counts;
234	92	50				235	if (my $b = 0+$self->{front_bias}) {
235	0	0	0			0	die "'front_bias' value must be between -1 and 1"
236							unless -1 < $b and $b < 1;
237
238	0					0	my $n = @$tokens;
239	0					0	my $r = ($b-1)**2 / ($b+1);
240	0					0	my $mult = $weight * log($r)/($r-1);
241
242	0					0	my $i = 0;
243	0					0	foreach my $feature (@$tokens) {
244	0					0	$counts{$feature} += $mult * $r**($i/$n);
245	0					0	$i++;
246							}
247
248							} else {
249	92					169	foreach my $feature (@$tokens) {
250	632					1209	$counts{$feature} += $weight;
251							}
252							}
253
254	92					302	return \%counts;
255							}
256
257							sub vectorize {
258	92			92	0	259	my ($self, %args) = @_;
259	92	50				222	if ($self->{stem_stopwords}) {
260	0					0	my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
	0					0
261	0					0	$self->{stopwords} = { map {+$_, 1} @$s };
	0					0
262	0					0	$args{tokens} = $self->_filter_tokens($args{tokens});
263							}
264	92					249	return $self->_weigh_tokens($args{tokens}, $args{weight});
265							}
266
267							sub read {
268	3			3	1	93	my ($class, %args) = @_;
269	3	50				20	my $path = delete $args{path} or die "Must specify 'path' argument to read()";
270
271	3					12	my $self = $class->new(%args);
272
273	3	50				124	open my($fh), "< $path" or die "$path: $!";
274	3					13	$self->parse_handle(handle => $fh);
275	3					35	close $fh;
276
277	3					11	$self->finish;
278	3					19	return $self;
279							}
280
281							sub dump_features {
282	0			0	0		my ($self, %args) = @_;
283	0	0					my $path = $args{path} or die "No 'path' argument given to dump_features()";
284	0	0					open my($fh), "> $path" or die "Can't create $path: $!";
285	0						my $f = $self->features->as_hash;
286	0						while (my ($k, $v) = each %$f) {
287	0						print $fh "$k\t$v\n";
288							}
289							}
290
291							1;
292
293							__END__