line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package AI::Categorizer::Collection; |
2
|
11
|
|
|
11
|
|
47
|
use strict; |
|
11
|
|
|
|
|
15
|
|
|
11
|
|
|
|
|
333
|
|
3
|
|
|
|
|
|
|
|
4
|
11
|
|
|
11
|
|
49
|
use Params::Validate qw(:types); |
|
11
|
|
|
|
|
14
|
|
|
11
|
|
|
|
|
1381
|
|
5
|
11
|
|
|
11
|
|
53
|
use Class::Container; |
|
11
|
|
|
|
|
17
|
|
|
11
|
|
|
|
|
214
|
|
6
|
11
|
|
|
11
|
|
48
|
use base qw(Class::Container); |
|
11
|
|
|
|
|
14
|
|
|
11
|
|
|
|
|
5487
|
|
7
|
|
|
|
|
|
|
__PACKAGE__->valid_params |
8
|
|
|
|
|
|
|
( |
9
|
|
|
|
|
|
|
verbose => {type => SCALAR, default => 0}, |
10
|
|
|
|
|
|
|
stopword_file => { type => SCALAR, optional => 1 }, |
11
|
|
|
|
|
|
|
category_hash => { type => HASHREF, default => {} }, |
12
|
|
|
|
|
|
|
category_file => { type => SCALAR, optional => 1 }, |
13
|
|
|
|
|
|
|
); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
__PACKAGE__->contained_objects |
16
|
|
|
|
|
|
|
( |
17
|
|
|
|
|
|
|
document => { class => 'AI::Categorizer::Document::Text', |
18
|
|
|
|
|
|
|
delayed => 1 }, |
19
|
|
|
|
|
|
|
); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub new { |
22
|
7
|
|
|
7
|
1
|
36
|
my ($class, %args) = @_; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Optimize so every document doesn't have to convert the stopword list to a hash |
25
|
7
|
50
|
33
|
|
|
46
|
if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) { |
26
|
0
|
|
|
|
|
0
|
$args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
7
|
|
|
|
|
72
|
my $self = $class->SUPER::new(%args); |
30
|
|
|
|
|
|
|
|
31
|
7
|
50
|
|
|
|
2757
|
if ($self->{category_file}) { |
32
|
0
|
|
|
|
|
0
|
local *FH; |
33
|
0
|
0
|
|
|
|
0
|
open FH, $self->{category_file} or die "Can't open $self->{category_file}: $!"; |
34
|
0
|
|
|
|
|
0
|
while () { |
35
|
0
|
|
|
|
|
0
|
my ($doc, @cats) = split; |
36
|
0
|
|
|
|
|
0
|
$self->{category_hash}{$doc} = \@cats; |
37
|
|
|
|
|
|
|
} |
38
|
0
|
|
|
|
|
0
|
close FH; |
39
|
|
|
|
|
|
|
} |
40
|
7
|
50
|
|
|
|
44
|
if (exists $self->{stopword_file}) { |
41
|
0
|
|
|
|
|
0
|
my %stopwords; |
42
|
0
|
|
|
|
|
0
|
local *FH; |
43
|
0
|
0
|
|
|
|
0
|
open FH, "< $self->{stopword_file}" or die "$self->{stopword_file}: $!"; |
44
|
0
|
|
|
|
|
0
|
while () { |
45
|
0
|
|
|
|
|
0
|
chomp; |
46
|
0
|
|
|
|
|
0
|
$stopwords{$_} = 1; |
47
|
|
|
|
|
|
|
} |
48
|
0
|
|
|
|
|
0
|
close FH; |
49
|
|
|
|
|
|
|
|
50
|
0
|
|
|
|
|
0
|
$self->delayed_object_params('document', stopwords => \%stopwords); |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
7
|
|
|
|
|
31
|
return $self; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# This should usually be replaced in subclasses with a faster version that doesn't |
57
|
|
|
|
|
|
|
# need to create actual documents each time through |
58
|
|
|
|
|
|
|
sub count_documents { |
59
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
60
|
0
|
0
|
|
|
|
|
return $self->{document_count} if exists $self->{document_count}; |
61
|
|
|
|
|
|
|
|
62
|
0
|
|
|
|
|
|
$self->rewind; |
63
|
0
|
|
|
|
|
|
my $count = 0; |
64
|
0
|
|
|
|
|
|
$count++ while $self->next; |
65
|
0
|
|
|
|
|
|
$self->rewind; |
66
|
|
|
|
|
|
|
|
67
|
0
|
|
|
|
|
|
return $self->{document_count} = $count; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Abstract methods |
71
|
|
|
|
|
|
|
sub next; |
72
|
|
|
|
|
|
|
sub rewind; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
1; |
75
|
|
|
|
|
|
|
__END__ |