| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: 2014 Koichi SATOH |
|
2
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: 2026 Wesley Schwengle |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# SPDX-License-Identifier: MIT |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Lingua::TermWeight; |
|
7
|
2
|
|
|
2
|
|
228799
|
use utf8; |
|
|
2
|
|
|
|
|
257
|
|
|
|
2
|
|
|
|
|
13
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# ABSTRACT: Language-independent TermWeight calculator. |
|
10
|
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
105
|
use v5.26; |
|
|
2
|
|
|
|
|
7
|
|
|
12
|
2
|
|
|
2
|
|
13
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
130
|
|
|
13
|
2
|
|
|
2
|
|
710
|
use Object::Pad 0.800 ':experimental(init_expr)'; |
|
|
2
|
|
|
|
|
8444
|
|
|
|
2
|
|
|
|
|
142
|
|
|
14
|
2
|
|
|
2
|
|
351
|
use Carp qw(croak); |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
163
|
|
|
15
|
2
|
|
|
2
|
|
16
|
use List::Util qw(sum); |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
682
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
class Lingua::TermWeight { |
|
20
|
0
|
|
|
0
|
0
|
0
|
field $word_counter :reader = do { |
|
21
|
|
|
|
|
|
|
require Lingua::TermWeight::WordCounter::Simple; |
|
22
|
|
|
|
|
|
|
Lingua::TermWeight::WordCounter::Simple->new; |
|
23
|
|
|
|
|
|
|
}; |
|
24
|
|
|
|
|
|
|
|
|
25
|
0
|
|
|
0
|
0
|
0
|
field $word_segmenter : reader : param; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
ADJUST { |
|
28
|
|
|
|
|
|
|
croak "word_segmenter must provide a segment method" |
|
29
|
|
|
|
|
|
|
unless $word_segmenter->can('segment'); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
croak "word_counter must provide clear, add_count, and frequencies methods" |
|
32
|
|
|
|
|
|
|
unless $word_counter->can('clear') |
|
33
|
|
|
|
|
|
|
&& $word_counter->can('add_count') |
|
34
|
|
|
|
|
|
|
&& $word_counter->can('frequencies'); |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
8
|
|
|
8
|
1
|
2436
|
method tf (%args) { |
|
|
8
|
|
|
|
|
26
|
|
|
|
8
|
|
|
|
|
16
|
|
|
|
8
|
|
|
|
|
11
|
|
|
38
|
|
|
|
|
|
|
croak "tf requires a document argument" |
|
39
|
8
|
50
|
|
|
|
21
|
unless exists $args{document}; |
|
40
|
|
|
|
|
|
|
|
|
41
|
8
|
|
|
|
|
16
|
my $document = $args{document}; |
|
42
|
8
|
|
100
|
|
|
27
|
my $normalize = $args{normalize} // 0; |
|
43
|
|
|
|
|
|
|
|
|
44
|
8
|
|
|
|
|
32
|
$word_counter->clear; |
|
45
|
|
|
|
|
|
|
|
|
46
|
8
|
|
|
|
|
22
|
my $iter = $word_segmenter->segment($document); |
|
47
|
8
|
50
|
|
|
|
18
|
croak "word_segmenter->segment must return a coderef iterator" |
|
48
|
|
|
|
|
|
|
unless ref($iter) eq 'CODE'; |
|
49
|
|
|
|
|
|
|
|
|
50
|
8
|
|
|
|
|
12
|
while (defined(my $word = $iter->())) { |
|
51
|
419
|
|
|
|
|
585
|
$word_counter->add_count($word); |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
8
|
|
|
|
|
24
|
my $tf = $word_counter->frequencies; |
|
55
|
8
|
100
|
|
|
|
43
|
return $tf unless $normalize; |
|
56
|
|
|
|
|
|
|
|
|
57
|
3
|
50
|
|
|
|
8
|
return {} unless %$tf; |
|
58
|
|
|
|
|
|
|
|
|
59
|
3
|
|
50
|
|
|
19
|
my $total_words = sum(values %$tf) // 0; |
|
60
|
3
|
50
|
|
|
|
6
|
return {} unless $total_words; |
|
61
|
|
|
|
|
|
|
|
|
62
|
3
|
|
|
|
|
14
|
return +{ map { ($_ => $tf->{$_} / $total_words) } keys %$tf }; |
|
|
96
|
|
|
|
|
155
|
|
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
|
|
65
|
3
|
|
|
3
|
1
|
3112
|
method idf (%args) { |
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
4
|
|
|
66
|
|
|
|
|
|
|
croak "idf requires a documents argument" |
|
67
|
3
|
50
|
|
|
|
21
|
unless exists $args{documents}; |
|
68
|
|
|
|
|
|
|
|
|
69
|
3
|
|
|
|
|
4
|
my $documents = $args{documents}; |
|
70
|
3
|
50
|
|
|
|
7
|
croak "documents must be an arrayref" |
|
71
|
|
|
|
|
|
|
unless ref($documents) eq 'ARRAY'; |
|
72
|
|
|
|
|
|
|
|
|
73
|
3
|
50
|
|
|
|
7
|
return {} if @$documents == 0; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
my @tfs |
|
76
|
|
|
|
|
|
|
= ref($documents->[0]) |
|
77
|
|
|
|
|
|
|
? @$documents |
|
78
|
3
|
100
|
|
|
|
10
|
: map { $self->tf(document => \$_) } @$documents; |
|
|
2
|
|
|
|
|
6
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
3
|
|
|
|
|
6
|
my %seen_word; |
|
81
|
3
|
|
|
|
|
4
|
for my $tf (@tfs) { |
|
82
|
6
|
50
|
|
|
|
14
|
croak "each term-frequency entry must be a hashref" |
|
83
|
|
|
|
|
|
|
unless ref($tf) eq 'HASH'; |
|
84
|
6
|
|
|
|
|
101
|
$seen_word{$_} = 1 for keys %$tf; |
|
85
|
|
|
|
|
|
|
} |
|
86
|
|
|
|
|
|
|
|
|
87
|
3
|
|
|
|
|
4
|
my %idf; |
|
88
|
3
|
|
|
|
|
14
|
for my $word (keys %seen_word) { |
|
89
|
141
|
|
|
|
|
126
|
my $num_documents_including_word = grep { exists $_->{$word} } @tfs; |
|
|
282
|
|
|
|
|
321
|
|
|
90
|
141
|
50
|
|
|
|
207
|
next unless $num_documents_including_word; |
|
91
|
141
|
|
|
|
|
222
|
$idf{$word} = log(@tfs / $num_documents_including_word); |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
3
|
|
|
|
|
27
|
return \%idf; |
|
95
|
|
|
|
|
|
|
} |
|
96
|
|
|
|
|
|
|
|
|
97
|
2
|
|
|
2
|
1
|
4836
|
method tf_idf (%args) { |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
4
|
|
|
98
|
|
|
|
|
|
|
croak "tf_idf requires a documents argument" |
|
99
|
2
|
50
|
|
|
|
7
|
unless exists $args{documents}; |
|
100
|
|
|
|
|
|
|
|
|
101
|
2
|
|
|
|
|
5
|
my $documents = $args{documents}; |
|
102
|
2
|
|
100
|
|
|
13
|
my $normalize = $args{normalize} // 0; |
|
103
|
|
|
|
|
|
|
|
|
104
|
2
|
50
|
|
|
|
7
|
croak "documents must be an arrayref" |
|
105
|
|
|
|
|
|
|
unless ref($documents) eq 'ARRAY'; |
|
106
|
|
|
|
|
|
|
|
|
107
|
2
|
50
|
|
|
|
5
|
return [] if @$documents == 0; |
|
108
|
|
|
|
|
|
|
|
|
109
|
2
|
|
|
|
|
7
|
my @tfs = map { $self->tf(document => \$_, normalize => $normalize) } |
|
|
4
|
|
|
|
|
13
|
|
|
110
|
|
|
|
|
|
|
@$documents; |
|
111
|
|
|
|
|
|
|
|
|
112
|
2
|
|
|
|
|
9
|
my $idf = $self->idf(documents => \@tfs); |
|
113
|
|
|
|
|
|
|
|
|
114
|
2
|
|
|
|
|
3
|
my @tf_idf; |
|
115
|
2
|
|
|
|
|
3
|
for my $tf (@tfs) { |
|
116
|
4
|
|
|
|
|
15
|
push @tf_idf, +{ map { ($_ => $tf->{$_} * $idf->{$_}) } keys %$tf }; |
|
|
130
|
|
|
|
|
209
|
|
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
|
|
119
|
2
|
|
|
|
|
20
|
return \@tf_idf; |
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
1; |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
__END__ |