line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Categorize::Textrank; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
36283
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
49
|
|
4
|
1
|
|
|
1
|
|
9
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
45
|
|
5
|
1
|
|
|
1
|
|
3865
|
use Graph; |
|
1
|
|
|
|
|
182182
|
|
|
1
|
|
|
|
|
38
|
|
6
|
1
|
|
|
1
|
|
978
|
use Graph::Centrality::Pagerank; |
|
1
|
|
|
|
|
9212
|
|
|
1
|
|
|
|
|
67
|
|
7
|
1
|
|
|
1
|
|
14
|
use Data::Dump qw(dump); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
BEGIN { |
10
|
1
|
|
|
1
|
|
4
|
use Exporter (); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
20
|
|
11
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
97
|
|
12
|
1
|
|
|
1
|
|
3
|
$VERSION = '0.51'; |
13
|
1
|
|
|
|
|
14
|
@ISA = qw(Exporter); |
14
|
1
|
|
|
|
|
3
|
@EXPORT = qw(getTextrankOfListOfTokens); |
15
|
1
|
|
|
|
|
2
|
@EXPORT_OK = qw(getTextrankOfListOfTokens); |
16
|
1
|
|
|
|
|
431
|
%EXPORT_TAGS = (); |
17
|
|
|
|
|
|
|
} |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
#12345678901234567890123456789012345678901234 |
20
|
|
|
|
|
|
|
#Method to rank potential keywords of text. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 NAME |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
C - Method to rank potential keywords of text. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use strict; |
29
|
|
|
|
|
|
|
use warnings; |
30
|
|
|
|
|
|
|
use Text::Categorize::Textrank; |
31
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
32
|
|
|
|
|
|
|
my $listOfTokens = [ [qw(This is the first sentence)], [qw(Here is the second sentence)] ]; |
33
|
|
|
|
|
|
|
my $hashOfTextrankValues = getTextrankOfListOfTokens(listOfTokens => $listOfTokens); |
34
|
|
|
|
|
|
|
dump $hashOfTextrankValues; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
C provides a routine for ranking the words in |
39
|
|
|
|
|
|
|
text as potential keywords. It implements a version of the textrank algorithm |
40
|
|
|
|
|
|
|
from the report I by R. Mihalcea and P. Tarau. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 ROUTINES |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head2 C |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
The routine C returns a hash reference containing the textrank value |
47
|
|
|
|
|
|
|
for all the tokens in the lists provided; the textrank values sum to one. The textrank of a token |
48
|
|
|
|
|
|
|
is its pagerank in the graph obtained by joining neighboring tokens with an edge, called the |
49
|
|
|
|
|
|
|
token graph. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Usually, C should I be applied to all the words of the text. The |
52
|
|
|
|
|
|
|
complete textrank algorithm first filters the words in the text to only the nouns and adjectives. |
53
|
|
|
|
|
|
|
See L to compute the textrank of English text. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=over |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item C |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
listOfTokens => [[...], [...], ...[...]] |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
C is an array reference containing the list of tokens that are |
62
|
|
|
|
|
|
|
to be ranked using textrank. Each list is also an array reference of tokens that should |
63
|
|
|
|
|
|
|
correspond to the list of tokens in a sentence. For example, |
64
|
|
|
|
|
|
|
C<[[qw(This is the first sentence)], [qw(Here is the second sentence)]]>. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item C |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
edgeCreationSpan => 1 |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
For each token in the C, C is the number of successive |
71
|
|
|
|
|
|
|
tokens used to make an edge in the token graph. For example, if |
72
|
|
|
|
|
|
|
C is two, then given the token sequence C<"apple orange pear"> |
73
|
|
|
|
|
|
|
the edges C<[apple, orange]> and C<[apple, pear]> will be added to the token |
74
|
|
|
|
|
|
|
graph for the token C. The default is one. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Note that loop edges are ignored. For example, |
77
|
|
|
|
|
|
|
if C is two, then given the token sequence C<"daba daba doo"> |
78
|
|
|
|
|
|
|
the edge C<[daba, daba]> is disguarded but the edge C<[daba, doo]> is |
79
|
|
|
|
|
|
|
added to the token graph. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item C |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
directedGraph => 0 |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
If C is true, the textranks |
86
|
|
|
|
|
|
|
are computed from the directed token graph, if false, they |
87
|
|
|
|
|
|
|
are computed from the undirected version of the graph. The default is false. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item C |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
dampeningFactor => 0.85 |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
When computing the textranks of the token graph, the dampening factor |
94
|
|
|
|
|
|
|
specified by C will |
95
|
|
|
|
|
|
|
be used; it should range from zero to one. The default is 0.85. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=begin html |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
The Wikipedia article on pagerank has a good explaination of the |
100
|
|
|
|
|
|
|
dampening factor. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=end html |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item C |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
addEdgesSpanningLists => 1 |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
If C is true, then when building the token graph, links |
109
|
|
|
|
|
|
|
between the tokens at the end of a list and the beginning of the next list |
110
|
|
|
|
|
|
|
will be made. For example, for the lists C<[[qw(This is the first list)], [qw(Here is the second list)]]> |
111
|
|
|
|
|
|
|
the edge C<[list, Here]> will be added to the token graph. The default is true. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item C |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
tokenWeights => {} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
C is an optional hash reference that can provide a weight for a subset |
118
|
|
|
|
|
|
|
of the tokens provided by C. If C is not defined for any token |
119
|
|
|
|
|
|
|
in C, then each |
120
|
|
|
|
|
|
|
token has a weight of one. If C is defined for |
121
|
|
|
|
|
|
|
at least one node in the graph, then the default weight of any undefined |
122
|
|
|
|
|
|
|
token is zero. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=back |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub getTextrankOfListOfTokens |
129
|
|
|
|
|
|
|
{ |
130
|
31
|
|
|
31
|
1
|
706944
|
my %Parameters = @_; |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# if there are no lists, return undef. |
133
|
31
|
50
|
|
|
|
198
|
return undef unless exists $Parameters{listOfTokens}; |
134
|
31
|
|
|
|
|
81
|
my $listOfTokens = $Parameters{listOfTokens}; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# edgeCreationSpan holds the number of sucessive tokens linked by an edge for each token. |
137
|
|
|
|
|
|
|
# for example, given tokens "apple orange pear", the edges [apple, orange] and [apple, pear] |
138
|
|
|
|
|
|
|
# will be created for token apple if $edgeCreationSpan is two. |
139
|
31
|
|
|
|
|
60
|
my $edgeCreationSpan = 1; |
140
|
31
|
50
|
|
|
|
124
|
$edgeCreationSpan = abs $Parameters{edgeCreationSpan} if (exists ($Parameters{edgeCreationSpan})); |
141
|
31
|
50
|
|
|
|
119
|
$edgeCreationSpan = 1 if ($edgeCreationSpan < 1); |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# set the type of graph built from the tokens to compute the textrank. |
144
|
31
|
|
|
|
|
68
|
$Parameters{undirected} = 1; |
145
|
31
|
50
|
|
|
|
128
|
$Parameters{undirected} = !$Parameters{directedGraph} if exists ($Parameters{directedGraph}); |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# get the pagerank dampening factor. |
148
|
31
|
50
|
|
|
|
122
|
$Parameters{dampeningFactor} = 0.85 unless exists ($Parameters{dampeningFactor}); |
149
|
31
|
|
|
|
|
71
|
$Parameters{dampeningFactor} = abs $Parameters{dampeningFactor}; |
150
|
31
|
50
|
|
|
|
123
|
$Parameters{dampeningFactor} = 1 if ($Parameters{dampeningFactor} > 1); |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# set the addEdgesSpanningLists flag. |
153
|
31
|
|
|
|
|
45
|
my $addEdgesSpanningLists = 1; |
154
|
31
|
50
|
|
|
|
81
|
$addEdgesSpanningLists = $Parameters{addEdgesSpanningLists} if exists $Parameters{addEdgesSpanningLists}; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# build the graph of the links between tokens. |
157
|
31
|
|
|
|
|
245
|
my $graph = Graph->new(directed => !$Parameters{undirected}); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# if edges will span lists, make listOfTokens into just one list. |
160
|
31
|
50
|
|
|
|
9083
|
if ($addEdgesSpanningLists) |
161
|
|
|
|
|
|
|
{ |
162
|
31
|
|
|
|
|
53
|
my @allWords; |
163
|
31
|
|
|
|
|
86
|
foreach my $list (@$listOfTokens) |
164
|
|
|
|
|
|
|
{ |
165
|
362
|
|
|
|
|
1370
|
push @allWords, @$list; |
166
|
|
|
|
|
|
|
} |
167
|
31
|
|
|
|
|
126
|
$listOfTokens = [\@allWords]; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# add all the edges to the graph. |
171
|
31
|
|
|
|
|
78
|
foreach my $list (@$listOfTokens) |
172
|
|
|
|
|
|
|
{ |
173
|
|
|
|
|
|
|
# get the total tokens in the list. |
174
|
31
|
|
|
|
|
86
|
my $totalTokens = $#$list + 1; |
175
|
31
|
|
|
|
|
123
|
for (my $i = 0; $i < $totalTokens; $i++) |
176
|
|
|
|
|
|
|
{ |
177
|
|
|
|
|
|
|
# since isolalted vertices are possible, add the vertex. |
178
|
|
|
|
|
|
|
# for example, if a list had only one token in it, it would |
179
|
|
|
|
|
|
|
# not be linked to any other token. |
180
|
5097
|
|
|
|
|
960205
|
$graph->add_vertex ($list->[0]); |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# get the index of the last token to link to the current token in |
183
|
|
|
|
|
|
|
# the list. |
184
|
5097
|
|
|
|
|
140213
|
my $lastToken = $i + $edgeCreationSpan + 1; |
185
|
5097
|
100
|
|
|
|
10206
|
$lastToken = $totalTokens if ($lastToken > $totalTokens); |
186
|
5097
|
|
|
|
|
11317
|
for (my $j = $i + 1; $j < $lastToken; $j++) |
187
|
|
|
|
|
|
|
{ |
188
|
|
|
|
|
|
|
# skip loop edges. |
189
|
5066
|
100
|
|
|
|
15693
|
next if ($list->[$i] eq $list->[$j]); |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# create the edge. |
192
|
4931
|
|
|
|
|
12120
|
my @edge = ($list->[$i], $list->[$j]); |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# if the edge exists already, add to its weight. |
195
|
4931
|
100
|
|
|
|
13537
|
if ($graph->has_edge (@edge)) |
196
|
|
|
|
|
|
|
{ |
197
|
3306
|
|
|
|
|
83316
|
my $weight = $graph->get_edge_weight (@edge); |
198
|
3306
|
50
|
|
|
|
355707
|
$weight = 1 unless defined $weight; |
199
|
3306
|
|
|
|
|
3762
|
++$weight; |
200
|
3306
|
|
|
|
|
9211
|
$graph->add_weighted_edge (@edge, $weight); |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
else |
203
|
|
|
|
|
|
|
{ |
204
|
|
|
|
|
|
|
# edge does not exist, default edge weight is one. |
205
|
1625
|
|
|
|
|
31069
|
$graph->add_weighted_edge (@edge, 1); |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# get the pagerank computer. |
212
|
31
|
|
|
|
|
234
|
my $pageRankEngine = Graph::Centrality::Pagerank->new (); |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# set the parameter for inclusion of edge weights. |
215
|
31
|
|
|
|
|
6694
|
$Parameters{useEdgeWeights} = 1; |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# set the node weights if provided. |
218
|
31
|
50
|
33
|
|
|
122
|
$Parameters{nodeWeights} = $Parameters{tokenWeights} if ((exists $Parameters{tokenWeights}) && (defined $Parameters{tokenWeights})); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# compute and return the pagerank of the tokens. |
221
|
31
|
|
|
|
|
210
|
return $pageRankEngine->getPagerankOfNodes (%Parameters, graph => $graph); |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=head1 INSTALLATION |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
To install the module run the following commands: |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
perl Makefile.PL |
229
|
|
|
|
|
|
|
make |
230
|
|
|
|
|
|
|
make test |
231
|
|
|
|
|
|
|
make install |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
If you are on a windows box you should use 'nmake' rather than 'make'. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head1 BUGS |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
Please email bugs reports or feature requests to C, or through |
238
|
|
|
|
|
|
|
the web interface at L. The author |
239
|
|
|
|
|
|
|
will be notified and you can be automatically notified of progress on the bug fix or feature request. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 AUTHOR |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
Jeff Kubina |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 COPYRIGHT |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Copyright (c) 2009 Jeff Kubina. All rights reserved. |
248
|
|
|
|
|
|
|
This program is free software; you can redistribute |
249
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
The full text of the license can be found in the |
252
|
|
|
|
|
|
|
LICENSE file included with this module. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head1 KEYWORDS |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
categorize, keywords, keyphrases, nlp, pagerank, textrank |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head1 SEE ALSO |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=begin html |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
This package implements the Textrank algorithm from the report |
263
|
|
|
|
|
|
|
TextRank: Bringing Order into Texts |
264
|
|
|
|
|
|
|
by Rada Mihalcea and Paul Tarau; |
265
|
|
|
|
|
|
|
which is related to pagerank. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=end html |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
L, L, L, L |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=cut |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
1; |
274
|
|
|
|
|
|
|
# The preceding line will help the module return a true value |