| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Categorize::Textrank; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
36283
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
49
|
|
|
4
|
1
|
|
|
1
|
|
9
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
45
|
|
|
5
|
1
|
|
|
1
|
|
3865
|
use Graph; |
|
|
1
|
|
|
|
|
182182
|
|
|
|
1
|
|
|
|
|
38
|
|
|
6
|
1
|
|
|
1
|
|
978
|
use Graph::Centrality::Pagerank; |
|
|
1
|
|
|
|
|
9212
|
|
|
|
1
|
|
|
|
|
67
|
|
|
7
|
1
|
|
|
1
|
|
14
|
use Data::Dump qw(dump); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
BEGIN { |
|
10
|
1
|
|
|
1
|
|
4
|
use Exporter (); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
20
|
|
|
11
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
97
|
|
|
12
|
1
|
|
|
1
|
|
3
|
$VERSION = '0.51'; |
|
13
|
1
|
|
|
|
|
14
|
@ISA = qw(Exporter); |
|
14
|
1
|
|
|
|
|
3
|
@EXPORT = qw(getTextrankOfListOfTokens); |
|
15
|
1
|
|
|
|
|
2
|
@EXPORT_OK = qw(getTextrankOfListOfTokens); |
|
16
|
1
|
|
|
|
|
431
|
%EXPORT_TAGS = (); |
|
17
|
|
|
|
|
|
|
} |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
#12345678901234567890123456789012345678901234 |
|
20
|
|
|
|
|
|
|
#Method to rank potential keywords of text. |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 NAME |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
C - Method to rank potential keywords of text. |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use strict; |
|
29
|
|
|
|
|
|
|
use warnings; |
|
30
|
|
|
|
|
|
|
use Text::Categorize::Textrank; |
|
31
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
|
32
|
|
|
|
|
|
|
my $listOfTokens = [ [qw(This is the first sentence)], [qw(Here is the second sentence)] ]; |
|
33
|
|
|
|
|
|
|
my $hashOfTextrankValues = getTextrankOfListOfTokens(listOfTokens => $listOfTokens); |
|
34
|
|
|
|
|
|
|
dump $hashOfTextrankValues; |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
C provides a routine for ranking the words in |
|
39
|
|
|
|
|
|
|
text as potential keywords. It implements a version of the textrank algorithm |
|
40
|
|
|
|
|
|
|
from the report I by R. Mihalcea and P. Tarau. |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 ROUTINES |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head2 C |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
The routine C returns a hash reference containing the textrank value |
|
47
|
|
|
|
|
|
|
for all the tokens in the lists provided; the textrank values sum to one. The textrank of a token |
|
48
|
|
|
|
|
|
|
is its pagerank in the graph obtained by joining neighboring tokens with an edge, called the |
|
49
|
|
|
|
|
|
|
token graph. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Usually, C should I be applied to all the words of the text. The |
|
52
|
|
|
|
|
|
|
complete textrank algorithm first filters the words in the text to only the nouns and adjectives. |
|
53
|
|
|
|
|
|
|
See L to compute the textrank of English text. |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=over |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item C |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
listOfTokens => [[...], [...], ...[...]] |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
C is an array reference containing the list of tokens that are |
|
62
|
|
|
|
|
|
|
to be ranked using textrank. Each list is also an array reference of tokens that should |
|
63
|
|
|
|
|
|
|
correspond to the list of tokens in a sentence. For example, |
|
64
|
|
|
|
|
|
|
C<[[qw(This is the first sentence)], [qw(Here is the second sentence)]]>. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item C |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
edgeCreationSpan => 1 |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
For each token in the C, C is the number of successive |
|
71
|
|
|
|
|
|
|
tokens used to make an edge in the token graph. For example, if |
|
72
|
|
|
|
|
|
|
C is two, then given the token sequence C<"apple orange pear"> |
|
73
|
|
|
|
|
|
|
the edges C<[apple, orange]> and C<[apple, pear]> will be added to the token |
|
74
|
|
|
|
|
|
|
graph for the token C. The default is one. |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Note that loop edges are ignored. For example, |
|
77
|
|
|
|
|
|
|
if C is two, then given the token sequence C<"daba daba doo"> |
|
78
|
|
|
|
|
|
|
the edge C<[daba, daba]> is disguarded but the edge C<[daba, doo]> is |
|
79
|
|
|
|
|
|
|
added to the token graph. |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=item C |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
directedGraph => 0 |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
If C is true, the textranks |
|
86
|
|
|
|
|
|
|
are computed from the directed token graph, if false, they |
|
87
|
|
|
|
|
|
|
are computed from the undirected version of the graph. The default is false. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item C |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
dampeningFactor => 0.85 |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
When computing the textranks of the token graph, the dampening factor |
|
94
|
|
|
|
|
|
|
specified by C will |
|
95
|
|
|
|
|
|
|
be used; it should range from zero to one. The default is 0.85. |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=begin html |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
The Wikipedia article on pagerank has a good explaination of the |
|
100
|
|
|
|
|
|
|
dampening factor. |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=end html |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item C |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
addEdgesSpanningLists => 1 |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
If C is true, then when building the token graph, links |
|
109
|
|
|
|
|
|
|
between the tokens at the end of a list and the beginning of the next list |
|
110
|
|
|
|
|
|
|
will be made. For example, for the lists C<[[qw(This is the first list)], [qw(Here is the second list)]]> |
|
111
|
|
|
|
|
|
|
the edge C<[list, Here]> will be added to the token graph. The default is true. |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item C |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
tokenWeights => {} |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
C is an optional hash reference that can provide a weight for a subset |
|
118
|
|
|
|
|
|
|
of the tokens provided by C. If C is not defined for any token |
|
119
|
|
|
|
|
|
|
in C, then each |
|
120
|
|
|
|
|
|
|
token has a weight of one. If C is defined for |
|
121
|
|
|
|
|
|
|
at least one node in the graph, then the default weight of any undefined |
|
122
|
|
|
|
|
|
|
token is zero. |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=back |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub getTextrankOfListOfTokens |
|
129
|
|
|
|
|
|
|
{ |
|
130
|
31
|
|
|
31
|
1
|
706944
|
my %Parameters = @_; |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# if there are no lists, return undef. |
|
133
|
31
|
50
|
|
|
|
198
|
return undef unless exists $Parameters{listOfTokens}; |
|
134
|
31
|
|
|
|
|
81
|
my $listOfTokens = $Parameters{listOfTokens}; |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# edgeCreationSpan holds the number of sucessive tokens linked by an edge for each token. |
|
137
|
|
|
|
|
|
|
# for example, given tokens "apple orange pear", the edges [apple, orange] and [apple, pear] |
|
138
|
|
|
|
|
|
|
# will be created for token apple if $edgeCreationSpan is two. |
|
139
|
31
|
|
|
|
|
60
|
my $edgeCreationSpan = 1; |
|
140
|
31
|
50
|
|
|
|
124
|
$edgeCreationSpan = abs $Parameters{edgeCreationSpan} if (exists ($Parameters{edgeCreationSpan})); |
|
141
|
31
|
50
|
|
|
|
119
|
$edgeCreationSpan = 1 if ($edgeCreationSpan < 1); |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# set the type of graph built from the tokens to compute the textrank. |
|
144
|
31
|
|
|
|
|
68
|
$Parameters{undirected} = 1; |
|
145
|
31
|
50
|
|
|
|
128
|
$Parameters{undirected} = !$Parameters{directedGraph} if exists ($Parameters{directedGraph}); |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# get the pagerank dampening factor. |
|
148
|
31
|
50
|
|
|
|
122
|
$Parameters{dampeningFactor} = 0.85 unless exists ($Parameters{dampeningFactor}); |
|
149
|
31
|
|
|
|
|
71
|
$Parameters{dampeningFactor} = abs $Parameters{dampeningFactor}; |
|
150
|
31
|
50
|
|
|
|
123
|
$Parameters{dampeningFactor} = 1 if ($Parameters{dampeningFactor} > 1); |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# set the addEdgesSpanningLists flag. |
|
153
|
31
|
|
|
|
|
45
|
my $addEdgesSpanningLists = 1; |
|
154
|
31
|
50
|
|
|
|
81
|
$addEdgesSpanningLists = $Parameters{addEdgesSpanningLists} if exists $Parameters{addEdgesSpanningLists}; |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# build the graph of the links between tokens. |
|
157
|
31
|
|
|
|
|
245
|
my $graph = Graph->new(directed => !$Parameters{undirected}); |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# if edges will span lists, make listOfTokens into just one list. |
|
160
|
31
|
50
|
|
|
|
9083
|
if ($addEdgesSpanningLists) |
|
161
|
|
|
|
|
|
|
{ |
|
162
|
31
|
|
|
|
|
53
|
my @allWords; |
|
163
|
31
|
|
|
|
|
86
|
foreach my $list (@$listOfTokens) |
|
164
|
|
|
|
|
|
|
{ |
|
165
|
362
|
|
|
|
|
1370
|
push @allWords, @$list; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
31
|
|
|
|
|
126
|
$listOfTokens = [\@allWords]; |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# add all the edges to the graph. |
|
171
|
31
|
|
|
|
|
78
|
foreach my $list (@$listOfTokens) |
|
172
|
|
|
|
|
|
|
{ |
|
173
|
|
|
|
|
|
|
# get the total tokens in the list. |
|
174
|
31
|
|
|
|
|
86
|
my $totalTokens = $#$list + 1; |
|
175
|
31
|
|
|
|
|
123
|
for (my $i = 0; $i < $totalTokens; $i++) |
|
176
|
|
|
|
|
|
|
{ |
|
177
|
|
|
|
|
|
|
# since isolalted vertices are possible, add the vertex. |
|
178
|
|
|
|
|
|
|
# for example, if a list had only one token in it, it would |
|
179
|
|
|
|
|
|
|
# not be linked to any other token. |
|
180
|
5097
|
|
|
|
|
960205
|
$graph->add_vertex ($list->[0]); |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# get the index of the last token to link to the current token in |
|
183
|
|
|
|
|
|
|
# the list. |
|
184
|
5097
|
|
|
|
|
140213
|
my $lastToken = $i + $edgeCreationSpan + 1; |
|
185
|
5097
|
100
|
|
|
|
10206
|
$lastToken = $totalTokens if ($lastToken > $totalTokens); |
|
186
|
5097
|
|
|
|
|
11317
|
for (my $j = $i + 1; $j < $lastToken; $j++) |
|
187
|
|
|
|
|
|
|
{ |
|
188
|
|
|
|
|
|
|
# skip loop edges. |
|
189
|
5066
|
100
|
|
|
|
15693
|
next if ($list->[$i] eq $list->[$j]); |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# create the edge. |
|
192
|
4931
|
|
|
|
|
12120
|
my @edge = ($list->[$i], $list->[$j]); |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# if the edge exists already, add to its weight. |
|
195
|
4931
|
100
|
|
|
|
13537
|
if ($graph->has_edge (@edge)) |
|
196
|
|
|
|
|
|
|
{ |
|
197
|
3306
|
|
|
|
|
83316
|
my $weight = $graph->get_edge_weight (@edge); |
|
198
|
3306
|
50
|
|
|
|
355707
|
$weight = 1 unless defined $weight; |
|
199
|
3306
|
|
|
|
|
3762
|
++$weight; |
|
200
|
3306
|
|
|
|
|
9211
|
$graph->add_weighted_edge (@edge, $weight); |
|
201
|
|
|
|
|
|
|
} |
|
202
|
|
|
|
|
|
|
else |
|
203
|
|
|
|
|
|
|
{ |
|
204
|
|
|
|
|
|
|
# edge does not exist, default edge weight is one. |
|
205
|
1625
|
|
|
|
|
31069
|
$graph->add_weighted_edge (@edge, 1); |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# get the pagerank computer. |
|
212
|
31
|
|
|
|
|
234
|
my $pageRankEngine = Graph::Centrality::Pagerank->new (); |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# set the parameter for inclusion of edge weights. |
|
215
|
31
|
|
|
|
|
6694
|
$Parameters{useEdgeWeights} = 1; |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# set the node weights if provided. |
|
218
|
31
|
50
|
33
|
|
|
122
|
$Parameters{nodeWeights} = $Parameters{tokenWeights} if ((exists $Parameters{tokenWeights}) && (defined $Parameters{tokenWeights})); |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# compute and return the pagerank of the tokens. |
|
221
|
31
|
|
|
|
|
210
|
return $pageRankEngine->getPagerankOfNodes (%Parameters, graph => $graph); |
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=head1 INSTALLATION |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
To install the module run the following commands: |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
perl Makefile.PL |
|
229
|
|
|
|
|
|
|
make |
|
230
|
|
|
|
|
|
|
make test |
|
231
|
|
|
|
|
|
|
make install |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
If you are on a windows box you should use 'nmake' rather than 'make'. |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head1 BUGS |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
Please email bugs reports or feature requests to C, or through |
|
238
|
|
|
|
|
|
|
the web interface at L. The author |
|
239
|
|
|
|
|
|
|
will be notified and you can be automatically notified of progress on the bug fix or feature request. |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 AUTHOR |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
Jeff Kubina |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Copyright (c) 2009 Jeff Kubina. All rights reserved. |
|
248
|
|
|
|
|
|
|
This program is free software; you can redistribute |
|
249
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
The full text of the license can be found in the |
|
252
|
|
|
|
|
|
|
LICENSE file included with this module. |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head1 KEYWORDS |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
categorize, keywords, keyphrases, nlp, pagerank, textrank |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=begin html |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
This package implements the Textrank algorithm from the report |
|
263
|
|
|
|
|
|
|
TextRank: Bringing Order into Texts |
|
264
|
|
|
|
|
|
|
by Rada Mihalcea and Paul Tarau; |
|
265
|
|
|
|
|
|
|
which is related to pagerank. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=end html |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
L, L, L, L |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=cut |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
1; |
|
274
|
|
|
|
|
|
|
# The preceding line will help the module return a true value |