File Coverage

blib/lib/Text/Summarize.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Text::Summarize;
2	1	1	392968	use strict;
	1		4
	1		40
3	1	1	6	use warnings;
	1		1
	1		30
4	1	1	1760	use Log::Log4perl;
	1		187936
	1		6
5	1	1	1847	use Text::Categorize::Textrank;
	0
	0
6				use Data::Dump qw(dump);
7
8				BEGIN
9				{
10				use Exporter ();
11				use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12				$VERSION = '0.50';
13				@ISA = qw(Exporter);
14				@EXPORT = qw(getSumbasicRankingOfSentences);
15				@EXPORT_OK = qw(getSumbasicRankingOfSentences);
16				%EXPORT_TAGS = ();
17				}
18
19				#12345678901234567890123456789012345678901234
20				#Routine to compute summaries of text.
21
22				=head1 NAME
23
24				C - Routine to compute summaries of text.
25
26				=head1 SYNOPSIS
27
28				use strict;
29				use warnings;
30				use Text::Summarize;
31				use Data::Dump qw(dump);
32				my $listOfSentences = [
33				{ id => 0, listOfTokens => [qw(all people are equal)] },
34				{ id => 1, listOfTokens => [qw(all men are equal)] },
35				{ id => 2, listOfTokens => [qw(all are equal)] },
36				];
37				dump getSumbasicRankingOfSentences(listOfSentences => $listOfSentences);
38
39				=head1 DESCRIPTION
40
41				C contains a routine to score a list of sentences
42				for inclusion in a summary of the text using the
43				SumBasic algorithm from the report I
44				by L. Vanderwendea, H. Suzukia, C. Brocketta, and A. Nenkovab.
45
46				=head1 ROUTINES
47
48				=head2 C
49
50				use Text::Summarize;
51				use Data::Dump qw(dump);
52				my $listOfSentences = [
53				{ id => 0, listOfTokens => [qw(all people are equal)] },
54				{ id => 1, listOfTokens => [qw(all men are equal)] },
55				{ id => 2, listOfTokens => [qw(all are equal)] },
56				];
57				dump getSumbasicRankingOfSentences(listOfSentences => $listOfSentences);
58
59				C computes the sumBasic score of the list of sentences
60				provided. It returns an array reference containing the pairs C<[id, score]> sorted
61				in descending order of score, where C is from C.
62
63				=over
64
65				=item C
66
67				listOfSentences => [{id => '..', listOfTokens => [...]}, ..., {id => '..', listOfTokens => [...]}]
68
69				C holds the list of sentences that are to be scored. Each
70				item in the list is a hash reference of the form C<{id =E '..', listOfTokens =E [...]}> where
71				C is a unique identifier for the sentence and C is an array
72				reference of the list of tokens comprizing the sentence.
73
74				=item C
75
76				tokenWeight => {}
77
78				C is a optional hash reference that provides the weight of the tokens defined
79				in C. If C is defined, but undefined for a token in a sentence,
80				then the tokens weight defaults to zero unless C is true,
81				in which case the token is ignored and not used to compute the average weight
82				of the sentences containing it. If C is undefined then the weights of the tokens
83				are either their frequency of occurrence in the filtered text, or their textranks if C is defined.
84
85				=item C
86
87				ignoreUndefinedTokens => 0
88
89				If C is true, then any tokens for which C is
90				undefined are ignored and not used to compute the average weight of a
91				sentence; the default is false.
92
93				=item C
94
95				tokenWeightUpdateFunction => &subroutine (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
96
97				C is an optional parameter for defining the function that updates the
98				weight of a token when it is contained in a selected sentence. Five parameters are passed to the
99				subroutine: the token's current weight (float), the token's initial weight (float), the token (string), the C of the
100				selected sentence (string), and the current average weight of the tokens in the selected sentence (float).
101				The default is L.
102
103				=item C
104
105				textRankParameters => undef
106
107				If C is defined, then the token weights
108				are computed using L. The parameters to use for L,
109				excluding the C parameters, can be set using the hash reference defined by C.
110				For example, C {directedGraph =E 1}> would make the textrank weights
111				be computed using a directed token graph.
112
113				=back
114
115				=cut
116
117				sub getSumbasicRankingOfSentences
118				{
119				my (%Parameters) = @_;
120
121				# get the list of sentences.
122				my $listOfSentences = $Parameters{listOfSentences} if exists $Parameters{listOfSentences};
123				return [] unless defined $listOfSentences;
124
125				# get the original token weights.
126				my $originalTokenWeights;
127				$originalTokenWeights = $Parameters{tokenWeight} if (exists($Parameters{tokenWeight}) && defined($Parameters{tokenWeight}));
128
129				# if textRankParameters is defined, compute the token weights via textrank.
130				if (exists($Parameters{textRankParameters}) && defined($Parameters{textRankParameters}))
131				{
132				$originalTokenWeights = _getTextRankWeightOfTokens(%Parameters, listOfSentences => $listOfSentences);
133				}
134
135				# if $originalTokenWeights is not defined, then use the frequency of the tokens as their weight.
136				if (!defined ($originalTokenWeights))
137				{
138				$originalTokenWeights = _getFrequencyWeightOfTokens(listOfSentences => $listOfSentences);
139				}
140
141				# get the function to update the weights of the tokens.
142				my $tokenWeightUpdateFunction = \&tokenWeightUpdateFunction_Squared;
143				$tokenWeightUpdateFunction = $Parameters{tokenWeightUpdateFunction} if exists $Parameters{tokenWeightUpdateFunction};
144
145				# set the flag for ignoreUndefinedTokens.
146				my $ignoreUndefinedTokens = exists $Parameters{ignoreUndefinedTokens} && $Parameters{ignoreUndefinedTokens};
147
148				# copy the weights of only the tokens that occur in the sentences.
149				# the default weight of a token is zero.
150				my %tokenWeight;
151				for (my $i = 0 ; $i < @$listOfSentences ; $i++)
152				{
153
154				# if the sentence has no id, skip it.
155				unless (exists $listOfSentences->[$i]->{id})
156				{
157
158				# get the list of tokens in the sentence as a string.
159				my $stringOfTokens;
160				if (exists($listOfSentences->[$i]->{listOfTokens}))
161				{
162				$stringOfTokens = join(' ', @{ $listOfSentences->[$i]->{listOfTokens} });
163				}
164
165				# create the message to log.
166				my $logger = Log::Log4perl->get_logger();
167				my $message;
168				if (defined $stringOfTokens)
169				{
170				$message = "warning: skipping sentence number $i with tokens $stringOfTokens since it is missing an id.\n";
171				}
172				else
173				{
174				$message = "warning: skipping sentence number $i since it is missing an id and listOfTokens.\n";
175				}
176
177				# log the message as a warning.
178				$logger->logwarn($message);
179
180				# skip processing the sentence.
181				next;
182				}
183
184				# get the listOfTokens of the sentence.
185				if ((exists $listOfSentences->[$i]->{listOfTokens}) && (@{ $listOfSentences->[$i]->{listOfTokens} }))
186				{
187				my $listOfTokens = $listOfSentences->[$i]->{listOfTokens};
188				foreach my $token (@$listOfTokens)
189				{
190
191				# if the weight is already defined for the token, skip it.
192				next if exists $tokenWeight{$token};
193
194				# if weight for token not defined, it defaults to zero if ignoreUndefinedTokens is false.
195				if (exists $originalTokenWeights->{$token})
196				{
197				$tokenWeight{$token} = $originalTokenWeights->{$token};
198				}
199				elsif (!$ignoreUndefinedTokens)
200				{
201				$tokenWeight{$token} = 0;
202				}
203				}
204				}
205				}
206
207				# normalize the token weights to sum to one.
208				my $sum = 0;
209				while (my ($token, $weight) = each %tokenWeight) { $sum += $weight; }
210				$sum = 1 if ($sum == 0);
211				while (my ($token, $weight) = each %tokenWeight) { $tokenWeight{$token} /= $sum; }
212
213				# keep a copy of the initial token weights.
214				my %initialTokenWeight = %tokenWeight;
215
216				# @listOfEmptySentenceIds will hold the list of empty sentence ids.
217				my @listOfEmptySentenceIds;
218
219				# make a copy of the list of sentences
220				my @localListOfSentences;
221				for (my $i = 0 ; $i < @$listOfSentences ; $i++)
222				{
223
224				# if the sentence has no id, skip it.
225				next unless exists $listOfSentences->[$i]->{id};
226
227				# copy the id of the sentence.
228				my %sentence;
229				$sentence{id} = $listOfSentences->[$i]->{id};
230
231				# convert the list of tokens in a sentence to a hash with the key as the token and the value its occurance in the sentence.
232				if ((exists $listOfSentences->[$i]->{listOfTokens}) && (@{ $listOfSentences->[$i]->{listOfTokens} }))
233				{
234				my %tokenCount;
235				my $empty = 1;
236				foreach my $token (@{ $listOfSentences->[$i]->{listOfTokens} })
237				{
238
239				# if the weight for the token is not defined, skip it.
240				if (exists $tokenWeight{$token})
241				{
242				++$tokenCount{$token};
243				$empty = 0;
244				}
245				}
246
247				# if the sentence has no defined tokens, store the id on @listOfEmptySentenceIds.
248				if ($empty)
249				{
250				push @listOfEmptySentenceIds, [ $listOfSentences->[$i]->{id}, scalar @{ $listOfSentences->[$i]->{listOfTokens} } ];
251				}
252				else
253				{
254				$sentence{tokenCounts} = \%tokenCount;
255
256				# store the sentence in a list.
257				push @localListOfSentences, \%sentence;
258				}
259				}
260				else
261				{
262
263				# if the sentence has no tokens, store the id on @listOfEmptySentenceIds.
264				push @listOfEmptySentenceIds, [ $listOfSentences->[$i]->{id}, scalar @{ $listOfSentences->[$i]->{listOfTokens} } ];
265				}
266				}
267
268				# compute the average weight of each sentence and initialize its selected flag to false.
269				for (my $i = 0 ; $i < @localListOfSentences ; $i++)
270				{
271
272				# get the pointer to the sentence.
273				my $sentence = $localListOfSentences[$i];
274
275				# compute the weight of the sentence.
276				my $weight = 0;
277				my $tokenCountSum = 0;
278				while (my ($token, $count) = each %{ $sentence->{tokenCounts} })
279				{
280				$weight += $count * $tokenWeight{$token};
281				$tokenCountSum += $count;
282				}
283				$sentence->{size} = $tokenCountSum;
284				$sentence->{weight} = $weight / $sentence->{size};
285
286				# initialize each sentence as not selected.
287				$sentence->{selected} = 0;
288				}
289
290				# build the inverted index of the sentences and tokens, called tokenSentenceIndex.
291				my %tokenSentenceIndex;
292				for (my $i = 0 ; $i < @localListOfSentences ; $i++)
293				{
294
295				# get the pointer to the sentence.
296				my $sentence = $localListOfSentences[$i];
297
298				# get the list of tokens in the sentence.
299				foreach my $token (keys %{ $sentence->{tokenCounts} })
300				{
301
302				# add the weightSentence pointer to the tokenSentenceIndex.
303				$tokenSentenceIndex{$token} = [] unless exists $tokenSentenceIndex{$token};
304
305				# note we are storing the index of the sentence, not the pointer to the sentence.
306				push @{ $tokenSentenceIndex{$token} }, $i;
307				}
308				}
309
310				# make the list of just the tokens.
311				my @listOfTokens = keys %tokenWeight;
312
313				# @rankedListOfSentences will hold the sentences in sumbasic order.
314				my @rankedListOfSentences;
315
316				# loop over the sentences until they have all been selected.
317				while (scalar(@rankedListOfSentences) < scalar(@localListOfSentences))
318				{
319
320				# if there are no tokens left, exit the loop.
321				last unless @listOfTokens > 0;
322
323				# get the token with the greatest (weight, length, -order).
324				my $maxIndex = 0;
325				my $maxToken = $listOfTokens[$maxIndex];
326				my $maxTokenWeight = $tokenWeight{$maxToken};
327				for (my $i = 1 ; $i < scalar(@listOfTokens) ; $i++)
328				{
329				my $cmp;
330				if ($maxTokenWeight < $tokenWeight{ $listOfTokens[$i] })
331				{
332
333				# $maxTokenWeight is smaller.
334				$cmp = -1;
335				}
336				elsif ($maxTokenWeight > $tokenWeight{ $listOfTokens[$i] })
337				{
338
339				# $maxTokenWeight is larger.
340				$cmp = 1;
341				}
342				else
343				{
344
345				# weights are equal, compare token lengths, choose the longer one.
346				$cmp = length($maxToken) <=> length($listOfTokens[$i]);
347
348				# if tokens have equal length, choose the one lexically smaller.
349				if ($cmp == 0) { $cmp = $listOfTokens[$i] cmp $maxToken; }
350				}
351
352				# if the current max is smaller, replace it.
353				if ($cmp == -1)
354				{
355				$maxIndex = $i;
356				$maxToken = $listOfTokens[$maxIndex];
357				$maxTokenWeight = $tokenWeight{$maxToken};
358				}
359				}
360
361				# copy the last token to where the max was, it may be popped off if there are no
362				# sentences left containing it.
363				$listOfTokens[$maxIndex] = $listOfTokens[-1];
364				$listOfTokens[-1] = $maxToken;
365
366				# if there are no sentences remaining with the token, move on to the next token.
367				unless (exists $tokenSentenceIndex{$maxToken})
368				{
369				pop @listOfTokens;
370				next;
371				}
372
373				# get the list of sentences that have the token.
374				my $listOfSentencesWithToken = $tokenSentenceIndex{$maxToken};
375
376				# if there are no sentences remaining with the token, move on to the next token.
377				unless (scalar(@$listOfSentencesWithToken) > 0)
378				{
379				pop @listOfTokens;
380				delete $tokenSentenceIndex{$maxToken};
381				next;
382				}
383
384				# find the sentence having the token with the highest weight not yet selected.
385				my $maxSentenceIndex;
386				my $maxSentence;
387				my @remainingListOfSentencesWithToken;
388				foreach my $sentenceIndex (@$listOfSentencesWithToken)
389				{
390
391				# get the pointer to the sentence.
392				my $sentence = $localListOfSentences[$sentenceIndex];
393
394				# skip the sentence if already selected.
395				next if $sentence->{selected};
396
397				# if no sentence has been selected, just take the first valid sentence.
398				unless (defined($maxSentence))
399				{
400				$maxSentence = $sentence;
401				$maxSentenceIndex = $sentenceIndex;
402				next;
403				}
404
405				# choose the sentence with the greater weight, or the greater size, or the lesser id.
406				my $cmp =
407				($sentence->{weight} <=> $maxSentence->{weight})
408				\|\| ($sentence->{size} <=> $maxSentence->{size})
409				\|\| ($sentence->{id} cmp $maxSentence->{id});
410
411				# store the new maximum sentence.
412				if ($cmp == 1)
413				{
414
415				# store the previous maximum as an unselected sentence.
416				push @remainingListOfSentencesWithToken, $maxSentenceIndex;
417				$maxSentence = $sentence;
418				$maxSentenceIndex = $sentenceIndex;
419				}
420				else
421				{
422
423				# store the current sentence as unselected.
424				push @remainingListOfSentencesWithToken, $sentenceIndex;
425				}
426				}
427
428				# update the list of sentences with the token that were not selected for the summary.
429				if (@remainingListOfSentencesWithToken == 0)
430				{
431				delete $tokenSentenceIndex{$maxToken};
432				}
433				else
434				{
435
436				# update the list of sentences that the token is contained in.
437				$tokenSentenceIndex{$maxToken} = \@remainingListOfSentencesWithToken;
438				}
439
440				# if no sentence selected, then there are no unselected sentences with the
441				# token, so move on to the next token.
442				unless (defined $maxSentence)
443				{
444				pop @listOfTokens;
445				delete $tokenSentenceIndex{$maxToken};
446				next;
447				}
448
449				# store the sentence selected and its weight.
450				$maxSentence->{selected} = 1;
451				push @rankedListOfSentences, [ $maxSentence, $maxSentence->{weight} ];
452
453				# update the weight of all the tokens in the max sentence.
454				my @sentenceTokens = keys %{ $maxSentence->{tokenCounts} };
455				foreach my $token (@sentenceTokens)
456				{
457
458				# (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
459				$tokenWeight{$token} =
460				&$tokenWeightUpdateFunction($tokenWeight{$token}, $initialTokenWeight{$token}, $token, $maxSentence->{id}, $maxSentence->{weight});
461				}
462
463				# get all of the sentences that share tokens with the max sentence.
464				my %sentencesToUpdate;
465				foreach my $token (@sentenceTokens)
466				{
467				next unless exists $tokenSentenceIndex{$token};
468				foreach my $sentenceIndex (@{ $tokenSentenceIndex{$token} })
469				{
470				$sentencesToUpdate{$sentenceIndex} = 1;
471				}
472				}
473				my @listOfSentencesToUpdate = keys %sentencesToUpdate;
474
475				# recompute the weight of the sentences that have tokens whose weight changed.
476				# floating point calculations will become unstable due to rounding errors if the
477				# old weights are subtracted and the new weights added. slower, but best to
478				# recompute the average weights by summing.
479				foreach my $sentenceIndex (@listOfSentencesToUpdate)
480				{
481
482				# get the pointer to the sentence.
483				my $sentence = $localListOfSentences[$sentenceIndex];
484
485				# skip the sentence if it was already selected.
486				next if $sentence->{selected};
487
488				# compute the weight of the sentence.
489				my $weight = 0;
490				while (my ($token, $count) = each %{ $sentence->{tokenCounts} })
491				{
492				$weight += $count * $tokenWeight{$token};
493				}
494				$sentence->{weight} = $weight / $sentence->{size};
495				}
496				}
497
498				# normalize the sentence weights so they sum to one.
499				my $totalSentenceWeight = 0;
500				foreach my $sentenceWeight (@rankedListOfSentences)
501				{
502				$totalSentenceWeight += $sentenceWeight->[1];
503				}
504				$totalSentenceWeight = 1 if ($totalSentenceWeight == 0);
505
506				foreach my $sentenceWeight (@rankedListOfSentences)
507				{
508
509				# normalize the sentence weight.
510				$sentenceWeight = [ $sentenceWeight->[0]->{id}, $sentenceWeight->[1] / $totalSentenceWeight ];
511				}
512
513				# add the empty sentences to the list.
514				push @rankedListOfSentences, map { [ $_->[0], 0 ] } sort { ($a->[1] <=> $b->[1]) \|\| ($a->[0] cmp $b->[0]) } @listOfEmptySentenceIds;
515
516				# adjust the weights to be descending (a kludge).
517				if (@rankedListOfSentences)
518				{
519				my $totalSentenceWeight = 0;
520				my $runningSum = 0;
521				for (my $i = @rankedListOfSentences - 1; $i > -1; $i--)
522				{
523				$runningSum += $rankedListOfSentences[$i]->[1];
524				$rankedListOfSentences[$i]->[1] = $runningSum;
525				$totalSentenceWeight += $rankedListOfSentences[$i]->[1];
526				}
527				$totalSentenceWeight = 1 if ($totalSentenceWeight <= 0);
528				foreach my $idWeight (@rankedListOfSentences)
529				{
530				$idWeight->[1] = abs ($idWeight->[1]/ $totalSentenceWeight);
531				}
532				}
533
534				return \@rankedListOfSentences;
535				}
536
537				=head2 C
538
539				Returns the tokens current weight squared.
540
541				=cut
542
543				sub tokenWeightUpdateFunction_Squared # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
544				{
545				return $_[0] * $_[0];
546				}
547
548				=head2 C
549
550				Returns the tokens current weight times its intial weight.
551
552				=cut
553
554				sub tokenWeightUpdateFunction_Multiplicative # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
555				{
556				return $_[0] * $_[1];
557				}
558
559				=head2 C
560
561				Returns the tokens current weight times its the average weight of the tokens in the selected sentence.
562
563				=cut
564
565				sub tokenWeightUpdateFunction_Sentence # (currentTokenWeight, initialTokenWeight, token, selectedSentenceId, selectedSentenceWeight)
566				{
567				return $_[0] * $_[4];
568				}
569
570				# computes the textrank of the tokens.
571				sub _getTextRankWeightOfTokens
572				{
573				my %Parameters = @_;
574
575				# use any textrank parameters if defined.
576				my %textRankParameters;
577				%textRankParameters = %{ $Parameters{textRankParameters} } if ((exists $Parameters{textRankParameters}) && (defined $Parameters{textRankParameters}));
578
579				# if no sentences, return now.
580				return {} unless exists $Parameters{listOfSentences};
581				my $listOfSentences = $Parameters{listOfSentences};
582
583				# build the list of tokens.
584				my @listOfTokens = map { ($_->{listOfTokens}) } @$listOfSentences;
585
586				# return the textrank of each token.
587				return getTextrankOfListOfTokens(%textRankParameters, listOfTokens => \@listOfTokens);
588				}
589
590				# computes the frequency of the tokens.
591				sub _getFrequencyWeightOfTokens
592				{
593				my %Parameters = @_;
594
595				# if no sentences, return now.
596				return {} unless exists $Parameters{listOfSentences};
597				my $listOfSentences = $Parameters{listOfSentences};
598
599				# compute total occurrence and frequency of the tokens.
600				my $totalOccurrence = 0;
601				my %tokenFrequency;
602
603				foreach my $sentence (@$listOfSentences)
604				{
605				foreach my $token (@{$sentence->{listOfTokens}})
606				{
607				++$tokenFrequency{$token};
608				++$totalOccurrence;
609				}
610				}
611				$totalOccurrence = 1 if $totalOccurrence < 1;
612
613				while (my ($token, undef) = each %tokenFrequency)
614				{
615				$tokenFrequency{$token} /= $totalOccurrence;
616				}
617
618				# return the frequency of each token.
619				return \%tokenFrequency;
620				}
621
622				=head1 INSTALLATION
623
624				Use L to install the module and all its prerequisites:
625
626				perl -MCPAN -e shell
627				>install Text::Summarize
628
629				=head1 BUGS
630
631				Please email bugs reports or feature requests to C, or through
632				the web interface at L. The author
633				will be notified and you can be automatically notified of progress on the bug fix or feature request.
634
635				=head1 AUTHOR
636
637				Jeff Kubina
638
639				=head1 COPYRIGHT
640
641				Copyright (c) 2009 Jeff Kubina. All rights reserved.
642				This program is free software; you can redistribute
643				it and/or modify it under the same terms as Perl itself.
644
645				The full text of the license can be found in the
646				LICENSE file included with this module.
647
648				=head1 KEYWORDS
649
650				information processing, summary, summaries, summarization, summarize, sumbasic, textrank
651
652				=head1 SEE ALSO
653
654				L, L, L
655
656				=begin html
657
658				The SumBasic algorithm for ranking sentences is from
659				Beyond SumBasic: Task-Focused Summarization with Sentence Simplification and Lexical Expansion
660				by L. Vanderwendea, H. Suzukia, C. Brocketta, and A. Nenkovab.
661
662				=end html
663
664				=cut
665
666				1;
667
668				# The preceding line will help the module return a true value
669