|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::EN::Summarize;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
620
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
4
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4
 | 
 use Carp;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
73
 | 
    | 
| 
5
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
 use Exporter;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
    | 
| 
6
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
1768
 | 
 use Text::Wrap qw(wrap);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2892
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
    | 
| 
7
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
805
 | 
 use Text::Sentence qw(split_sentences);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
470
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
    | 
| 
8
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
501
 | 
 use Lingua::EN::Summarize::Filters;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
25
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4
 | 
 use vars qw($VERSION @ISA @EXPORT);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
100
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 @ISA = qw(Exporter);  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 @EXPORT = qw(summarize);  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 $VERSION = '0.2';  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub summarize {  | 
| 
18
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
0
  
 | 
204
 | 
   my ($text, %options) = @_;  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   ### This section massages the text into a usable format before summarizing.  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Run each filter over the text.  | 
| 
23
 | 
3
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
12
 | 
   return unless $text;  | 
| 
24
 | 
3
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
9
 | 
   if ($options{filter}) {  | 
| 
25
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my @filters = ref $options{filter} eq 'ARRAY' ?  | 
| 
26
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
                             @{$options{filter}} : $options{filter};  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     foreach (@filters) {  | 
| 
29
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4
 | 
       no strict 'refs';  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
517
 | 
    | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
1
 | 
  
 50
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
9
 | 
       if (ref $_ eq 'CODE') {  | 
| 
 
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
32
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$text = $_->( $text );  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } elsif (exists $Lingua::EN::Summarize::Filters::{$_}  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	       and *{"Lingua::EN::Summarize::Filters::$_"}{CODE}) {  | 
| 
35
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
 	$text = &{"Lingua::EN::Summarize::Filters::$_"}( $text );  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
    | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } else {  | 
| 
37
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	croak "Unknown text filter \"$_\"";  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Strip whitespace and formatting out of the text.  | 
| 
43
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
   $text =~ s/^\s+//;  | 
| 
44
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2425
 | 
   $text =~ s/\s+/ /sg;  | 
| 
45
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
763
 | 
   $text =~ s/\s+$//;  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
3
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
22
 | 
   unless (exists $options{maxlength} and $options{maxlength} > 0) {  | 
| 
48
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
105
 | 
     $options{maxlength} = length( $text ) / 30;  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   ### Here's where the interesting logic happens.  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # First we break it into sentence pieces. Kind of. Sort of.  | 
| 
55
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
   my $keywords = "(is|are|was|were|will|have)";  | 
| 
56
 | 
659
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3359
 | 
   my @clauses = grep { /\b$keywords\b/i }  | 
| 
 
 | 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4486
 | 
    | 
| 
57
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
                    map { split /(,|;|--)/ } split_sentences( $text );  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
59
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
66
 | 
   my $stopwords = "(and|but|instead)";  | 
| 
60
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
   foreach (@clauses) {  | 
| 
61
 | 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
348
 | 
     s/^\s+//;  | 
| 
62
 | 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1267
 | 
     s/^$stopwords\s+//i;  | 
| 
63
 | 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
258
 | 
     $_ = ucfirst;  | 
| 
64
 | 
170
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
755
 | 
     $_ .= ". " unless /[.!?]$/;  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Assemble the resulting phrases into the summary response.  | 
| 
68
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
   my $summary = '';  | 
| 
69
 | 
3
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
34
 | 
   while (@clauses and length $summary < $options{maxlength}) {  | 
| 
70
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
125
 | 
     $summary .= " " . shift @clauses;  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   ### Done! Do any necessary postprocessing before returning.  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Prettyprint the summary to make it look nice on a terminal, if requested.  | 
| 
77
 | 
3
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
12
 | 
   if ($options{wrap}) {  | 
| 
78
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $Text::Wrap::columns = $options{wrap};  | 
| 
79
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $summary = wrap( '', '', $summary );  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
   $summary =~ s/^\s+//mg;  | 
| 
83
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
143
 | 
   $summary =~ s/[ \t]+/ /g;    # if we use \s, that screws up the wrapping  | 
| 
84
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
54
 | 
   $summary =~ s/\s+$//mg;  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
   return $summary;  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |