| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package HTML::Summary; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | #============================================================================== | 
| 4 |  |  |  |  |  |  | # | 
| 5 |  |  |  |  |  |  | # Start of POD | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | #============================================================================== | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | =head1 NAME | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | HTML::Summary - generate a summary from a web page | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | use HTML::Summary; | 
| 16 |  |  |  |  |  |  | use HTML::TreeBuilder; | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | my $tree = HTML::TreeBuilder->new; | 
| 19 |  |  |  |  |  |  | $tree->parse( $document ); | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | my $summarizer = HTML::Summary->new( | 
| 22 |  |  |  |  |  |  | LENGTH      => 200, | 
| 23 |  |  |  |  |  |  | USE_META    => 1, | 
| 24 |  |  |  |  |  |  | ); | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | $summary = $summarizer->generate( $tree ); | 
| 27 |  |  |  |  |  |  | $summarizer->option( 'USE_META' => 1 ); | 
| 28 |  |  |  |  |  |  | $length = $summarizer->option( 'LENGTH' ); | 
| 29 |  |  |  |  |  |  | if ( $summarizer->meta_used() ) { | 
| 30 |  |  |  |  |  |  | # do something | 
| 31 |  |  |  |  |  |  | } | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | The C module produces summaries from the textual content of | 
| 36 |  |  |  |  |  |  | web pages. It does so using the location heuristic, which determines the value | 
| 37 |  |  |  |  |  |  | of a given sentence based on its position and status within the document; for | 
| 38 |  |  |  |  |  |  | example, headings, section titles and opening paragraph sentences may be | 
| 39 |  |  |  |  |  |  | favoured over other textual content. A LENGTH option can be used to restrict | 
| 40 |  |  |  |  |  |  | the length of the summary produced. | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | =head1 CONSTRUCTOR | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | =head2 new( $attr1 => $value1 [, $attr2 => $value2 ] ) | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | Possible attributes are: | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | =over 4 | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | =item VERBOSE | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | Generate verbose messages to STDERR. | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | =item LENGTH | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | Maximum length of summary (in bytes). Default is 500. | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | =item USE_META | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | Flag to tell summarizer whether to use the content of the C<> tag | 
| 61 |  |  |  |  |  |  | in the page header, if one is present, instead of generating a summary from the | 
| 62 |  |  |  |  |  |  | body text. B if the USE_META flag is set, this overrides the LENGTH | 
| 63 |  |  |  |  |  |  | flag - in other words, the summary provided by the C<> tag is | 
| 64 |  |  |  |  |  |  | returned in full, even if it is greater than LENGTH bytes. Default is 0 (no). | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | =back | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | my $summarizer = HTML::Summary->new(LENGTH => 200); | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | =head1 METHODS | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | =head2 option( ) | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | Get / set HTML::Summary configuration options. | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | my $length = $summarizer->option( 'LENGTH' ); | 
| 77 |  |  |  |  |  |  | $summarizer->option( 'USE_META' => 1 ); | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | =head2 generate( $tree ) | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | Takes an HTML::Element object, and generates a summary from it. | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | my $tree = HTML::TreeBuilder->new; | 
| 84 |  |  |  |  |  |  | $tree->parse( $document ); | 
| 85 |  |  |  |  |  |  | my $summary = $summarizer->generate( $tree ); | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | =head2 meta_used( ) | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | Returns 1 if the META tag description was used to generate the summary. | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | if ( $summarizer->meta_used() ) { | 
| 92 |  |  |  |  |  |  | # do something ... | 
| 93 |  |  |  |  |  |  | } | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | L, | 
| 98 |  |  |  |  |  |  | L, | 
| 99 |  |  |  |  |  |  | L, | 
| 100 |  |  |  |  |  |  | L. | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | =head1 REPOSITORY | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | L | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | =head1 AUTHORS | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | This module was originally whipped up by Neil Bowers and Tony Rose. | 
| 109 |  |  |  |  |  |  | It was then developed and maintained by Ave Wrigley and Tony Rose. | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | Neil Bowers is currently maintaining the HTML-Summary distribution. | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | Neil Bowers Eneilb@cpan.orgE | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved. | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 120 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | =cut | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | #============================================================================== | 
| 125 |  |  |  |  |  |  | # | 
| 126 |  |  |  |  |  |  | # End of POD | 
| 127 |  |  |  |  |  |  | # | 
| 128 |  |  |  |  |  |  | #============================================================================== | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | #============================================================================== | 
| 131 |  |  |  |  |  |  | # | 
| 132 |  |  |  |  |  |  | # Pragmas | 
| 133 |  |  |  |  |  |  | # | 
| 134 |  |  |  |  |  |  | #============================================================================== | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | require 5.006; | 
| 137 | 2 |  |  | 2 |  | 1503 | use strict; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 48 |  | 
| 138 | 2 |  |  | 2 |  | 8 | use warnings; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 59 |  | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | #============================================================================== | 
| 141 |  |  |  |  |  |  | # | 
| 142 |  |  |  |  |  |  | # Modules | 
| 143 |  |  |  |  |  |  | # | 
| 144 |  |  |  |  |  |  | #============================================================================== | 
| 145 |  |  |  |  |  |  |  | 
| 146 | 2 |  |  | 2 |  | 1019 | use Text::Sentence qw( split_sentences ); | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 121 |  | 
| 147 | 2 |  |  | 2 |  | 1127 | use Lingua::JA::Jtruncate qw( jtruncate ); | 
|  | 2 |  |  |  |  | 7 |  | 
|  | 2 |  |  |  |  | 140 |  | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | #============================================================================== | 
| 150 |  |  |  |  |  |  | # | 
| 151 |  |  |  |  |  |  | # Constants | 
| 152 |  |  |  |  |  |  | # | 
| 153 |  |  |  |  |  |  | #============================================================================== | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 2 |  |  | 2 |  | 13 | use constant IGNORE_TEXT => 1; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 137 |  | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | #============================================================================== | 
| 158 |  |  |  |  |  |  | # | 
| 159 |  |  |  |  |  |  | # Public globals | 
| 160 |  |  |  |  |  |  | # | 
| 161 |  |  |  |  |  |  | #============================================================================== | 
| 162 |  |  |  |  |  |  |  | 
| 163 | 2 |  |  | 2 |  | 10 | use vars qw( $VERSION ); | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 2786 |  | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | our $VERSION = '0.020'; | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | #============================================================================== | 
| 168 |  |  |  |  |  |  | # | 
| 169 |  |  |  |  |  |  | # Private globals | 
| 170 |  |  |  |  |  |  | # | 
| 171 |  |  |  |  |  |  | #============================================================================== | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | my $DEFAULT_SCORE = 0; | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | my %ELEMENT_SCORES = ( | 
| 176 |  |  |  |  |  |  | 'p'         => 100, | 
| 177 |  |  |  |  |  |  | 'h1'        => 90, | 
| 178 |  |  |  |  |  |  | 'h2'        => 80, | 
| 179 |  |  |  |  |  |  | 'h3'        => 70, | 
| 180 |  |  |  |  |  |  | ); | 
| 181 |  |  |  |  |  |  |  | 
| 182 |  |  |  |  |  |  | my %DEFAULTS = ( | 
| 183 |  |  |  |  |  |  | 'USE_META'  => 0, | 
| 184 |  |  |  |  |  |  | 'VERBOSE'   => 0, | 
| 185 |  |  |  |  |  |  | 'LENGTH'    => 500, | 
| 186 |  |  |  |  |  |  | ); | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | #============================================================================== | 
| 189 |  |  |  |  |  |  | # | 
| 190 |  |  |  |  |  |  | # Public methods | 
| 191 |  |  |  |  |  |  | # | 
| 192 |  |  |  |  |  |  | #============================================================================== | 
| 193 |  |  |  |  |  |  |  | 
| 194 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 195 |  |  |  |  |  |  | # | 
| 196 |  |  |  |  |  |  | # new - constructor. Configuration through "hash" type arguments, i.e. | 
| 197 |  |  |  |  |  |  | # my $abs = HTML::Summary->new( VAR1 => 'foo', VAR2 => 'bar' ); | 
| 198 |  |  |  |  |  |  | # | 
| 199 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | sub new | 
| 202 |  |  |  |  |  |  | { | 
| 203 | 25 |  |  | 25 | 1 | 955053 | my $class = shift; | 
| 204 | 25 |  |  |  |  | 62 | my $self = bless { }, $class; | 
| 205 | 25 |  |  |  |  | 96 | return $self->_initialize( @_ ); | 
| 206 |  |  |  |  |  |  | } | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 209 |  |  |  |  |  |  | # | 
| 210 |  |  |  |  |  |  | # generate - main public interface method to generate a summary | 
| 211 |  |  |  |  |  |  | # | 
| 212 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | sub generate | 
| 215 |  |  |  |  |  |  | { | 
| 216 | 26 |  |  | 26 | 1 | 2564 | my $self = shift; | 
| 217 | 26 |  |  |  |  | 42 | my $tree = shift; | 
| 218 |  |  |  |  |  |  |  | 
| 219 | 26 |  |  |  |  | 48 | my $summary; | 
| 220 |  |  |  |  |  |  |  | 
| 221 | 26 |  |  |  |  | 80 | $self->_verbose( 'Generate summary ...' ); | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | # check to see if there is a summary already defined in a META tag ... | 
| 224 |  |  |  |  |  |  |  | 
| 225 | 26 | 100 | 100 |  |  | 101 | if ( | 
| 226 |  |  |  |  |  |  | $self->{ USE_META } and | 
| 227 |  |  |  |  |  |  | $summary = $self->_get_summary_from_meta( $tree ) | 
| 228 |  |  |  |  |  |  | ) | 
| 229 |  |  |  |  |  |  | { | 
| 230 | 1 |  |  |  |  | 3 | $self->_verbose( "use summary from META tag ..." ); | 
| 231 | 1 |  |  |  |  | 2 | $self->_verbose( $summary ); | 
| 232 | 1 |  |  |  |  | 2 | return $summary; | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | # traverse the HTML tree, building up @summary array | 
| 236 |  |  |  |  |  |  |  | 
| 237 | 25 |  |  |  |  | 85 | my @summary = $self->_get_summary( $tree ); | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | # sort @summary by score, truncate if it is greater than LENGTH | 
| 240 |  |  |  |  |  |  | # characters, and the re-sort by original order. Truncate AFTER the LENGTH | 
| 241 |  |  |  |  |  |  | # has been exceeded, so that last sentence is truncated later by | 
| 242 |  |  |  |  |  |  | # jtruncate | 
| 243 |  |  |  |  |  |  |  | 
| 244 | 25 |  |  |  |  | 148 | @summary = sort { $b->{ score } <=> $a->{ score } } @summary; | 
|  | 1261 |  |  |  |  | 1709 |  | 
| 245 |  |  |  |  |  |  |  | 
| 246 | 25 |  |  |  |  | 41 | my $tot_length = 0; | 
| 247 | 25 |  |  |  |  | 39 | my @truncated = (); | 
| 248 |  |  |  |  |  |  |  | 
| 249 | 25 |  |  |  |  | 62 | for ( @summary ) | 
| 250 |  |  |  |  |  |  | { | 
| 251 | 135 |  |  |  |  | 158 | push( @truncated, $_ ); | 
| 252 | 135 | 100 |  |  |  | 367 | last if ( $tot_length += $_->{ 'length' } ) > $self->{ LENGTH }; | 
| 253 |  |  |  |  |  |  | } | 
| 254 | 25 |  |  |  |  | 54 | @truncated = sort { $a->{ order } <=> $b->{ order } } @truncated; | 
|  | 236 |  |  |  |  | 323 |  | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | # these whitespaces will push the length over LENGTH, but jtruncate | 
| 257 |  |  |  |  |  |  | # should take care of this | 
| 258 |  |  |  |  |  |  |  | 
| 259 | 25 |  |  |  |  | 45 | $summary = join( ' ', map { $_->{ text } } @truncated ); | 
|  | 135 |  |  |  |  | 285 |  | 
| 260 | 25 |  |  |  |  | 87 | $self->_verbose( "truncate the summary to ", $self->{ LENGTH } ); | 
| 261 | 25 |  |  |  |  | 116 | $summary = jtruncate( $summary, $self->{ LENGTH } ); | 
| 262 | 25 |  |  |  |  | 334 | return $summary; | 
| 263 |  |  |  |  |  |  | } | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 266 |  |  |  |  |  |  | # | 
| 267 |  |  |  |  |  |  | # meta_used - tells whether the description from the META tag was used; returns | 
| 268 |  |  |  |  |  |  | # 1 if it was, 0 if the summary was generated automatically | 
| 269 |  |  |  |  |  |  | # | 
| 270 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | sub meta_used | 
| 273 |  |  |  |  |  |  | { | 
| 274 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 275 |  |  |  |  |  |  |  | 
| 276 | 0 |  |  |  |  | 0 | return $self->{ META_USED }; | 
| 277 |  |  |  |  |  |  | } | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 280 |  |  |  |  |  |  | # | 
| 281 |  |  |  |  |  |  | # option - get / set configuration option | 
| 282 |  |  |  |  |  |  | # | 
| 283 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | sub option | 
| 286 |  |  |  |  |  |  | { | 
| 287 | 0 |  |  | 0 | 1 | 0 | my $self    = shift; | 
| 288 | 0 |  |  |  |  | 0 | my $option  = shift; | 
| 289 | 0 |  |  |  |  | 0 | my $val     = shift; | 
| 290 |  |  |  |  |  |  |  | 
| 291 | 0 | 0 |  |  |  | 0 | die "No HTML::Summary option name given" unless defined $option; | 
| 292 |  |  |  |  |  |  | die "$option is not an HTML::Summary option" unless | 
| 293 | 0 | 0 |  |  |  | 0 | grep { $_ eq $option } keys %DEFAULTS | 
|  | 0 |  |  |  |  | 0 |  | 
| 294 |  |  |  |  |  |  | ; | 
| 295 |  |  |  |  |  |  |  | 
| 296 | 0 | 0 |  |  |  | 0 | if ( defined $val ) | 
| 297 |  |  |  |  |  |  | { | 
| 298 | 0 |  |  |  |  | 0 | $self->{ $option } = $val; | 
| 299 |  |  |  |  |  |  | } | 
| 300 |  |  |  |  |  |  |  | 
| 301 | 0 |  |  |  |  | 0 | return $self->{ $option } = $val; | 
| 302 |  |  |  |  |  |  | } | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | #============================================================================== | 
| 305 |  |  |  |  |  |  | # | 
| 306 |  |  |  |  |  |  | # Private methods | 
| 307 |  |  |  |  |  |  | # | 
| 308 |  |  |  |  |  |  | #============================================================================== | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 311 |  |  |  |  |  |  | # | 
| 312 |  |  |  |  |  |  | # _initialize - supports sub-classing | 
| 313 |  |  |  |  |  |  | # | 
| 314 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | sub _initialize | 
| 317 |  |  |  |  |  |  | { | 
| 318 | 25 |  |  | 25 |  | 38 | my $self = shift; | 
| 319 |  |  |  |  |  |  |  | 
| 320 | 25 | 50 |  |  |  | 107 | return undef unless @_ % 2 == 0;    # check that config hash has even no. | 
| 321 |  |  |  |  |  |  | # of elements | 
| 322 |  |  |  |  |  |  |  | 
| 323 | 25 |  |  |  |  | 90 | %{ $self } = ( %DEFAULTS, @_ );     # set options from defaults / config. | 
|  | 25 |  |  |  |  | 147 |  | 
| 324 |  |  |  |  |  |  | # hash passed as arguments | 
| 325 |  |  |  |  |  |  |  | 
| 326 | 25 |  |  |  |  | 122 | return $self; | 
| 327 |  |  |  |  |  |  | } | 
| 328 |  |  |  |  |  |  |  | 
| 329 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 330 |  |  |  |  |  |  | # | 
| 331 |  |  |  |  |  |  | # _verbose - generate verbose error messages, if the VERBOSE option has been | 
| 332 |  |  |  |  |  |  | # selected | 
| 333 |  |  |  |  |  |  | # | 
| 334 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 335 |  |  |  |  |  |  |  | 
| 336 |  |  |  |  |  |  | sub _verbose | 
| 337 |  |  |  |  |  |  | { | 
| 338 | 867 |  |  | 867 |  | 1171 | my $self = shift; | 
| 339 |  |  |  |  |  |  |  | 
| 340 | 867 | 50 |  |  |  | 2261 | return unless $self->{ VERBOSE }; | 
| 341 | 0 |  |  |  |  | 0 | print STDERR @_, "\n"; | 
| 342 |  |  |  |  |  |  | } | 
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 345 |  |  |  |  |  |  | # | 
| 346 |  |  |  |  |  |  | # _get_summary - get sentences from an element to generate the summary from. | 
| 347 |  |  |  |  |  |  | # Uses lexically scoped array @sentences to build up result from the traversal | 
| 348 |  |  |  |  |  |  | # callback | 
| 349 |  |  |  |  |  |  | # | 
| 350 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  | sub _get_summary | 
| 353 |  |  |  |  |  |  | { | 
| 354 | 25 |  |  | 25 |  | 37 | my $self = shift; | 
| 355 | 25 |  |  |  |  | 39 | my $tree = shift; | 
| 356 |  |  |  |  |  |  |  | 
| 357 | 25 |  |  |  |  | 46 | my @summary = (); | 
| 358 |  |  |  |  |  |  | my $add_sentence = sub { | 
| 359 | 470 |  |  | 470 |  | 636 | my $text        = shift; | 
| 360 | 470 |  |  |  |  | 617 | my $tag         = shift; | 
| 361 | 470 |  | 66 |  |  | 1179 | my $score       = shift || $DEFAULT_SCORE; | 
| 362 |  |  |  |  |  |  |  | 
| 363 | 470 | 100 |  |  |  | 1300 | return unless $text =~ /\w/; | 
| 364 |  |  |  |  |  |  |  | 
| 365 | 440 |  |  |  |  | 1202 | $text =~ s!^\s*!!; # remove leading ... | 
| 366 | 440 |  |  |  |  | 5100 | $text =~ s!\s*$!!; # ... and trailing whitespace | 
| 367 |  |  |  |  |  |  |  | 
| 368 | 440 |  |  |  |  | 2070 | $summary[ scalar( @summary ) ] = { | 
| 369 |  |  |  |  |  |  | 'text'          => $text, | 
| 370 |  |  |  |  |  |  | 'length'        => length( $text ), | 
| 371 |  |  |  |  |  |  | 'tag'           => $tag, | 
| 372 |  |  |  |  |  |  | 'score'         => $score, | 
| 373 |  |  |  |  |  |  | 'order'         => scalar( @summary ), | 
| 374 |  |  |  |  |  |  | }; | 
| 375 | 25 |  |  |  |  | 134 | }; | 
| 376 |  |  |  |  |  |  | $tree->traverse( | 
| 377 |  |  |  |  |  |  | sub { | 
| 378 | 1339 |  |  | 1339 |  | 19851 | my $node = shift; | 
| 379 | 1339 |  |  |  |  | 1619 | my $flag = shift; | 
| 380 |  |  |  |  |  |  |  | 
| 381 | 1339 | 100 |  |  |  | 2558 | if ( $flag ) # entering node ... | 
| 382 |  |  |  |  |  |  | { | 
| 383 | 1001 |  |  |  |  | 2403 | my $tag = $node->tag; | 
| 384 | 1001 | 100 |  |  |  | 6158 | return 0 if $tag eq 'head'; | 
| 385 |  |  |  |  |  |  |  | 
| 386 |  |  |  |  |  |  | # add sentences which either are scoring, or span no other | 
| 387 |  |  |  |  |  |  | # scoring sentences (and have a score of 0).  In this way, all | 
| 388 |  |  |  |  |  |  | # text is captured, even if it scores 0; the only exception is | 
| 389 |  |  |  |  |  |  | # something like some text foobar, where | 
| 390 |  |  |  |  |  |  | # everything but "foobar" will be lost. However, if you have | 
| 391 |  |  |  |  |  |  | # | some text | foobar
 you should get 
| 392 |  |  |  |  |  |  | # all the text. | 
| 393 |  |  |  |  |  |  |  | 
| 394 | 976 | 100 | 100 |  |  | 2958 | if ( | 
| 395 |  |  |  |  |  |  | $ELEMENT_SCORES{ $tag } || | 
| 396 |  |  |  |  |  |  | ! _has_scoring_element( $node ) | 
| 397 |  |  |  |  |  |  | ) | 
| 398 |  |  |  |  |  |  | { | 
| 399 | 638 |  |  |  |  | 1172 | my $text = _get_text( $node ); | 
| 400 | 638 |  |  |  |  | 1322 | foreach ( $text ) # alias $_ to $text | 
| 401 |  |  |  |  |  |  | { | 
| 402 |  |  |  |  |  |  | # get rid of whitespace (including  ) from start / | 
| 403 |  |  |  |  |  |  | # end of $text | 
| 404 | 638 |  |  |  |  | 1958 | s/^[\s\160]*//; | 
| 405 | 638 |  |  |  |  | 5827 | s/[\s\160]*$//; | 
| 406 |  |  |  |  |  |  | # get rid of any spurious tags that have slipped | 
| 407 |  |  |  |  |  |  | # through the HTML::TreeBuilder | 
| 408 | 638 |  |  |  |  | 1369 | s!<[^>]+>!!g; | 
| 409 |  |  |  |  |  |  | } | 
| 410 |  |  |  |  |  |  |  | 
| 411 | 638 | 100 |  |  |  | 1600 | if ( $text =~ /\S/ ) | 
| 412 |  |  |  |  |  |  | { | 
| 413 | 344 |  | 66 |  |  | 963 | my $score = $ELEMENT_SCORES{ $tag } || $DEFAULT_SCORE; | 
| 414 |  |  |  |  |  |  |  | 
| 415 |  |  |  |  |  |  | # add all the sentences in the text. Only the first | 
| 416 |  |  |  |  |  |  | # sentence gets the element score - the rest get the | 
| 417 |  |  |  |  |  |  | # default score | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 344 |  |  |  |  | 1059 | $self->_verbose( "TEXT: $text" ); | 
| 420 | 344 |  |  |  |  | 1402 | for my $sentence ( | 
| 421 |  |  |  |  |  |  | split_sentences( $text, $self->{ 'LOCALE' } ) | 
| 422 |  |  |  |  |  |  | ) | 
| 423 |  |  |  |  |  |  | { | 
| 424 | 470 |  |  |  |  | 1454 | $self->_verbose( "SENTENCE: $text" ); | 
| 425 | 470 |  |  |  |  | 951 | $add_sentence->( $sentence, $tag, $score ); | 
| 426 | 470 |  |  |  |  | 955 | $score = $DEFAULT_SCORE; | 
| 427 |  |  |  |  |  |  | } | 
| 428 |  |  |  |  |  |  | } | 
| 429 |  |  |  |  |  |  |  | 
| 430 |  |  |  |  |  |  | # return 0 to avoid getting the same sentence in a scoring | 
| 431 |  |  |  |  |  |  | # "daughter" element | 
| 432 |  |  |  |  |  |  |  | 
| 433 | 638 |  |  |  |  | 1946 | return 0; | 
| 434 |  |  |  |  |  |  | } | 
| 435 |  |  |  |  |  |  | } | 
| 436 |  |  |  |  |  |  |  | 
| 437 |  |  |  |  |  |  | # continue traversal ... | 
| 438 |  |  |  |  |  |  |  | 
| 439 | 676 |  |  |  |  | 1410 | return 1; | 
| 440 |  |  |  |  |  |  | }, | 
| 441 | 25 |  |  |  |  | 186 | IGNORE_TEXT | 
| 442 |  |  |  |  |  |  | ); | 
| 443 | 25 |  |  |  |  | 628 | return @summary; | 
| 444 |  |  |  |  |  |  | } | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 447 |  |  |  |  |  |  | # | 
| 448 |  |  |  |  |  |  | # _get_summary_from_meta - check to see if there is already a summary | 
| 449 |  |  |  |  |  |  | # defined in the META tag in the HEAD | 
| 450 |  |  |  |  |  |  | # | 
| 451 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 452 |  |  |  |  |  |  |  | 
| 453 |  |  |  |  |  |  | sub _get_summary_from_meta | 
| 454 |  |  |  |  |  |  | { | 
| 455 | 2 |  |  | 2 |  | 3 | my $self = shift; | 
| 456 | 2 |  |  |  |  | 3 | my $tree = shift; | 
| 457 |  |  |  |  |  |  |  | 
| 458 | 2 |  |  |  |  | 2 | my $summary; | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | $tree->traverse( | 
| 461 |  |  |  |  |  |  | sub { | 
| 462 | 23 |  |  | 23 |  | 443 | my $node = shift; | 
| 463 | 23 |  |  |  |  | 29 | my $flag = shift; | 
| 464 |  |  |  |  |  |  |  | 
| 465 | 23 | 50 | 100 |  |  | 62 | if ($node->tag eq 'meta' | 
|  |  |  | 66 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
| 466 |  |  |  |  |  |  | && defined($node->attr('name')) | 
| 467 |  |  |  |  |  |  | && lc( $node->attr('name') ) eq 'description' | 
| 468 |  |  |  |  |  |  | && defined($node->attr('content'))) | 
| 469 |  |  |  |  |  |  | { | 
| 470 | 1 |  |  |  |  | 49 | $summary = $node->attr( 'content' ); | 
| 471 | 1 | 50 |  |  |  | 13 | $summary = undef if $summary eq 'content'; | 
| 472 | 1 |  |  |  |  | 3 | return 0; | 
| 473 |  |  |  |  |  |  | } | 
| 474 | 22 |  |  |  |  | 193 | return 1; | 
| 475 |  |  |  |  |  |  | }, | 
| 476 | 2 |  |  |  |  | 18 | IGNORE_TEXT | 
| 477 |  |  |  |  |  |  | ); | 
| 478 |  |  |  |  |  |  |  | 
| 479 | 2 | 100 |  |  |  | 38 | $self->{ META_USED } = defined( $summary ) ? 1 : 0; | 
| 480 | 2 |  |  |  |  | 10 | return $summary; | 
| 481 |  |  |  |  |  |  | } | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | #============================================================================== | 
| 484 |  |  |  |  |  |  | # | 
| 485 |  |  |  |  |  |  | # Private functions | 
| 486 |  |  |  |  |  |  | # | 
| 487 |  |  |  |  |  |  | #============================================================================== | 
| 488 |  |  |  |  |  |  |  | 
| 489 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 490 |  |  |  |  |  |  | # | 
| 491 |  |  |  |  |  |  | # _get_text - get all the text spanned by an element. Uses lexically scoped | 
| 492 |  |  |  |  |  |  | # variable $html to build up result from the traversal callback | 
| 493 |  |  |  |  |  |  | # | 
| 494 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 495 |  |  |  |  |  |  |  | 
| 496 |  |  |  |  |  |  | sub _get_text | 
| 497 |  |  |  |  |  |  | { | 
| 498 | 638 |  |  | 638 |  | 815 | my $node = shift; | 
| 499 |  |  |  |  |  |  |  | 
| 500 | 638 |  |  |  |  | 858 | my $html = ''; | 
| 501 |  |  |  |  |  |  | $node->traverse( | 
| 502 |  |  |  |  |  |  | sub { | 
| 503 | 7356 |  |  | 7356 |  | 111772 | my $node = shift; | 
| 504 | 7356 | 100 |  |  |  | 14374 | $html .= $node unless ref( $node ); | 
| 505 | 7356 |  |  |  |  | 13448 | return 1; | 
| 506 |  |  |  |  |  |  | } | 
| 507 | 638 |  |  |  |  | 2756 | ); | 
| 508 | 638 |  |  |  |  | 10311 | return $html; | 
| 509 |  |  |  |  |  |  | } | 
| 510 |  |  |  |  |  |  |  | 
| 511 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 512 |  |  |  |  |  |  | # | 
| 513 |  |  |  |  |  |  | # _has_scoring_element - check to see if this element spans any scoring | 
| 514 |  |  |  |  |  |  | # element.  Uses lexically scoped variable $has_scoring_element to build up | 
| 515 |  |  |  |  |  |  | # result from the traversal callback. | 
| 516 |  |  |  |  |  |  | # | 
| 517 |  |  |  |  |  |  | #------------------------------------------------------------------------------ | 
| 518 |  |  |  |  |  |  |  | 
| 519 |  |  |  |  |  |  | sub _has_scoring_element | 
| 520 |  |  |  |  |  |  | { | 
| 521 | 518 |  |  | 518 |  | 591 | my $node = shift; | 
| 522 |  |  |  |  |  |  |  | 
| 523 | 518 |  |  |  |  | 593 | my $has_scoring_element = 0; | 
| 524 |  |  |  |  |  |  | $node->traverse( | 
| 525 |  |  |  |  |  |  | sub { | 
| 526 | 34325 |  |  | 34325 |  | 592596 | my $node = shift; | 
| 527 | 34325 |  |  |  |  | 75054 | my $tag = $node->tag; | 
| 528 | 34325 |  | 100 |  |  | 193711 | $has_scoring_element ||= $ELEMENT_SCORES{ $tag }; | 
| 529 | 34325 |  |  |  |  | 64908 | return 1; | 
| 530 |  |  |  |  |  |  | }, | 
| 531 | 518 |  |  |  |  | 2461 | IGNORE_TEXT | 
| 532 |  |  |  |  |  |  | ); | 
| 533 | 518 |  |  |  |  | 9725 | return $has_scoring_element; | 
| 534 |  |  |  |  |  |  | } | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  | #============================================================================== | 
| 537 |  |  |  |  |  |  | # | 
| 538 |  |  |  |  |  |  | # Return TRUE | 
| 539 |  |  |  |  |  |  | # | 
| 540 |  |  |  |  |  |  | #============================================================================== | 
| 541 |  |  |  |  |  |  |  | 
| 542 |  |  |  |  |  |  | 1; |