| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Search::Tools::HeatMap; | 
| 2 | 17 |  |  | 17 |  | 61 | use Moo; | 
|  | 17 |  |  |  |  | 57 |  | 
|  | 17 |  |  |  |  | 76 |  | 
| 3 | 17 |  |  | 17 |  | 3115 | use Carp; | 
|  | 17 |  |  |  |  | 20 |  | 
|  | 17 |  |  |  |  | 790 |  | 
| 4 | 17 |  |  | 17 |  | 62 | use Data::Dump qw( dump ); | 
|  | 17 |  |  |  |  | 17 |  | 
|  | 17 |  |  |  |  | 657 |  | 
| 5 |  |  |  |  |  |  | extends 'Search::Tools::Object'; | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 17 |  |  | 17 |  | 58 | use namespace::autoclean; | 
|  | 17 |  |  |  |  | 19 |  | 
|  | 17 |  |  |  |  | 93 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | our $VERSION = '1.004'; | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | # debugging only | 
| 12 |  |  |  |  |  |  | my $OPEN  = '['; | 
| 13 |  |  |  |  |  |  | my $CLOSE = ']'; | 
| 14 |  |  |  |  |  |  | eval { require Term::ANSIColor; }; | 
| 15 |  |  |  |  |  |  | if ( !$@ ) { | 
| 16 |  |  |  |  |  |  | $OPEN .= Term::ANSIColor::color('bold red'); | 
| 17 |  |  |  |  |  |  | $CLOSE = Term::ANSIColor::color('reset') . $CLOSE; | 
| 18 |  |  |  |  |  |  | } | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | my @attrs = qw( window_size | 
| 21 |  |  |  |  |  |  | tokens | 
| 22 |  |  |  |  |  |  | spans | 
| 23 |  |  |  |  |  |  | as_sentences | 
| 24 |  |  |  |  |  |  | _treat_phrases_as_singles | 
| 25 |  |  |  |  |  |  | _qre | 
| 26 |  |  |  |  |  |  | _query | 
| 27 |  |  |  |  |  |  | _stemmer | 
| 28 |  |  |  |  |  |  | ); | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | for my $attr (@attrs) { | 
| 31 |  |  |  |  |  |  | has $attr => ( is => 'rw' ); | 
| 32 |  |  |  |  |  |  | } | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | =head1 NAME | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | Search::Tools::HeatMap - locate the best matches in a snippet extract | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | use Search::Tools::Tokenizer; | 
| 41 |  |  |  |  |  |  | use Search::Tools::HeatMap; | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ ); | 
| 44 |  |  |  |  |  |  | my $heatmap = Search::Tools::HeatMap->new( | 
| 45 |  |  |  |  |  |  | tokens         => $tokens, | 
| 46 |  |  |  |  |  |  | window_size    => 20,  # default | 
| 47 |  |  |  |  |  |  | as_sentences   => 0,   # default | 
| 48 |  |  |  |  |  |  | ); | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | if ( $heatmap->has_spans ) { | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | my $tokens_arr = $tokens->as_array; | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | # stringify positions | 
| 55 |  |  |  |  |  |  | my @snips; | 
| 56 |  |  |  |  |  |  | for my $span ( @{ $heatmap->spans } ) { | 
| 57 |  |  |  |  |  |  | push( @snips, $span->{str} ); | 
| 58 |  |  |  |  |  |  | } | 
| 59 |  |  |  |  |  |  | my $occur_index = $self->occur - 1; | 
| 60 |  |  |  |  |  |  | if ( $#snips > $occur_index ) { | 
| 61 |  |  |  |  |  |  | @snips = @snips[ 0 .. $occur_index ]; | 
| 62 |  |  |  |  |  |  | } | 
| 63 |  |  |  |  |  |  | printf("%s\n", join( ' ... ', @snips )); | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | } | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | Search::Tools::HeatMap implements a simple algorithm for locating | 
| 70 |  |  |  |  |  |  | the densest clusters of unique, hot terms in a TokenList. | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | HeatMap is used internally by Snipper but documented here in case | 
| 73 |  |  |  |  |  |  | someone wants to abuse and/or improve it. | 
| 74 |  |  |  |  |  |  |  | 
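|  |  |  |  |  |  |  | Spans are ranked by the number of unique hot terms they contain, then by how proximate those hot terms are, then by total heat, and finally by position. A minimal sketch of that ordering, mirroring the private _sort_spans() comparator below (the span keys are documented under spans()): | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  |     my @ranked = sort { | 
|  |  |  |  |  |  |  |         $b->{unique}       <=> $a->{unique} | 
|  |  |  |  |  |  |  |         || $b->{proximate} <=> $a->{proximate} | 
|  |  |  |  |  |  |  |         || $b->{heat}      <=> $a->{heat} | 
|  |  |  |  |  |  |  |         || $a->{pos}->[0]  <=> $b->{pos}->[0] | 
|  |  |  |  |  |  |  |     } @spans; | 
|  |  |  |  |  |  |  |  | 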
| 75 |  |  |  |  |  |  | =head1 METHODS | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | =head2 new( tokens => I<TokenList> ) | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | Create a new HeatMap. The I<TokenList> object may be either a | 
| 80 |  |  |  |  |  |  | Search::Tools::TokenList or Search::Tools::TokenListPP object. | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | =head2 BUILD | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | Builds the HeatMap object. Called internally by new(). | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | =cut | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | sub BUILD { | 
| 89 | 30 |  |  | 30 | 1 | 485 | my $self = shift; | 
| 90 | 30 |  |  |  |  | 96 | $self->_build; | 
| 91 | 30 |  |  |  |  | 741 | return $self; | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | =head2 window_size | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | The max width of a span. Defaults to 20 tokens, including the | 
| 97 |  |  |  |  |  |  | matches. | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | Set this in new(). Access it later if you need to, but the spans | 
| 100 |  |  |  |  |  |  | will have already been created by new(). | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | =head2 as_sentences | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | Try to match clusters at sentence boundaries. Default is false. | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | Set this in new(). | 
| 107 |  |  |  |  |  |  |  | 
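|  |  |  |  |  |  |  | A hypothetical construction using both options (mirroring the SYNOPSIS; in practice Snipper also supplies the private query-related attributes): | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  |     my $heatmap = Search::Tools::HeatMap->new( | 
|  |  |  |  |  |  |  |         tokens       => $tokens, | 
|  |  |  |  |  |  |  |         window_size  => 30,   # wider than the default 20 tokens | 
|  |  |  |  |  |  |  |         as_sentences => 1,    # prefer sentence boundaries | 
|  |  |  |  |  |  |  |     ); | 
|  |  |  |  |  |  |  |  | 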
| 108 |  |  |  |  |  |  | =head2 spans | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | Returns an array ref of matching clusters. Each span in the array | 
| 111 |  |  |  |  |  |  | is a hash ref with the following keys: | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | =over | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | =item cluster | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | =item pos | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | =item heat | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | =item str | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | =item str_w_pos | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | This item is available only if debug() is true. | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | =item unique | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | =back | 
| 130 |  |  |  |  |  |  |  | 
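|  |  |  |  |  |  |  | For example, inspecting the top-ranked span might look like this (a hypothetical sketch; actual positions, heat and strings depend on your tokens and query): | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  |     my ($best) = @{ $heatmap->spans }; | 
|  |  |  |  |  |  |  |     printf( "heat=%d unique=%d first_pos=%d\n", | 
|  |  |  |  |  |  |  |         $best->{heat}, $best->{unique}, $best->{pos}->[0] ); | 
|  |  |  |  |  |  |  |     print $best->{str}, "\n"; | 
|  |  |  |  |  |  |  |  | 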
| 131 |  |  |  |  |  |  | =cut | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | # TODO this is mostly integer math and might be much | 
| 134 |  |  |  |  |  |  | # faster if rewritten in XS once the algorithm is "final". | 
| 135 |  |  |  |  |  |  | sub _build { | 
| 136 | 30 |  |  | 30 |  | 39 | my $self         = shift; | 
| 137 | 30 | 50 |  |  |  | 544 | my $tokens       = $self->tokens or croak "tokens required"; | 
| 138 | 30 |  | 50 |  |  | 126 | my $window       = $self->window_size || 20; | 
| 139 | 30 |  | 100 |  |  | 152 | my $as_sentences = $self->as_sentences || 0; | 
| 140 | 30 | 100 |  |  |  | 131 | return $as_sentences | 
| 141 |  |  |  |  |  |  | ? $self->_as_sentences( $tokens, $window ) | 
| 142 |  |  |  |  |  |  | : $self->_no_sentences( $tokens, $window ); | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  | # currently _as_sentences() is mostly identical to _no_sentences() | 
| 146 |  |  |  |  |  |  | # with slightly fewer gymnastics. | 
| 147 |  |  |  |  |  |  | # Since we already know via sentence_starts where our boundaries are, | 
| 148 |  |  |  |  |  |  | # we do not have to call $tokens->get_window(). | 
| 149 |  |  |  |  |  |  | # Who knows how we might improve the sentence algorithm in future, | 
| 150 |  |  |  |  |  |  | # so already having it in its own method seems like a win. | 
| 151 |  |  |  |  |  |  | sub _as_sentences { | 
| 152 | 13 |  |  | 13 |  | 24 | my ( $self, $tokens, $window ) = @_; | 
| 153 | 13 |  | 50 |  |  | 265 | my $debug = $self->debug || 0; | 
| 154 | 13 |  |  |  |  | 121 | my $sentence_length = $window * 2; | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | # build heatmap with sentence starts | 
| 157 | 13 |  |  |  |  | 51 | my $num_tokens           = $tokens->len; | 
| 158 | 13 |  |  |  |  | 41 | my $tokens_arr           = $tokens->as_array; | 
| 159 | 13 |  |  |  |  | 26 | my %heatmap              = (); | 
| 160 | 13 |  |  |  |  | 44 | my $token_list_heat      = $tokens->get_heat; | 
| 161 | 13 |  |  |  |  | 43 | my $heat_sentence_starts = $tokens->get_sentence_starts; | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | # this regex is a sanity check for phrases. we replace the escaped space ('\ ') with a | 
| 164 |  |  |  |  |  |  | # more promiscuous pattern because a single literal space is too naive | 
| 165 |  |  |  |  |  |  | # for real text (e.g. st. john's) | 
| 166 | 13 |  |  |  |  | 24 | my $qre              = $self->{_qre}; | 
| 167 | 13 |  |  |  |  | 20 | my @phrases          = @{ $self->{_query}->phrases }; | 
|  | 13 |  |  |  |  | 62 |  | 
| 168 | 13 |  |  |  |  | 53 | my $n_terms          = $self->{_query}->num_terms; | 
| 169 | 13 |  |  |  |  | 80 | my $query_has_phrase = $qre =~ s/(\\ )+/.+/g; | 
| 170 |  |  |  |  |  |  |  | 
| 171 | 13 | 50 |  |  |  | 41 | if ($debug) { | 
| 172 | 0 |  |  |  |  | 0 | warn "heat_sentence_starts: " . dump($heat_sentence_starts); | 
| 173 | 0 |  |  |  |  | 0 | warn "token_list_heat: " . dump($token_list_heat); | 
| 174 | 0 |  |  |  |  | 0 | warn "n_terms: $n_terms"; | 
| 175 | 0 |  |  |  |  | 0 | warn "phrases: " . dump( \@phrases ); | 
| 176 | 0 |  |  |  |  | 0 | warn "query_has_phrase: $query_has_phrase"; | 
| 177 |  |  |  |  |  |  | } | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | # find the "sentence" that each hot token appears in. | 
| 180 | 13 |  |  |  |  | 14 | my @starts_ends; | 
| 181 | 13 |  |  |  |  | 15 | my $i                  = 0; | 
| 182 | 13 |  |  |  |  | 23 | my %heat_sentence_ends = ();    # cache | 
| 183 | 13 |  |  |  |  | 29 | for (@$token_list_heat) { | 
| 184 | 42 |  |  |  |  | 111 | my $token     = $tokens->get_token($_); | 
| 185 | 42 |  |  |  |  | 100 | my $token_pos = $token->pos; | 
| 186 | 42 |  |  |  |  | 54 | my $start     = $heat_sentence_starts->[ $i++ ]; | 
| 187 | 42 |  |  |  |  | 138 | $heatmap{$token_pos} = $token->is_hot; | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | # a little optimization for when we've got | 
| 190 |  |  |  |  |  |  | # multiple hot tokens in the same sentence | 
| 191 | 42 | 100 |  |  |  | 84 | if ( exists $heat_sentence_ends{$start} ) { | 
| 192 | 22 | 50 |  |  |  | 40 | $debug | 
| 193 |  |  |  |  |  |  | and warn "found cached end $heat_sentence_ends{$start} " | 
| 194 |  |  |  |  |  |  | . "for start $start token $token_pos\n"; | 
| 195 |  |  |  |  |  |  |  | 
| 196 |  |  |  |  |  |  | push( @starts_ends, | 
| 197 | 22 |  |  |  |  | 47 | [ $start, $token_pos, $heat_sentence_ends{$start} ] ); | 
| 198 | 22 |  |  |  |  | 35 | next; | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | # find the outermost limit of where this sentence might end | 
| 202 | 20 |  |  |  |  | 19 | my $max_end; | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | # is there a "next" start? | 
| 205 | 20 | 100 | 100 |  |  | 98 | if ( defined $heat_sentence_starts->[$i] | 
| 206 |  |  |  |  |  |  | and $heat_sentence_starts->[$i] != $start ) | 
| 207 |  |  |  |  |  |  | { | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | # this token is unique in this non-final sentence | 
| 210 | 3 |  |  |  |  | 8 | $max_end = $heat_sentence_starts->[$i] - 1; | 
| 211 |  |  |  |  |  |  | } | 
| 212 |  |  |  |  |  |  | else { | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | # this is the final sentence | 
| 215 | 17 |  |  |  |  | 30 | $max_end = $num_tokens - 1; | 
| 216 |  |  |  |  |  |  | } | 
| 217 | 20 |  |  |  |  | 29 | my $end = $start; | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | # find the nearest sentence end to the start | 
| 220 | 20 |  |  |  |  | 47 | while ( $end < $max_end ) { | 
| 221 | 1990 |  |  |  |  | 2299 | my $tok = $tokens->get_token( $end++ ); | 
| 222 | 1990 | 50 |  |  |  | 2868 | if ( !$tok ) { | 
| 223 | 0 | 0 |  |  |  | 0 | $debug and warn "No token at end=$end"; | 
| 224 | 0 |  |  |  |  | 0 | last; | 
| 225 |  |  |  |  |  |  | } | 
| 226 | 1990 | 100 |  |  |  | 4422 | if ( $tok->is_sentence_end ) { | 
| 227 | 10 |  |  |  |  | 17 | $end--;    # move back one position | 
| 228 | 10 | 50 |  |  |  | 25 | if ($debug) { | 
| 229 | 0 |  |  |  |  | 0 | warn "tok $_ is_sentence_end end=$end"; | 
| 230 | 0 |  |  |  |  | 0 | $tok->dump; | 
| 231 |  |  |  |  |  |  | } | 
| 232 | 10 |  |  |  |  | 23 | last; | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  | } | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | # back up if we've exceeded the 0-based tokens array. | 
| 237 | 20 | 50 |  |  |  | 50 | $end = $num_tokens if $end > $num_tokens; | 
| 238 |  |  |  |  |  |  |  | 
| 239 | 20 | 50 |  |  |  | 45 | $debug | 
| 240 |  |  |  |  |  |  | and warn "start=$start max_end=$max_end " | 
| 241 |  |  |  |  |  |  | . "sentence_length=$sentence_length end=$end " | 
| 242 |  |  |  |  |  |  | . "token_pos=$token_pos\n"; | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | # if we didn't yet set the actual hot token, | 
| 245 |  |  |  |  |  |  | # include everything up to it. | 
| 246 | 20 | 50 |  |  |  | 45 | if ( $end < $token_pos ) { | 
| 247 | 0 | 0 |  |  |  | 0 | $debug | 
| 248 |  |  |  |  |  |  | and warn "resetting end=$token_pos\n"; | 
| 249 |  |  |  |  |  |  |  | 
| 250 | 0 |  |  |  |  | 0 | $end = $token_pos; | 
| 251 |  |  |  |  |  |  | } | 
| 252 | 20 |  |  |  |  | 57 | push( @starts_ends, [ $start, $token_pos, $end ] ); | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | # cache | 
| 255 | 20 |  |  |  |  | 61 | $heat_sentence_ends{$start} = $end; | 
| 256 |  |  |  |  |  |  | } | 
| 257 |  |  |  |  |  |  |  | 
| 258 | 13 | 50 |  |  |  | 36 | $debug and warn "starts_ends: " . dump( \@starts_ends ); | 
| 259 |  |  |  |  |  |  |  | 
| 260 | 13 |  |  |  |  | 19 | my @spans; | 
| 261 |  |  |  |  |  |  | my %seen_pos; | 
| 262 |  |  |  |  |  |  | START_END: | 
| 263 | 13 |  |  |  |  | 44 | for my $start_end (@starts_ends) { | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | # get full window, ignoring positions we've already seen. | 
| 266 | 42 |  |  |  |  | 42 | my $heat = 0; | 
| 267 | 42 |  |  |  |  | 37 | my %span; | 
| 268 |  |  |  |  |  |  | my @cluster_tokens; | 
| 269 |  |  |  |  |  |  |  | 
| 270 | 42 |  |  |  |  | 85 | my ( $start, $hot_pos, $end ) = @$start_end; | 
| 271 | 42 |  |  |  |  | 89 | POS: for my $pos ( $start .. $end ) { | 
| 272 | 5075 | 100 |  |  |  | 9889 | next POS if $seen_pos{$pos}++; | 
| 273 | 2000 | 100 |  |  |  | 2064 | $heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 ); | 
| 274 | 2000 |  |  |  |  | 2426 | push( @cluster_tokens, $tokens->get_token($pos) ); | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | # if we had already seen_pos all positions. | 
| 278 | 42 | 100 |  |  |  | 109 | next START_END unless @cluster_tokens; | 
| 279 |  |  |  |  |  |  |  | 
| 280 |  |  |  |  |  |  | # sanity: make sure we still have something hot | 
| 281 | 20 |  |  |  |  | 29 | my $has_hot = 0; | 
| 282 | 20 |  |  |  |  | 24 | my @cluster_pos; | 
| 283 |  |  |  |  |  |  | my @strings; | 
| 284 | 20 |  |  |  |  | 34 | TOK: for (@cluster_tokens) { | 
| 285 | 2000 |  |  |  |  | 1855 | my $pos = $_->pos; | 
| 286 | 2000 | 100 |  |  |  | 2246 | $has_hot++ if exists $heatmap{$pos}; | 
| 287 | 2000 |  |  |  |  | 2235 | push @strings,     $_->str; | 
| 288 | 2000 |  |  |  |  | 1799 | push @cluster_pos, $pos; | 
| 289 |  |  |  |  |  |  | } | 
| 290 | 20 | 50 |  |  |  | 43 | next START_END unless $has_hot; | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | # the final string is a sentence end, | 
| 293 |  |  |  |  |  |  | # but we only want the first char in it, | 
| 294 |  |  |  |  |  |  | # and not any whitespace, stray punctuation or other | 
| 295 |  |  |  |  |  |  | # non-word noise. | 
| 296 | 20 |  |  |  |  | 122 | $strings[$#strings] =~ s/^([\.\?\!]).*/$1/; | 
| 297 |  |  |  |  |  |  |  | 
| 298 | 20 |  |  |  |  | 48 | $span{start_end} = $start_end; | 
| 299 | 20 |  |  |  |  | 32 | $span{heat}      = $heat; | 
| 300 | 20 |  |  |  |  | 33 | $span{pos}       = \@cluster_pos; | 
| 301 | 20 |  |  |  |  | 30 | $span{tokens}    = \@cluster_tokens; | 
| 302 | 20 |  |  |  |  | 130 | $span{str}       = join( '', @strings ); | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | # spans with more *unique* hot tokens in a single span rank higher | 
| 305 |  |  |  |  |  |  | # spans with more *proximate* hot tokens in a single span rank higher | 
| 306 | 20 |  |  |  |  | 40 | my %uniq          = (); | 
| 307 | 20 |  |  |  |  | 19 | my $i             = 0; | 
| 308 | 20 |  |  |  |  | 23 | my $num_proximate = 1;    # one for the single hot token | 
| 309 | 20 |  |  |  |  | 36 | for (@cluster_pos) { | 
| 310 | 2000 | 100 |  |  |  | 2124 | if ( exists $heatmap{$_} ) { | 
| 311 | 42 |  |  |  |  | 107 | $uniq{ lc $strings[$i] } += $heatmap{$_}; | 
| 312 | 42 | 100 | 100 |  |  | 171 | if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) { | 
| 313 | 10 |  |  |  |  | 11 | $num_proximate++; | 
| 314 |  |  |  |  |  |  | } | 
| 315 |  |  |  |  |  |  | } | 
| 316 | 2000 |  |  |  |  | 1282 | $i++; | 
| 317 |  |  |  |  |  |  | } | 
| 318 | 20 |  |  |  |  | 45 | $span{unique}    = scalar keys %uniq; | 
| 319 | 20 |  |  |  |  | 30 | $span{proximate} = $num_proximate; | 
| 320 |  |  |  |  |  |  |  | 
| 321 |  |  |  |  |  |  | # no false phrase matches if !_treat_phrases_as_singles | 
| 322 |  |  |  |  |  |  | # stemmer check because regex will likely fail | 
| 323 |  |  |  |  |  |  | # when stemmer is on | 
| 324 | 20 | 100 | 66 |  |  | 80 | if ( $query_has_phrase | 
| 325 |  |  |  |  |  |  | and !$self->{_treat_phrases_as_singles} ) | 
| 326 |  |  |  |  |  |  | { | 
| 327 | 7 | 100 |  |  |  | 16 | if ( !$self->{_stemmer} ) { | 
| 328 |  |  |  |  |  |  |  | 
| 329 |  |  |  |  |  |  | #warn "_treat_phrases_as_singles NOT true"; | 
| 330 | 3 | 50 |  |  |  | 106 | if ( $span{str} !~ m/$qre/ ) { | 
| 331 | 0 | 0 |  |  |  | 0 | $debug | 
| 332 |  |  |  |  |  |  | and warn | 
| 333 |  |  |  |  |  |  | "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n"; | 
| 334 | 0 |  |  |  |  | 0 | next START_END; | 
| 335 |  |  |  |  |  |  | } | 
| 336 |  |  |  |  |  |  | } | 
| 337 |  |  |  |  |  |  | else { | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | # if stemmer was on, we cannot rely on the regex, | 
| 340 |  |  |  |  |  |  | # but we assume that number of uniq terms must match query | 
| 341 |  |  |  |  |  |  |  | 
| 342 | 4 | 50 | 66 |  |  | 16 | if (   $n_terms == $query_has_phrase | 
| 343 |  |  |  |  |  |  | && $n_terms > $span{unique} ) | 
| 344 |  |  |  |  |  |  | { | 
| 345 |  |  |  |  |  |  |  | 
| 346 | 0 | 0 |  |  |  | 0 | $debug | 
| 347 |  |  |  |  |  |  | and warn | 
| 348 |  |  |  |  |  |  | "treat_phrases_as_singles=FALSE and '$span{str}' " | 
| 349 |  |  |  |  |  |  | . "expected $n_terms unique terms, got $span{unique}\n"; | 
| 350 | 0 |  |  |  |  | 0 | next START_END; | 
| 351 |  |  |  |  |  |  | } | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | } | 
| 354 |  |  |  |  |  |  | } | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | # just for debug | 
| 357 | 20 | 50 |  |  |  | 53 | if ($debug) { | 
| 358 | 0 |  |  |  |  | 0 | my $i = 0; | 
| 359 |  |  |  |  |  |  | $span{str_w_pos} = join( | 
| 360 |  |  |  |  |  |  | '', | 
| 361 |  |  |  |  |  |  | map { | 
| 362 | 0 |  |  |  |  | 0 | $strings[ $i++ ] | 
| 363 |  |  |  |  |  |  | . ( exists $heatmap{$_} ? $OPEN : '[' ) | 
| 364 |  |  |  |  |  |  | . $_ | 
| 365 | 0 | 0 |  |  |  | 0 | . ( exists $heatmap{$_} ? $CLOSE : ']' ) | 
|  |  | 0 |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | } @cluster_pos | 
| 367 |  |  |  |  |  |  | ); | 
| 368 |  |  |  |  |  |  | } | 
| 369 |  |  |  |  |  |  |  | 
| 370 | 20 |  |  |  |  | 130 | push @spans, \%span; | 
| 371 |  |  |  |  |  |  |  | 
| 372 |  |  |  |  |  |  | } | 
| 373 |  |  |  |  |  |  |  | 
| 374 | 13 |  |  |  |  | 50 | $self->{spans}   = $self->_sort_spans( \@spans ); | 
| 375 | 13 |  |  |  |  | 30 | $self->{heatmap} = \%heatmap; | 
| 376 |  |  |  |  |  |  |  | 
| 377 | 13 |  |  |  |  | 195 | return $self; | 
| 378 |  |  |  |  |  |  | } | 
| 379 |  |  |  |  |  |  |  | 
| 380 |  |  |  |  |  |  | sub _sort_spans { | 
| 381 |  |  |  |  |  |  | return [ | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | # sort by unique, | 
| 384 |  |  |  |  |  |  | # then by proximity | 
| 385 |  |  |  |  |  |  | # then by heat | 
| 386 |  |  |  |  |  |  | # then by pos | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | sort { | 
| 389 |  |  |  |  |  |  | $b->{unique} <=> $a->{unique} | 
| 390 |  |  |  |  |  |  | || $b->{proximate} <=> $a->{proximate} | 
| 391 |  |  |  |  |  |  | || $b->{heat} <=> $a->{heat} | 
| 392 | 51 | 50 | 66 |  |  | 267 | || $a->{pos}->[0] <=> $b->{pos}->[0] | 
|  |  |  | 100 |  |  |  |  | 
| 393 | 30 |  |  | 30 |  | 49 | } @{ $_[1] } | 
|  | 30 |  |  |  |  | 126 |  | 
| 394 |  |  |  |  |  |  |  | 
| 395 |  |  |  |  |  |  | ]; | 
| 396 |  |  |  |  |  |  | } | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | sub _no_sentences { | 
| 399 | 17 |  |  | 17 |  | 31 | my ( $self, $tokens, $window ) = @_; | 
| 400 | 17 |  |  |  |  | 52 | my $lhs_window = int( $window / 2 ); | 
| 401 | 17 |  | 50 |  |  | 299 | my $debug = $self->debug || 0; | 
| 402 |  |  |  |  |  |  |  | 
| 403 | 17 |  |  |  |  | 154 | my $num_tokens      = $tokens->len; | 
| 404 | 17 |  |  |  |  | 46 | my $tokens_arr      = $tokens->as_array; | 
| 405 | 17 |  |  |  |  | 29 | my %heatmap         = (); | 
| 406 | 17 |  |  |  |  | 50 | my $token_list_heat = $tokens->get_heat; | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | # this regex is a sanity check for phrases. we replace the escaped space ('\ ') with a | 
| 409 |  |  |  |  |  |  | # more promiscuous pattern because a single literal space is too naive | 
| 410 |  |  |  |  |  |  | # for real text (e.g. st. john's) | 
| 411 | 17 |  |  |  |  | 30 | my $qre              = $self->{_qre}; | 
| 412 | 17 |  |  |  |  | 21 | my @phrases          = @{ $self->{_query}->phrases }; | 
|  | 17 |  |  |  |  | 65 |  | 
| 413 | 17 |  |  |  |  | 64 | my $n_terms          = $self->{_query}->num_terms; | 
| 414 | 17 |  |  |  |  | 70 | my $query_has_phrase = $qre =~ s/(\\ )+/.+/g; | 
| 415 |  |  |  |  |  |  |  | 
| 416 | 17 | 50 |  |  |  | 43 | if ($debug) { | 
| 417 | 0 |  |  |  |  | 0 | warn "token_list_heat: " . dump($token_list_heat); | 
| 418 | 0 |  |  |  |  | 0 | warn "n_terms: $n_terms"; | 
| 419 | 0 |  |  |  |  | 0 | warn "phrases: " . dump( \@phrases ); | 
| 420 | 0 |  |  |  |  | 0 | warn "query_has_phrase: $query_has_phrase"; | 
| 421 |  |  |  |  |  |  | } | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | # build heatmap | 
| 424 | 17 |  |  |  |  | 39 | for (@$token_list_heat) { | 
| 425 | 77 |  |  |  |  | 139 | my $token = $tokens->get_token($_); | 
| 426 | 77 |  |  |  |  | 247 | $heatmap{ $token->pos } = $token->is_hot; | 
| 427 |  |  |  |  |  |  | } | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  | # make clusters | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | # $proximity == (1/4 of $window)+1 is somewhat arbitrary, | 
| 432 |  |  |  |  |  |  | # but since we want to err in having too much context, | 
| 433 |  |  |  |  |  |  | # we aim high. Worst case scenario is where there are | 
| 434 |  |  |  |  |  |  | # multiple hot spots in a cluster and each is a full | 
| 435 |  |  |  |  |  |  | # $proximity length apart, which will grow the | 
| 436 |  |  |  |  |  |  | # eventual span far beyond $window size. We rely | 
| 437 |  |  |  |  |  |  | # on max_chars in Snipper to catch that worst case. | 
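|  |  |  |  |  |  |  | # worked example (hypothetical numbers): with the default window_size of 20, | 
|  |  |  |  |  |  |  | # lhs_window = 10 and proximity = 6, so hot positions 5 and 10 fall in the same | 
|  |  |  |  |  |  |  | # cluster (10 - 5 <= 6) while 5 and 12 start separate clusters (12 - 5 > 6). | 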
| 438 | 17 |  |  |  |  | 45 | my $proximity = int( $lhs_window / 2 ) + 1; | 
| 439 | 17 |  |  |  |  | 76 | my @positions = sort { $a <=> $b } keys %heatmap; | 
|  | 151 |  |  |  |  | 142 |  | 
| 440 | 17 |  |  |  |  | 38 | my @clusters  = ( [] ); | 
| 441 | 17 |  |  |  |  | 21 | my $i         = 0; | 
| 442 | 17 |  |  |  |  | 30 | for my $pos (@positions) { | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | # if we have advanced past the first position | 
| 445 |  |  |  |  |  |  | # and the previous position is not "close" to this one, | 
| 446 |  |  |  |  |  |  | # start a new cluster | 
| 447 | 77 | 100 | 100 |  |  | 214 | if ( $i && ( $pos - $positions[ $i - 1 ] ) > $proximity ) { | 
| 448 | 33 |  |  |  |  | 46 | push( @clusters, [$pos] ); | 
| 449 |  |  |  |  |  |  | } | 
| 450 |  |  |  |  |  |  | else { | 
| 451 | 44 |  |  |  |  | 33 | push( @{ $clusters[-1] }, $pos ); | 
|  | 44 |  |  |  |  | 60 |  | 
| 452 |  |  |  |  |  |  | } | 
| 453 | 77 |  |  |  |  | 72 | $i++; | 
| 454 |  |  |  |  |  |  | } | 
| 455 |  |  |  |  |  |  |  | 
| 456 |  |  |  |  |  |  | $debug | 
| 457 | 17 | 50 |  |  |  | 36 | and warn "proximity: $proximity   clusters: " . dump \@clusters; | 
| 458 |  |  |  |  |  |  |  | 
| 459 |  |  |  |  |  |  | # create spans from each cluster, each with a weight. | 
| 460 |  |  |  |  |  |  | # we do the initial sort so that clusters that overlap | 
| 461 |  |  |  |  |  |  | # other clusters via get_window() are weeded out via %seen_pos. | 
| 462 | 17 |  |  |  |  | 26 | my @spans; | 
| 463 |  |  |  |  |  |  | my %seen_pos; | 
| 464 |  |  |  |  |  |  | CLUSTER: | 
| 465 | 17 |  |  |  |  | 32 | for my $cluster ( | 
| 466 |  |  |  |  |  |  | sort { | 
| 467 |  |  |  |  |  |  | scalar(@$b) <=> scalar(@$a) | 
| 468 | 61 | 50 | 66 |  |  | 206 | || $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] } | 
| 469 |  |  |  |  |  |  | || $a->[0] <=> $b->[0] | 
| 470 |  |  |  |  |  |  | } @clusters | 
| 471 |  |  |  |  |  |  | ) | 
| 472 |  |  |  |  |  |  | { | 
| 473 |  |  |  |  |  |  |  | 
| 474 |  |  |  |  |  |  | # get full window, ignoring positions we've already seen. | 
| 475 | 50 |  |  |  |  | 45 | my $heat = 0; | 
| 476 | 50 |  |  |  |  | 36 | my %span; | 
| 477 |  |  |  |  |  |  | my @cluster_tokens; | 
| 478 | 50 |  |  |  |  | 59 | POS: for my $pos (@$cluster) { | 
| 479 | 77 |  |  |  |  | 198 | my ( $start, $end ) = $tokens->get_window( $pos, $window ); | 
| 480 | 77 |  |  |  |  | 122 | POS_TWO: for my $pos2 ( $start .. $end ) { | 
| 481 | 3357 | 100 |  |  |  | 5728 | next if $seen_pos{$pos2}++; | 
| 482 | 1513 | 100 |  |  |  | 1431 | $heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 ); | 
| 483 | 1513 |  |  |  |  | 1851 | push( @cluster_tokens, $tokens->get_token($pos2) ); | 
| 484 |  |  |  |  |  |  | } | 
| 485 |  |  |  |  |  |  | } | 
| 486 |  |  |  |  |  |  |  | 
| 487 |  |  |  |  |  |  | # we may have skipped positions via %seen_pos above, | 
| 488 |  |  |  |  |  |  | # so make sure we still start/end on a match | 
| 489 | 50 |  | 66 |  |  | 226 | while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) { | 
| 490 | 11 |  |  |  |  | 49 | shift @cluster_tokens; | 
| 491 |  |  |  |  |  |  | } | 
| 492 | 50 |  | 66 |  |  | 2075 | while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) { | 
| 493 | 6 |  |  |  |  | 25 | pop @cluster_tokens; | 
| 494 |  |  |  |  |  |  | } | 
| 495 |  |  |  |  |  |  |  | 
| 496 | 50 | 50 |  |  |  | 79 | next CLUSTER unless @cluster_tokens; | 
| 497 |  |  |  |  |  |  |  | 
| 498 |  |  |  |  |  |  | # sanity: make sure we still have something hot | 
| 499 | 50 |  |  |  |  | 43 | my $has_hot = 0; | 
| 500 | 50 |  |  |  |  | 38 | my @cluster_pos; | 
| 501 |  |  |  |  |  |  | my @strings; | 
| 502 | 50 |  |  |  |  | 62 | for (@cluster_tokens) { | 
| 503 | 1496 |  |  |  |  | 1378 | my $pos = $_->pos; | 
| 504 | 1496 | 100 |  |  |  | 1711 | $has_hot++ if exists $heatmap{$pos}; | 
| 505 | 1496 |  |  |  |  | 1715 | push @strings,     $_->str; | 
| 506 | 1496 |  |  |  |  | 1366 | push @cluster_pos, $pos; | 
| 507 |  |  |  |  |  |  | } | 
| 508 | 50 | 100 |  |  |  | 111 | next CLUSTER unless $has_hot; | 
| 509 |  |  |  |  |  |  |  | 
| 510 | 39 |  |  |  |  | 52 | $span{cluster} = $cluster; | 
| 511 | 39 |  |  |  |  | 51 | $span{heat}    = $heat; | 
| 512 | 39 |  |  |  |  | 44 | $span{pos}     = \@cluster_pos; | 
| 513 | 39 |  |  |  |  | 48 | $span{tokens}  = \@cluster_tokens; | 
| 514 | 39 |  |  |  |  | 125 | $span{str}     = join( '', @strings ); | 
| 515 |  |  |  |  |  |  |  | 
| 516 |  |  |  |  |  |  | # spans with more *unique* hot tokens in a single span rank higher | 
| 517 |  |  |  |  |  |  | # spans with more *proximate* hot tokens in a single span rank higher | 
| 518 | 39 |  |  |  |  | 53 | my %uniq          = (); | 
| 519 | 39 |  |  |  |  | 34 | my $i             = 0; | 
| 520 | 39 |  |  |  |  | 30 | my $num_proximate = 1;    # one for the single hot token | 
| 521 | 39 |  |  |  |  | 46 | for (@cluster_pos) { | 
| 522 | 1341 | 100 |  |  |  | 1454 | if ( exists $heatmap{$_} ) { | 
| 523 | 77 |  |  |  |  | 154 | $uniq{ lc $strings[$i] } += $heatmap{$_}; | 
| 524 | 77 | 100 | 100 |  |  | 251 | if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) { | 
| 525 | 23 |  |  |  |  | 22 | $num_proximate++; | 
| 526 |  |  |  |  |  |  | } | 
| 527 |  |  |  |  |  |  | } | 
| 528 | 1341 |  |  |  |  | 849 | $i++; | 
| 529 |  |  |  |  |  |  | } | 
| 530 | 39 |  |  |  |  | 55 | $span{unique}    = scalar keys %uniq; | 
| 531 | 39 |  |  |  |  | 45 | $span{proximate} = $num_proximate; | 
| 532 |  |  |  |  |  |  |  | 
| 533 |  |  |  |  |  |  | # no false phrase matches if !_treat_phrases_as_singles | 
| 534 |  |  |  |  |  |  | # stemmer check because regex will likely fail when stemmer is on | 
| 535 | 39 | 100 | 66 |  |  | 98 | if ( $query_has_phrase | 
| 536 |  |  |  |  |  |  | and !$self->{_treat_phrases_as_singles} ) | 
| 537 |  |  |  |  |  |  | { | 
| 538 | 3 | 100 |  |  |  | 11 | if ( !$self->{_stemmer} ) { | 
| 539 |  |  |  |  |  |  |  | 
| 540 |  |  |  |  |  |  | #warn "_treat_phrases_as_singles NOT true"; | 
| 541 | 1 | 50 |  |  |  | 37 | if ( $span{str} !~ m/$qre/ ) { | 
| 542 | 1 | 50 |  |  |  | 3 | $debug | 
| 543 |  |  |  |  |  |  | and warn | 
| 544 |  |  |  |  |  |  | "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n"; | 
| 545 | 1 |  |  |  |  | 6 | next CLUSTER; | 
| 546 |  |  |  |  |  |  | } | 
| 547 |  |  |  |  |  |  | } | 
| 548 |  |  |  |  |  |  | else { | 
| 549 |  |  |  |  |  |  |  | 
| 550 |  |  |  |  |  |  | # stemmer used, so check unique term count against n_terms | 
| 551 | 2 | 50 | 66 |  |  | 13 | if (   $n_terms == $query_has_phrase | 
| 552 |  |  |  |  |  |  | && $n_terms > $span{unique} ) | 
| 553 |  |  |  |  |  |  | { | 
| 554 | 0 | 0 |  |  |  | 0 | $debug | 
| 555 |  |  |  |  |  |  | and warn | 
| 556 |  |  |  |  |  |  | "treat_phrases_as_singles=FALSE and '$span{str}' " | 
| 557 |  |  |  |  |  |  | . "expected $n_terms but got $span{unique}\n"; | 
| 558 | 0 |  |  |  |  | 0 | next CLUSTER; | 
| 559 |  |  |  |  |  |  | } | 
| 560 |  |  |  |  |  |  |  | 
| 561 |  |  |  |  |  |  | } | 
| 562 |  |  |  |  |  |  | } | 
| 563 |  |  |  |  |  |  |  | 
| 564 |  |  |  |  |  |  | # just for debug | 
| 565 | 38 | 50 |  |  |  | 61 | if ($debug) { | 
| 566 | 0 |  |  |  |  | 0 | my $i = 0; | 
| 567 |  |  |  |  |  |  | $span{str_w_pos} = join( | 
| 568 |  |  |  |  |  |  | '', | 
| 569 |  |  |  |  |  |  | map { | 
| 570 | 0 |  |  |  |  | 0 | $strings[ $i++ ] | 
| 571 |  |  |  |  |  |  | . ( exists $heatmap{$_} ? $OPEN : '[' ) | 
| 572 |  |  |  |  |  |  | . $_ | 
| 573 | 0 | 0 |  |  |  | 0 | . ( exists $heatmap{$_} ? $CLOSE : ']' ) | 
|  |  | 0 |  |  |  |  |  | 
| 574 |  |  |  |  |  |  | } @cluster_pos | 
| 575 |  |  |  |  |  |  | ); | 
| 576 |  |  |  |  |  |  | } | 
| 577 |  |  |  |  |  |  |  | 
| 578 | 38 |  |  |  |  | 137 | push @spans, \%span; | 
| 579 |  |  |  |  |  |  |  | 
| 580 |  |  |  |  |  |  | } | 
| 581 |  |  |  |  |  |  |  | 
| 582 | 17 |  |  |  |  | 1288 | $self->{spans}   = $self->_sort_spans( \@spans ); | 
| 583 | 17 |  |  |  |  | 33 | $self->{heatmap} = \%heatmap; | 
| 584 |  |  |  |  |  |  |  | 
| 585 | 17 |  |  |  |  | 161 | return $self; | 
| 586 |  |  |  |  |  |  | } | 
| 587 |  |  |  |  |  |  |  | 
| 588 |  |  |  |  |  |  | =head2 has_spans | 
| 589 |  |  |  |  |  |  |  | 
| 590 |  |  |  |  |  |  | Returns the number of spans found. | 
| 591 |  |  |  |  |  |  |  | 
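|  |  |  |  |  |  |  | Despite the boolean-sounding name this is a count, so it works either way: | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  |     my $n = $heatmap->has_spans;    # 0, 1, 2, ... | 
|  |  |  |  |  |  |  |     print "found $n spans\n" if $n; | 
|  |  |  |  |  |  |  |  | 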
| 592 |  |  |  |  |  |  | =cut | 
| 593 |  |  |  |  |  |  |  | 
| 594 |  |  |  |  |  |  | sub has_spans { | 
| 595 | 30 |  |  | 30 | 1 | 43 | return scalar @{ $_[0]->{spans} }; | 
|  | 30 |  |  |  |  | 111 |  | 
| 596 |  |  |  |  |  |  | } | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | 1; | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | __END__ |