| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Search::Tools::HiLiter; | 
| 2 | 16 |  |  | 16 |  | 17055 | use Moo; | 
|  | 16 |  |  |  |  | 60072 |  | 
|  | 16 |  |  |  |  | 120 |  | 
| 3 |  |  |  |  |  |  | extends 'Search::Tools::Object'; | 
| 4 |  |  |  |  |  |  | with 'Search::Tools::ArgNormalizer'; | 
| 5 | 16 |  |  | 16 |  | 12420 | use Carp; | 
|  | 16 |  |  |  |  | 41 |  | 
|  | 16 |  |  |  |  | 1233 |  | 
| 6 | 16 |  |  | 16 |  | 2529 | use Search::Tools::Tokenizer; | 
|  | 16 |  |  |  |  | 52 |  | 
|  | 16 |  |  |  |  | 566 |  | 
| 7 | 16 |  |  | 16 |  | 2759 | use Search::Tools::XML; | 
|  | 16 |  |  |  |  | 55 |  | 
|  | 16 |  |  |  |  | 613 |  | 
| 8 | 16 |  |  | 16 |  | 124 | use Search::Tools::UTF8; | 
|  | 16 |  |  |  |  | 43 |  | 
|  | 16 |  |  |  |  | 1848 |  | 
| 9 | 16 |  |  | 16 |  | 120 | use Data::Dump qw( dump ); | 
|  | 16 |  |  |  |  | 53 |  | 
|  | 16 |  |  |  |  | 1001 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 16 |  |  | 16 |  | 124 | use namespace::autoclean; | 
|  | 16 |  |  |  |  | 42 |  | 
|  | 16 |  |  |  |  | 143 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | our $VERSION = '1.007'; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | my $XML = Search::Tools::XML->new; | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | my @attrs = qw( | 
| 18 |  |  |  |  |  |  | query | 
| 19 |  |  |  |  |  |  | tag | 
| 20 |  |  |  |  |  |  | class | 
| 21 |  |  |  |  |  |  | style | 
| 22 |  |  |  |  |  |  | text_color | 
| 23 |  |  |  |  |  |  | colors | 
| 24 |  |  |  |  |  |  | tty | 
| 25 |  |  |  |  |  |  | ttycolors | 
| 26 |  |  |  |  |  |  | no_html | 
| 27 |  |  |  |  |  |  | ); | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | for my $attr (@attrs) { | 
| 30 |  |  |  |  |  |  | has $attr => ( is => 'rw' ); | 
| 31 |  |  |  |  |  |  | } | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | sub BUILD { | 
| 34 | 23 |  |  | 23 | 1 | 489 | my $self = shift; | 
| 35 |  |  |  |  |  |  |  | 
| 36 | 23 | 50 |  |  |  | 529 | if ( $self->debug ) { | 
| 37 | 0 |  |  |  |  | 0 | carp "debug level set at " . $self->debug; | 
| 38 |  |  |  |  |  |  | } | 
| 39 |  |  |  |  |  |  |  | 
| 40 | 23 |  |  |  |  | 766 | $self->{_tokenizer} = Search::Tools::Tokenizer->new( | 
| 41 |  |  |  |  |  |  | re    => $self->query->qp->term_re, | 
| 42 |  |  |  |  |  |  | debug => $self->debug, | 
| 43 |  |  |  |  |  |  | ); | 
| 44 |  |  |  |  |  |  |  | 
| 45 | 23 |  | 100 |  |  | 178 | $self->{tag} ||= 'span'; | 
| 46 | 23 |  | 50 |  |  | 194 | $self->{colors} ||= [ '#ffff99', '#99ffff', '#ffccff', '#ccccff' ]; | 
| 47 | 23 |  | 50 |  |  | 159 | $self->{ttycolors} ||= [ 'bold blue', 'bold red', 'bold green' ]; | 
| 48 |  |  |  |  |  |  |  | 
| 49 | 23 | 100 |  |  |  | 112 | if ( $self->tty ) { | 
| 50 | 7 |  |  |  |  | 13 | eval { require Term::ANSIColor }; | 
|  | 7 |  |  |  |  | 2159 |  | 
| 51 | 7 | 50 |  |  |  | 30840 | $self->tty(0) if $@; | 
| 52 |  |  |  |  |  |  | } | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 23 |  |  |  |  | 108 | $self->_build_tags; | 
| 55 |  |  |  |  |  |  | } | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | sub terms { | 
| 58 | 0 |  |  | 0 | 1 | 0 | return shift->{query}->terms; | 
| 59 |  |  |  |  |  |  | } | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | sub keywords { | 
| 62 | 0 |  |  | 0 | 1 | 0 | return @{ shift->terms }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 63 |  |  |  |  |  |  | } | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | sub _phrases { | 
| 66 | 0 |  |  | 0 |  | 0 | my $self = shift; | 
| 67 | 0 |  |  |  |  | 0 | my $q    = $self->{query}; | 
| 68 | 0 |  |  |  |  | 0 | return grep { $self->_regex_for($_)->is_phrase } @{ $q->terms }; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 69 |  |  |  |  |  |  | } | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | sub _singles { | 
| 72 | 0 |  |  | 0 |  | 0 | my $self = shift; | 
| 73 | 0 |  |  |  |  | 0 | my $q    = $self->{query}; | 
| 74 | 0 |  |  |  |  | 0 | return grep { !$self->_regex_for($_)->is_phrase } @{ $q->terms }; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 75 |  |  |  |  |  |  | } | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | sub _kworder { | 
| 78 | 48 |  |  | 48 |  | 108 | my $self = shift; | 
| 79 | 48 |  |  |  |  | 118 | my $q    = $self->{query}; | 
| 80 | 48 |  |  |  |  | 166 | my $qstr = $q->str; | 
| 81 | 48 | 100 |  |  |  | 210 | if ( exists $self->{_kworder_cache}->{$qstr} ) { | 
| 82 | 25 |  |  |  |  | 64 | return @{ $self->{_kworder_cache}->{$qstr} }; | 
|  | 25 |  |  |  |  | 155 |  | 
| 83 |  |  |  |  |  |  | } | 
| 84 |  |  |  |  |  |  |  | 
| 85 |  |  |  |  |  |  | # do phrases first so that duplicates privilege phrases | 
| 86 | 23 |  |  |  |  | 60 | my ( @phrases, @singles ); | 
| 87 |  |  |  |  |  |  |  | 
| 88 | 23 |  |  |  |  | 49 | for ( @{ $q->terms } ) { | 
|  | 23 |  |  |  |  | 117 |  | 
| 89 | 60 | 100 |  |  |  | 224 | if ( $self->_regex_for($_)->is_phrase ) { | 
| 90 | 15 |  |  |  |  | 57 | push @phrases, $_; | 
| 91 |  |  |  |  |  |  | } | 
| 92 |  |  |  |  |  |  | else { | 
| 93 | 45 |  |  |  |  | 179 | push @singles, $_; | 
| 94 |  |  |  |  |  |  | } | 
| 95 |  |  |  |  |  |  | } | 
| 96 |  |  |  |  |  |  |  | 
| 97 | 23 |  |  |  |  | 153 | $self->{_kworder_cache}->{$qstr} = [ @phrases, @singles ]; | 
| 98 |  |  |  |  |  |  |  | 
| 99 | 23 |  |  |  |  | 131 | return ( @phrases, @singles ); | 
| 100 |  |  |  |  |  |  | } | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | sub _build_tags { | 
| 103 | 23 |  |  | 23 |  | 52 | my $self = shift; | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 23 |  |  |  |  | 60 | my $t         = {}; | 
| 106 | 23 |  |  |  |  | 45 | my @colors    = @{ $self->colors }; | 
|  | 23 |  |  |  |  | 113 |  | 
| 107 | 23 |  |  |  |  | 56 | my @ttycolors = @{ $self->ttycolors }; | 
|  | 23 |  |  |  |  | 98 |  | 
| 108 | 23 |  |  |  |  | 90 | my $tag       = $self->tag; | 
| 109 |  |  |  |  |  |  |  | 
| 110 | 23 |  |  |  |  | 51 | my $n = 0; | 
| 111 | 23 |  |  |  |  | 51 | my $m = 0; | 
| 112 |  |  |  |  |  |  |  | 
| 113 | 23 |  |  |  |  | 91 | for my $q ( $self->_kworder ) { | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | # if tty flag is on, use ansicolor instead of html | 
| 116 |  |  |  |  |  |  | # if debug flag is on, use both html and ansicolor | 
| 117 |  |  |  |  |  |  |  | 
| 118 | 60 |  |  |  |  | 123 | my ( %tags, $opener ); | 
| 119 | 60 |  |  |  |  | 168 | $tags{open}  = ''; | 
| 120 | 60 |  |  |  |  | 152 | $tags{close} = ''; | 
| 121 | 60 | 100 |  |  |  | 365 | if ( $self->class ) { | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
| 122 | 10 |  |  |  |  | 41 | $opener = qq/<$tag class='/ . $self->class . qq/'>/; | 
| 123 |  |  |  |  |  |  | } | 
| 124 |  |  |  |  |  |  | elsif ( $self->style ) { | 
| 125 | 0 |  |  |  |  | 0 | $opener = qq/<$tag style='/ . $self->style . qq/'>/; | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  | elsif ( $self->text_color ) { | 
| 128 | 6 |  |  |  |  | 23 | $opener | 
| 129 |  |  |  |  |  |  | = qq/<$tag style='color:/ | 
| 130 |  |  |  |  |  |  | . $self->text_color | 
| 131 |  |  |  |  |  |  | . qq/;background:/ | 
| 132 |  |  |  |  |  |  | . $colors[$n] . qq/'>/; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  | else { | 
| 135 | 44 |  |  |  |  | 190 | $opener = qq/<$tag style='background:/ . $colors[$n] . qq/'>/; | 
| 136 |  |  |  |  |  |  | } | 
| 137 |  |  |  |  |  |  |  | 
| 138 | 60 | 100 |  |  |  | 166 | if ( $self->tty ) { | 
| 139 | 15 | 50 | 33 |  |  | 336 | $tags{open} .= $opener if $self->debug && !$self->no_html; | 
| 140 | 15 |  |  |  |  | 173 | $tags{open}  .= Term::ANSIColor::color( $ttycolors[$m] ); | 
| 141 | 15 |  |  |  |  | 417 | $tags{close} .= Term::ANSIColor::color('reset'); | 
| 142 | 15 | 50 | 33 |  |  | 559 | $tags{close} .= "$tag>" if $self->debug && !$self->no_html; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  | else { | 
| 145 | 45 |  |  |  |  | 123 | $tags{open}  .= $opener; | 
| 146 | 45 |  |  |  |  | 116 | $tags{close} .= "$tag>"; | 
| 147 |  |  |  |  |  |  | } | 
| 148 |  |  |  |  |  |  |  | 
| 149 | 60 |  |  |  |  | 262 | $t->{$q} = \%tags; | 
| 150 |  |  |  |  |  |  |  | 
| 151 | 60 | 100 |  |  |  | 225 | $n = 0 if ++$n > $#colors; | 
| 152 | 60 | 100 |  |  |  | 209 | $m = 0 if ++$m > $#ttycolors; | 
| 153 |  |  |  |  |  |  | } | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 23 |  |  |  |  | 2021 | $self->{_tags} = $t; | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | sub open_tag { | 
| 159 | 69 |  |  | 69 | 1 | 127 | my $self = shift; | 
| 160 | 69 | 50 |  |  |  | 196 | my $q = shift or croak "need query to get open_tag"; | 
| 161 | 69 |  | 50 |  |  | 318 | return $self->{_tags}->{$q}->{open} || ''; | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | sub close_tag { | 
| 165 | 69 |  |  | 69 | 1 | 193 | my $self = shift; | 
| 166 | 69 | 50 |  |  |  | 180 | my $q = shift or croak "need query to get close_tag"; | 
| 167 | 69 |  | 50 |  |  | 262 | return $self->{_tags}->{$q}->{close} || ''; | 
| 168 |  |  |  |  |  |  | } | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | sub light { | 
| 171 | 25 |  |  | 25 | 1 | 106 | my $self = shift; | 
| 172 | 25 | 50 |  |  |  | 109 | my $text = shift or return ''; | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | # force upgrade. this is so regex will match ok. | 
| 175 | 25 |  |  |  |  | 157 | $text = to_utf8($text); | 
| 176 |  |  |  |  |  |  |  | 
| 177 | 25 | 100 | 66 |  |  | 161 | if ( $XML->looks_like_html($text) && !$self->no_html ) { | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | #warn "running ->html"; | 
| 180 | 11 | 50 |  |  |  | 87 | if ( $self->query->qp->stemmer ) { | 
| 181 | 0 |  |  |  |  | 0 | return $self->html_stemmer($text); | 
| 182 |  |  |  |  |  |  | } | 
| 183 | 11 |  |  |  |  | 44 | return $self->html($text); | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  | else { | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | #warn "running ->plain"; | 
| 188 | 14 | 100 |  |  |  | 135 | if ( $self->query->qp->stemmer ) { | 
| 189 | 1 |  |  |  |  | 4 | return $self->plain_stemmer($text); | 
| 190 |  |  |  |  |  |  | } | 
| 191 | 13 |  |  |  |  | 128 | return $self->plain($text); | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | *hilite = \&light; | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | sub _get_real_html { | 
| 198 | 29 |  |  | 29 |  | 46 | my $self  = shift; | 
| 199 | 29 |  |  |  |  | 61 | my $text  = shift; | 
| 200 | 29 |  |  |  |  | 47 | my $re    = shift; | 
| 201 | 29 |  |  |  |  | 49 | my $m     = {}; | 
| 202 | 29 | 50 |  |  |  | 714 | my $debug = $self->debug > 1 ? 1 : 0; | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | # $1 should be st_bound, $2 should be query, $3 should be end_bound | 
| 205 |  |  |  |  |  |  | # N.B. The XS version of this algorithm is only a hair faster, | 
| 206 |  |  |  |  |  |  | # since the $re is the bottleneck. | 
| 207 | 29 |  |  |  |  | 46940 | while ( $$text =~ m/$re/g ) { | 
| 208 |  |  |  |  |  |  |  | 
| 209 | 43 |  |  |  |  | 241 | my $pos = pos($$text); | 
| 210 |  |  |  |  |  |  |  | 
| 211 | 43 | 50 |  |  |  | 105 | if ($debug) { | 
| 212 | 0 |  |  |  |  | 0 | carp "$2 matches $re"; | 
| 213 | 0 |  |  |  |  | 0 | carp "\$1='$1'\n\$2='$2'\n\$3='$3'\npos=$pos"; | 
| 214 |  |  |  |  |  |  | } | 
| 215 |  |  |  |  |  |  |  | 
| 216 | 43 |  |  |  |  | 191 | $m->{$2}++; | 
| 217 |  |  |  |  |  |  |  | 
| 218 |  |  |  |  |  |  | # move back and consider $3 again as possible $1 for next match | 
| 219 | 43 | 50 |  |  |  | 124 | if ( length($3) ) { | 
| 220 | 43 |  |  |  |  | 39006 | pos($$text) = $pos - 1; | 
| 221 |  |  |  |  |  |  | } | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | } | 
| 224 |  |  |  |  |  |  |  | 
| 225 | 29 |  |  |  |  | 128 | return $m; | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | sub _regex_for { | 
| 230 | 124 |  |  | 124 |  | 213 | my $self = shift; | 
| 231 | 124 | 50 |  |  |  | 317 | my $term = shift or croak "term required"; | 
| 232 | 124 | 100 |  |  |  | 407 | if ( exists $self->{_regex_for}->{$term} ) { | 
| 233 | 64 |  |  |  |  | 220 | return $self->{_regex_for}->{$term}; | 
| 234 |  |  |  |  |  |  | } | 
| 235 | 60 |  |  |  |  | 304 | $self->{_regex_for}->{$term} = $self->query->regex_for($term); | 
| 236 | 60 |  |  |  |  | 310 | return $self->{_regex_for}->{$term}; | 
| 237 |  |  |  |  |  |  | } | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | # based on HTML::HiLiter hilite() | 
| 240 |  |  |  |  |  |  | sub html { | 
| 241 | 11 |  |  | 11 | 1 | 23 | my $self = shift; | 
| 242 | 11 | 50 |  |  |  | 38 | my $text = shift or croak "need text to light()"; | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | ################################################################### | 
| 245 |  |  |  |  |  |  | # 1.	create hash of query -> [ array of real HTML to hilite ] | 
| 246 |  |  |  |  |  |  | # 	    using the prebuilt regexp | 
| 247 |  |  |  |  |  |  | # 2.    hilite the real HTML | 
| 248 |  |  |  |  |  |  | ################################################################### | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | ## 1 | 
| 251 |  |  |  |  |  |  |  | 
| 252 | 11 |  |  |  |  | 35 | my $q2real = {}; | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | # this is going to be query => [ real_html ] | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | # if the query text matched in the text, then we need to | 
| 257 |  |  |  |  |  |  | # use our prebuilt regexp | 
| 258 | 11 |  |  |  |  | 37 | my @kworder = $self->_kworder; | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | # don't consider anything we've marked | 
| 261 |  |  |  |  |  |  | # with a 'nohiliter' attribute | 
| 262 | 11 |  |  |  |  | 29 | my $text_copy = $text; | 
| 263 | 11 |  |  |  |  | 212 | $text_copy =~ s/\002.*?\003//sgi; | 
| 264 |  |  |  |  |  |  |  | 
| 265 | 11 |  |  |  |  | 31 | Q: for my $query (@kworder) { | 
| 266 | 29 |  |  |  |  | 95 | my $re = $self->_regex_for($query)->html; | 
| 267 | 29 |  |  |  |  | 86 | my $real = $self->_get_real_html( \$text_copy, $re ); | 
| 268 |  |  |  |  |  |  |  | 
| 269 | 29 |  |  |  |  | 158 | R: for my $r ( keys %$real ) { | 
| 270 | 35 |  |  |  |  | 117 | push( @{ $q2real->{$query} }, $r ) while $real->{$r}--; | 
|  | 43 |  |  |  |  | 246 |  | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  | } | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | ## 2 | 
| 275 |  |  |  |  |  |  |  | 
| 276 | 11 |  |  |  |  | 40 | HILITE: for my $q (@kworder) { | 
| 277 |  |  |  |  |  |  |  | 
| 278 | 29 |  |  |  |  | 81 | my %uniq_reals = (); | 
| 279 | 29 |  |  |  |  | 46 | $uniq_reals{$_}++ for @{ $q2real->{$q} }; | 
|  | 29 |  |  |  |  | 182 |  | 
| 280 |  |  |  |  |  |  |  | 
| 281 | 29 |  |  |  |  | 102 | REAL: for my $real ( keys %uniq_reals ) { | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 35 |  |  |  |  | 149 | $self->_add_hilite_tags( \$text, $q, $real ); | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | } | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | } | 
| 288 |  |  |  |  |  |  |  | 
| 289 | 11 |  |  |  |  | 206 | return $text; | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | sub _add_hilite_tags { | 
| 293 | 35 |  |  | 35 |  | 72 | my $self  = shift; | 
| 294 | 35 |  |  |  |  | 62 | my $text  = shift;    # reference | 
| 295 | 35 |  |  |  |  | 64 | my $query = shift; | 
| 296 | 35 |  |  |  |  | 84 | my $html  = shift; | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | # $text is reference to original text | 
| 299 |  |  |  |  |  |  | # $html is the real html that matched our regexp | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | # we still check boundaries just to be safe | 
| 302 | 35 |  |  |  |  | 219 | my $st_bound  = $self->query->qp->start_bound; | 
| 303 | 35 |  |  |  |  | 108 | my $end_bound = $self->query->qp->end_bound; | 
| 304 |  |  |  |  |  |  |  | 
| 305 | 35 |  |  |  |  | 98 | my $o = $self->open_tag($query); | 
| 306 | 35 |  |  |  |  | 104 | my $c = $self->close_tag($query); | 
| 307 |  |  |  |  |  |  |  | 
| 308 | 35 |  |  |  |  | 90 | my $safe = quotemeta($html); | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  | # pre-fix nested tags in match | 
| 311 | 35 |  |  |  |  | 60 | my $pre_fixed = $html; | 
| 312 | 35 |  |  |  |  | 131 | my $tag_re    = $self->query->qp->tag_re; | 
| 313 | 35 |  |  |  |  | 482 | my $pre_added = $pre_fixed =~ s(${tag_re}+)$c$1$og; | 
| 314 | 35 |  |  |  |  | 131 | my $len_added = length( $c . $o ) * $pre_added; | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | # should be same as length( $to_hilite) - length( $prefixed ); | 
| 317 | 35 |  |  |  |  | 108 | my $len_diff = ( length($html) - length($pre_fixed) ); | 
| 318 | 35 | 100 |  |  |  | 98 | $len_diff *= -1 | 
| 319 |  |  |  |  |  |  | if $len_diff < 0;    # pre_added might be -1 if no subs were made | 
| 320 | 35 | 50 |  |  |  | 85 | if ( $len_diff != $len_added ) { | 
| 321 | 0 |  |  |  |  | 0 | carp "length math failed!" | 
| 322 |  |  |  |  |  |  | . "len_diff = $len_diff\nlen_added = $len_added"; | 
| 323 |  |  |  |  |  |  | } | 
| 324 |  |  |  |  |  |  |  | 
| 325 | 35 |  |  |  |  | 30873 | while ( $$text =~ m/($st_bound)($safe)($end_bound)/g ) { | 
| 326 | 43 |  |  |  |  | 489 | my $s = $1; | 
| 327 | 43 |  |  |  |  | 111 | my $m = $2; | 
| 328 | 43 |  |  |  |  | 83 | my $e = $3; | 
| 329 | 43 | 50 |  |  |  | 1269 | if ( $self->debug > 1 ) { | 
| 330 | 0 |  |  |  |  | 0 | carp "matched:\n'$s'\n'$m'\n'$e'\n" | 
| 331 |  |  |  |  |  |  | . "\$1 is " | 
| 332 |  |  |  |  |  |  | . ord($s) | 
| 333 |  |  |  |  |  |  | . "\$3 is " | 
| 334 |  |  |  |  |  |  | . ord($e); | 
| 335 |  |  |  |  |  |  | } | 
| 336 |  |  |  |  |  |  |  | 
| 337 |  |  |  |  |  |  | # use substr to do what s// would normally do if pos() wasn't an issue | 
| 338 |  |  |  |  |  |  | # -- is this a big speed hit? | 
| 339 | 43 |  |  |  |  | 456 | my $len       = length( $s . $m . $e ); | 
| 340 | 43 |  |  |  |  | 206 | my $pos       = pos($$text); | 
| 341 | 43 |  |  |  |  | 154 | my $newstring = $s . $o . $pre_fixed . $c . $e; | 
| 342 | 43 |  |  |  |  | 415 | substr( $$text, $pos - $len, $len, $newstring ); | 
| 343 |  |  |  |  |  |  |  | 
| 344 | 43 |  |  |  |  | 2402 | pos($$text) = $pos + length( $o . $c ) + $len_added - 1; | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | # adjust for new text added | 
| 347 |  |  |  |  |  |  | # $pre_fixed is the hard bit, since we must take $len_added into account | 
| 348 |  |  |  |  |  |  | # move back 1 to reconsider $3 as next $1 | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | #		warn "pos was $pos\nnow ", pos( $html ), "\n"; | 
| 351 |  |  |  |  |  |  | #		warn "new: '$html'\n"; | 
| 352 |  |  |  |  |  |  | #		warn "new text: '$newstring'\n"; | 
| 353 |  |  |  |  |  |  | #		warn "first chars of new pos are '", substr( $html, pos($html), 10 ), "'\n"; | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | } | 
| 356 |  |  |  |  |  |  |  | 
| 357 | 35 |  |  |  |  | 165 | $self->_clean_up_hilites( $text, $query, $o, $c, $safe ); | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | } | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  | # no algorithm is perfect. fix it as best we can. | 
| 362 |  |  |  |  |  |  | sub _clean_up_hilites { | 
| 363 |  |  |  |  |  |  |  | 
| 364 | 35 |  |  | 35 |  | 83 | my $self = shift; | 
| 365 | 35 |  |  |  |  | 108 | my ( $text, $query, $o, $c, $safe ) = @_; | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | # empty hilites are useless | 
| 368 | 35 |  | 100 |  |  | 2250 | my $empty = ( $$text =~ s,\Q$o$c\E,,sgi ) || 0; | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | #$self->debug and carp "looking for split entities: (&[\\w#]*)\Q$o\E(?:\Q$c\E)(${safe})\Q$c\E([\\w#]*;)"; | 
| 371 |  |  |  |  |  |  |  | 
| 372 |  |  |  |  |  |  | # to be safe: in some cases we might match against entities or within tag content. | 
| 373 | 35 |  | 50 |  |  | 2684 | my $ent_split = ( | 
| 374 |  |  |  |  |  |  | $$text | 
| 375 |  |  |  |  |  |  | =~ s/(&[\w#]*)\Q$o\E(?:\Q$c\E)?(${safe})\Q$c\E([\w#]*;)/$1$2$3/igs # is i and s necessary? | 
| 376 |  |  |  |  |  |  | ) || 0; | 
| 377 |  |  |  |  |  |  |  | 
| 378 |  |  |  |  |  |  | #$self->debug and carp "found $ent_split split entities"; | 
| 379 |  |  |  |  |  |  |  | 
| 380 | 35 |  |  |  |  | 121 | my $tag_split = 0; | 
| 381 | 35 |  |  |  |  | 2200 | while ( | 
| 382 |  |  |  |  |  |  | $$text | 
| 383 |  |  |  |  |  |  | =~ m/(<[^<>]*)\Q$o\E($safe)\Q$c\E([^>]*>)/gxsi # are these xsi flags necessary? | 
| 384 |  |  |  |  |  |  | ) | 
| 385 |  |  |  |  |  |  | { | 
| 386 |  |  |  |  |  |  |  | 
| 387 | 4 |  |  |  |  | 18 | my $first  = $1; | 
| 388 | 4 |  |  |  |  | 10 | my $second = $2; | 
| 389 | 4 |  |  |  |  | 10 | my $third  = $3; | 
| 390 | 4 | 50 |  |  |  | 103 | carp "appears to split tag: $first - $second - $third" | 
| 391 |  |  |  |  |  |  | if $self->debug > 1; | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  | # TODO this would be one place to highlight text where attributes match | 
| 394 |  |  |  |  |  |  |  | 
| 395 | 4 |  |  |  |  | 562 | $tag_split += ( | 
| 396 |  |  |  |  |  |  | $$text =~ s/(<[^<>]*)\Q$o\E($safe)\Q$c\E([^>]*>)/$1$2$3/gxsi ); | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | } | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | } | 
| 401 |  |  |  |  |  |  |  | 
| 402 |  |  |  |  |  |  | sub html_stemmer { | 
| 403 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 404 | 0 |  |  |  |  | 0 | my $text = shift; | 
| 405 | 0 |  |  |  |  | 0 | return $self->plain_stemmer($text); | 
| 406 |  |  |  |  |  |  | } | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | sub plain_stemmer { | 
| 409 | 1 |  |  | 1 | 1 | 2 | my $self  = shift; | 
| 410 | 1 | 50 |  |  |  | 3 | my $text  = shift or croak "need text"; | 
| 411 | 1 |  |  |  |  | 37 | my $debug = $self->debug; | 
| 412 |  |  |  |  |  |  |  | 
| 413 | 1 |  |  |  |  | 9 | my @kworder = $self->_kworder; | 
| 414 |  |  |  |  |  |  |  | 
| 415 |  |  |  |  |  |  | # if stemmer is on, we must stem each token to look for a match | 
| 416 | 1 |  |  |  |  | 6 | my $qre = $self->query->terms_as_regex(1); | 
| 417 | 1 |  |  |  |  | 4 | $qre =~ s/(\\ )+/\|/g;    # TODO OR phrases together if (0) above? | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 1 |  |  |  |  | 142 | my $re          = qr/^$qre$/; | 
| 420 | 1 |  |  |  |  | 11 | my $stemmer     = $self->query->qp->stemmer; | 
| 421 | 1 |  |  |  |  | 4 | my $qp          = $self->query->qp; | 
| 422 | 1 |  |  |  |  | 3 | my $wildcard    = $qp->wildcard; | 
| 423 |  |  |  |  |  |  | my $heat_seeker = sub { | 
| 424 | 34 |  |  | 34 |  | 66 | my ($token) = @_; | 
| 425 | 34 |  |  |  |  | 86 | my $st = $stemmer->( $qp, $token->str ); | 
| 426 | 34 |  |  |  |  | 490 | return $st =~ m/$re/; | 
| 427 | 1 |  |  |  |  | 6 | }; | 
| 428 |  |  |  |  |  |  |  | 
| 429 | 1 |  |  |  |  | 23 | my $tokens = $self->{_tokenizer}->tokenize( $text, $heat_seeker ); | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | # create a new string | 
| 432 | 1 |  |  |  |  | 2 | my $buf; | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | # iterate over tokens, looking for any hot ones, | 
| 435 |  |  |  |  |  |  | # and create a new string | 
| 436 | 1 |  |  |  |  | 20 | TOK: while ( my $tok = $tokens->next ) { | 
| 437 | 69 |  |  |  |  | 123 | my $str = $tok->str; | 
| 438 | 69 | 100 |  |  |  | 139 | if ( $tok->is_hot ) { | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | # find the matching query term | 
| 441 |  |  |  |  |  |  |  | 
| 442 | 3 |  |  |  |  | 7 | my $stemmed = $stemmer->( $qp, $str ); | 
| 443 | 3 |  |  |  |  | 24 | my $found_match = 0; | 
| 444 | 3 |  |  |  |  | 7 | Q: for my $query (@kworder) { | 
| 445 | 4 |  |  |  |  | 11 | my $regex = $self->_regex_for($query); | 
| 446 | 4 |  |  |  |  | 5 | my @regex_to_try; | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | # if it is a phrase, try each term in the phrase | 
| 449 | 4 | 100 |  |  |  | 16 | if ( $regex->is_phrase ) { | 
| 450 | 3 |  |  |  |  | 5 | @regex_to_try = @{ $regex->phrase_terms }; | 
|  | 3 |  |  |  |  | 10 |  | 
| 451 |  |  |  |  |  |  | } | 
| 452 |  |  |  |  |  |  | else { | 
| 453 | 1 |  |  |  |  | 2 | @regex_to_try = ($regex); | 
| 454 |  |  |  |  |  |  | } | 
| 455 | 4 |  |  |  |  | 7 | REGEX: for my $r (@regex_to_try) { | 
| 456 | 6 |  |  |  |  | 14 | my $term_re = $r->term_re; | 
| 457 | 6 | 50 |  |  |  | 12 | $debug | 
| 458 |  |  |  |  |  |  | and warn | 
| 459 |  |  |  |  |  |  | "testing '$stemmed' against '$query' with '$term_re'"; | 
| 460 | 6 | 100 |  |  |  | 33 | if ( $stemmed =~ m/$term_re/ ) { | 
| 461 | 3 |  |  |  |  | 9 | my $open  = $self->open_tag($query); | 
| 462 | 3 |  |  |  |  | 7 | my $close = $self->close_tag($query); | 
| 463 | 3 | 50 |  |  |  | 16 | $debug and warn "$str is hot with match '$query'"; | 
| 464 | 3 |  |  |  |  | 11 | $str         = $open . $str . $close; | 
| 465 | 3 |  |  |  |  | 5 | $found_match = 1; | 
| 466 | 3 |  |  |  |  | 7 | last Q; | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | } | 
| 470 |  |  |  |  |  |  | } | 
| 471 |  |  |  |  |  |  |  | 
| 472 | 3 | 50 |  |  |  | 8 | if ( !$found_match ) { | 
| 473 |  |  |  |  |  |  |  | 
| 474 |  |  |  |  |  |  | # common case is phrases? | 
| 475 | 0 | 0 |  |  |  | 0 | $debug and warn "failed to find match for '$stemmed'"; | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  | } | 
| 478 |  |  |  |  |  |  | } | 
| 479 | 69 |  |  |  |  | 218 | $buf .= $str; | 
| 480 |  |  |  |  |  |  | } | 
| 481 | 1 |  |  |  |  | 86 | return $buf; | 
| 482 |  |  |  |  |  |  | } | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  | # based on HTML::HiLiter plaintext() | 
| 485 |  |  |  |  |  |  | sub plain { | 
| 486 | 13 |  |  | 13 | 1 | 32 | my $self      = shift; | 
| 487 | 13 | 50 |  |  |  | 87 | my $text      = shift or croak "need text to light()"; | 
| 488 | 13 |  |  |  |  | 431 | my $debug     = $self->debug; | 
| 489 | 13 |  |  |  |  | 139 | my $query_obj = $self->{query}; | 
| 490 | 13 |  |  |  |  | 68 | my @kworder   = $self->_kworder; | 
| 491 |  |  |  |  |  |  |  | 
| 492 | 13 |  |  |  |  | 35 | my $i = 0; | 
| 493 | 13 |  |  |  |  | 37 | my @markers; | 
| 494 | 13 |  |  |  |  | 38 | Q: for my $query (@kworder) { | 
| 495 | 31 |  |  |  |  | 97 | my $regex   = $self->_regex_for($query); | 
| 496 | 31 |  |  |  |  | 114 | my $re      = $regex->plain; | 
| 497 | 31 |  |  |  |  | 95 | my $term_re = $regex->term_re; | 
| 498 | 31 |  |  |  |  | 99 | my $open    = $self->open_tag($query); | 
| 499 | 31 |  |  |  |  | 117 | my $close   = $self->close_tag($query); | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  | # use open/close markers rather than actual html tags | 
| 502 |  |  |  |  |  |  | # because we do not want to get double matches on text | 
| 503 |  |  |  |  |  |  | # like 'span' or 'style' | 
| 504 | 31 |  |  |  |  | 108 | my $o             = chr($i) . "\002"; | 
| 505 | 31 |  |  |  |  | 253 | my $c             = chr($i) . "\003"; | 
| 506 | 31 |  |  |  |  | 92 | my $length_we_add = length( $o . $c ) - 1; | 
| 507 | 31 |  |  |  |  | 94 | push @markers, [ $open, $close ]; | 
| 508 |  |  |  |  |  |  |  | 
| 509 |  |  |  |  |  |  | # cache this | 
| 510 | 31 |  | 66 |  |  | 199 | my $query_re = $self->{_compiled_query_regex}->{"$query"} | 
| 511 |  |  |  |  |  |  | || quotemeta($query); | 
| 512 | 31 | 100 |  |  |  | 168 | if ( !$self->{_compiled_query_regex}->{"$query"} ) { | 
| 513 | 30 |  |  |  |  | 525 | $self->{_compiled_query_regex}->{"$query"} = qr/$query_re/; | 
| 514 |  |  |  |  |  |  | } | 
| 515 |  |  |  |  |  |  |  | 
| 516 | 31 | 50 |  |  |  | 128 | $debug > 1 | 
| 517 |  |  |  |  |  |  | and carp | 
| 518 |  |  |  |  |  |  | "plain hiliter looking for: $re against '$query' in '$text'"; | 
| 519 |  |  |  |  |  |  |  | 
| 520 |  |  |  |  |  |  | # because s/// fails to find duplicate instances like 'foo foo' | 
| 521 |  |  |  |  |  |  | # we use a while loop and increment pos() | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | # this can suck into an infinite loop because increm pos()-- results | 
| 524 |  |  |  |  |  |  | # in repeated match on nonwordchar: > (since we just added a tag) | 
| 525 |  |  |  |  |  |  |  | 
| 526 | 31 | 50 |  |  |  | 87 | if ($debug) { | 
| 527 | 0 | 0 | 0 |  |  | 0 | if ( $text =~ m/\b$query_re\b/i && $text !~ m/$re/i ) { | 
| 528 | 0 |  |  |  |  | 0 | my ($snip) = ( $text =~ m/(.....$query_re.....)/gi ); | 
| 529 | 0 |  |  |  |  | 0 | croak "bad regex for '$query' [$snip]: $re"; | 
| 530 |  |  |  |  |  |  | } | 
| 531 |  |  |  |  |  |  | } | 
| 532 |  |  |  |  |  |  |  | 
| 533 | 31 |  |  |  |  | 58 | my $found_matches = 0; | 
| 534 | 31 |  |  |  |  | 1394 | while ( $text =~ m/$re/g ) { | 
| 535 |  |  |  |  |  |  |  | 
| 536 | 31 |  | 100 |  |  | 152 | my $s = $1 || ''; | 
| 537 | 31 |  | 33 |  |  | 111 | my $m = $2 || $query; | 
| 538 | 31 |  | 50 |  |  | 106 | my $e = $3 || ''; | 
| 539 |  |  |  |  |  |  |  | 
| 540 | 31 |  |  |  |  | 62 | $found_matches++; | 
| 541 |  |  |  |  |  |  |  | 
| 542 | 31 | 50 |  |  |  | 78 | $debug > 1 and carp "matched $s $m $e against $re"; | 
| 543 |  |  |  |  |  |  |  | 
| 544 |  |  |  |  |  |  | # use substr to do what s/// would normally do | 
| 545 |  |  |  |  |  |  | # if pos() wasn't an issue -- is this a big speed diff? | 
| 546 | 31 |  |  |  |  | 91 | my $len = length( $s . $m . $e ); | 
| 547 | 31 |  |  |  |  | 197 | my $pos = pos($text); | 
| 548 | 31 | 50 |  |  |  | 96 | $debug > 1 and carp "pos==$pos  len==$len"; | 
| 549 | 31 |  |  |  |  | 114 | my $newstring = $s . $o . $m . $c . $e; | 
| 550 | 31 |  |  |  |  | 320 | substr( $text, $pos - $len, $len, $newstring ); | 
| 551 |  |  |  |  |  |  |  | 
| 552 | 31 | 50 |  |  |  | 146 | last if $pos == length $text; | 
| 553 |  |  |  |  |  |  |  | 
| 554 |  |  |  |  |  |  | # need to account for all the new chars we just added | 
| 555 | 31 |  |  |  |  | 90 | pos($text) = $pos + $length_we_add; | 
| 556 | 31 | 50 |  |  |  | 1493 | $debug > 1 | 
| 557 |  |  |  |  |  |  | and carp "length_we_add==$length_we_add  pos==" . pos($text); | 
| 558 |  |  |  |  |  |  |  | 
| 559 |  |  |  |  |  |  | } | 
| 560 |  |  |  |  |  |  |  | 
| 561 | 31 | 50 |  |  |  | 96 | $debug and warn "found $found_matches matches"; | 
| 562 |  |  |  |  |  |  |  | 
| 563 |  |  |  |  |  |  | # sanity check similar to Snipper->_re_snip() | 
| 564 | 31 | 0 | 33 |  |  | 90 | if ( $debug and !$found_matches and $text =~ m/$query_re/ ) { | 
|  |  |  | 33 |  |  |  |  | 
| 565 | 0 | 0 |  |  |  | 0 | $debug and warn "ERROR: regex failure for '$query'"; | 
| 566 | 0 |  |  |  |  | 0 | $text = $self->html($text); | 
| 567 |  |  |  |  |  |  | } | 
| 568 |  |  |  |  |  |  |  | 
| 569 |  |  |  |  |  |  | # increment the marker | 
| 570 | 31 |  |  |  |  | 80 | $i++; | 
| 571 |  |  |  |  |  |  |  | 
| 572 |  |  |  |  |  |  | } | 
| 573 |  |  |  |  |  |  |  | 
| 574 |  |  |  |  |  |  | # now our markers replaced with actual tags | 
| 575 | 13 |  |  |  |  | 31 | $i = 0; | 
| 576 | 13 |  |  |  |  | 38 | for my $set (@markers) { | 
| 577 | 31 |  |  |  |  | 75 | my $ichr = quotemeta( chr($i) ); | 
| 578 | 31 |  |  |  |  | 442 | $text =~ s/$ichr\002/$set->[0]/g; | 
| 579 | 31 |  |  |  |  | 390 | $text =~ s/$ichr\003/$set->[1]/g; | 
| 580 | 31 |  |  |  |  | 94 | $i++; | 
| 581 |  |  |  |  |  |  | } | 
| 582 |  |  |  |  |  |  |  | 
| 583 |  |  |  |  |  |  | #warn "plain done"; | 
| 584 |  |  |  |  |  |  |  | 
| 585 | 13 |  |  |  |  | 142 | return $text; | 
| 586 |  |  |  |  |  |  |  | 
| 587 |  |  |  |  |  |  | } | 
| 588 |  |  |  |  |  |  |  | 
| 589 |  |  |  |  |  |  | 1; | 
| 590 |  |  |  |  |  |  | __END__ |