File Coverage

blib/lib/Search/Tools/HiLiter.pm

Criterion	Covered	Total	%
statement	252	281	89.6
branch	61	98	62.2
condition	21	42	50.0
subroutine	21	26	80.7
pod	10	10	100.0
total	365	457	79.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::HiLiter;
2	16			16		12671	use Moo;
	16					39428
	16					84
3							extends 'Search::Tools::Object';
4							with 'Search::Tools::ArgNormalizer';
5	16			16		8032	use Carp;
	16					32
	16					868
6	16			16		1506	use Search::Tools::Tokenizer;
	16					41
	16					367
7	16			16		1619	use Search::Tools::XML;
	16					35
	16					398
8	16			16		87	use Search::Tools::UTF8;
	16					25
	16					1300
9	16			16		89	use Data::Dump qw( dump );
	16					46
	16					663
10
11	16			16		83	use namespace::autoclean;
	16					29
	16					99
12
13							our $VERSION = '1.006';
14
15							my $XML = Search::Tools::XML->new;
16
17							my @attrs = qw(
18							query
19							tag
20							class
21							style
22							text_color
23							colors
24							tty
25							ttycolors
26							no_html
27							);
28
29							for my $attr (@attrs) {
30							has $attr => ( is => 'rw' );
31							}
32
33							sub BUILD {
34	23			23	1	301	my $self = shift;
35
36	23	50				355	if ( $self->debug ) {
37	0					0	carp "debug level set at " . $self->debug;
38							}
39
40	23					510	$self->{_tokenizer} = Search::Tools::Tokenizer->new(
41							re => $self->query->qp->term_re,
42							debug => $self->debug,
43							);
44
45	23		100			127	$self->{tag} \|\|= 'span';
46	23		50			134	$self->{colors} \|\|= [ '#ffff99', '#99ffff', '#ffccff', '#ccccff' ];
47	23		50			111	$self->{ttycolors} \|\|= [ 'bold blue', 'bold red', 'bold green' ];
48
49	23	100				79	if ( $self->tty ) {
50	7					10	eval { require Term::ANSIColor };
	7					1527
51	7	50				23037	$self->tty(0) if $@;
52							}
53
54	23					72	$self->_build_tags;
55							}
56
57							sub terms {
58	0			0	1	0	return shift->{query}->terms;
59							}
60
61							sub keywords {
62	0			0	1	0	return @{ shift->terms };
	0					0
63							}
64
65							sub _phrases {
66	0			0		0	my $self = shift;
67	0					0	my $q = $self->{query};
68	0					0	return grep { $self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
69							}
70
71							sub _singles {
72	0			0		0	my $self = shift;
73	0					0	my $q = $self->{query};
74	0					0	return grep { !$self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
75							}
76
77							sub _kworder {
78	48			48		83	my $self = shift;
79	48					77	my $q = $self->{query};
80	48					104	my $qstr = $q->str;
81	48	100				139	if ( exists $self->{_kworder_cache}->{$qstr} ) {
82	25					41	return @{ $self->{_kworder_cache}->{$qstr} };
	25					89
83							}
84
85							# do phrases first so that duplicates privilege phrases
86	23					39	my ( @phrases, @singles );
87
88	23					33	for ( @{ $q->terms } ) {
	23					101
89	60	100				121	if ( $self->_regex_for($_)->is_phrase ) {
90	15					35	push @phrases, $_;
91							}
92							else {
93	45					110	push @singles, $_;
94							}
95							}
96
97	23					105	$self->{_kworder_cache}->{$qstr} = [ @phrases, @singles ];
98
99	23					92	return ( @phrases, @singles );
100							}
101
102							sub _build_tags {
103	23			23		39	my $self = shift;
104
105	23					40	my $t = {};
106	23					37	my @colors = @{ $self->colors };
	23					84
107	23					36	my @ttycolors = @{ $self->ttycolors };
	23					68
108	23					54	my $tag = $self->tag;
109
110	23					31	my $n = 0;
111	23					33	my $m = 0;
112
113	23					62	for my $q ( $self->_kworder ) {
114
115							# if tty flag is on, use ansicolor instead of html
116							# if debug flag is on, use both html and ansicolor
117
118	60					86	my ( %tags, $opener );
119	60					122	$tags{open} = '';
120	60					85	$tags{close} = '';
121	60	100				253	if ( $self->class ) {
		50
		100
122	10					28	$opener = qq/<$tag class='/ . $self->class . qq/'>/;
123							}
124							elsif ( $self->style ) {
125	0					0	$opener = qq/<$tag style='/ . $self->style . qq/'>/;
126							}
127							elsif ( $self->text_color ) {
128	6					14	$opener
129							= qq/<$tag style='color:/
130							. $self->text_color
131							. qq/;background:/
132							. $colors[$n] . qq/'>/;
133							}
134							else {
135	44					121	$opener = qq/<$tag style='background:/ . $colors[$n] . qq/'>/;
136							}
137
138	60	100				123	if ( $self->tty ) {
139	15	50	33			234	$tags{open} .= $opener if $self->debug && !$self->no_html;
140	15					94	$tags{open} .= Term::ANSIColor::color( $ttycolors[$m] );
141	15					300	$tags{close} .= Term::ANSIColor::color('reset');
142	15	50	33			371	$tags{close} .= "" if $self->debug && !$self->no_html;
143							}
144							else {
145	45					82	$tags{open} .= $opener;
146	45					88	$tags{close} .= "";
147							}
148
149	60					179	$t->{$q} = \%tags;
150
151	60	100				145	$n = 0 if ++$n > $#colors;
152	60	100				152	$m = 0 if ++$m > $#ttycolors;
153							}
154
155	23					888	$self->{_tags} = $t;
156							}
157
158							sub open_tag {
159	69			69	1	88	my $self = shift;
160	69	50				153	my $q = shift or croak "need query to get open_tag";
161	69		50			238	return $self->{_tags}->{$q}->{open} \|\| '';
162							}
163
164							sub close_tag {
165	69			69	1	130	my $self = shift;
166	69	50				161	my $q = shift or croak "need query to get close_tag";
167	69		50			171	return $self->{_tags}->{$q}->{close} \|\| '';
168							}
169
170							sub light {
171	25			25	1	61	my $self = shift;
172	25	50				78	my $text = shift or return '';
173
174							# force upgrade. this is so regex will match ok.
175	25					113	$text = to_utf8($text);
176
177	25	100	66			159	if ( $XML->looks_like_html($text) && !$self->no_html ) {
178
179							#warn "running ->html";
180	11	50				53	if ( $self->query->qp->stemmer ) {
181	0					0	return $self->html_stemmer($text);
182							}
183	11					26	return $self->html($text);
184							}
185							else {
186
187							#warn "running ->plain";
188	14	100				114	if ( $self->query->qp->stemmer ) {
189	1					3	return $self->plain_stemmer($text);
190							}
191	13					57	return $self->plain($text);
192							}
193							}
194
195							*hilite = \&light;
196
197							sub _get_real_html {
198	29			29		34	my $self = shift;
199	29					42	my $text = shift;
200	29					33	my $re = shift;
201	29					46	my $m = {};
202	29	50				530	my $debug = $self->debug > 1 ? 1 : 0;
203
204							# $1 should be st_bound, $2 should be query, $3 should be end_bound
205							# N.B. The XS version of this algorithm is only a hair faster,
206							# since the $re is the bottleneck.
207	29					31341	while ( $$text =~ m/$re/g ) {
208
209	43					173	my $pos = pos($$text);
210
211	43	50				65	if ($debug) {
212	0					0	carp "$2 matches $re";
213	0					0	carp "\$1='$1'\n\$2='$2'\n\$3='$3'\npos=$pos";
214							}
215
216	43					108	$m->{$2}++;
217
218							# move back and consider $3 again as possible $1 for next match
219	43	50				95	if ( length($3) ) {
220	43					27274	pos($$text) = $pos - 1;
221							}
222
223							}
224
225	29					85	return $m;
226
227							}
228
229							sub _regex_for {
230	124			124		152	my $self = shift;
231	124	50				233	my $term = shift or croak "term required";
232	124	100				289	if ( exists $self->{_regex_for}->{$term} ) {
233	64					163	return $self->{_regex_for}->{$term};
234							}
235	60					211	$self->{_regex_for}->{$term} = $self->query->regex_for($term);
236	60					217	return $self->{_regex_for}->{$term};
237							}
238
239							# based on HTML::HiLiter hilite()
240							sub html {
241	11			11	1	15	my $self = shift;
242	11	50				23	my $text = shift or croak "need text to light()";
243
244							###################################################################
245							# 1. create hash of query -> [ array of real HTML to hilite ]
246							# using the prebuilt regexp
247							# 2. hilite the real HTML
248							###################################################################
249
250							## 1
251
252	11					24	my $q2real = {};
253
254							# this is going to be query => [ real_html ]
255
256							# if the query text matched in the text, then we need to
257							# use our prebuilt regexp
258	11					33	my @kworder = $self->_kworder;
259
260							# don't consider anything we've marked
261							# with a 'nohiliter' attribute
262	11					19	my $text_copy = $text;
263	11					113	$text_copy =~ s/\002.*?\003//sgi;
264
265	11					25	Q: for my $query (@kworder) {
266	29					68	my $re = $self->_regex_for($query)->html;
267	29					61	my $real = $self->_get_real_html( \$text_copy, $re );
268
269	29					93	R: for my $r ( keys %$real ) {
270	35					84	push( @{ $q2real->{$query} }, $r ) while $real->{$r}--;
	43					172
271							}
272							}
273
274							## 2
275
276	11					20	HILITE: for my $q (@kworder) {
277
278	29					56	my %uniq_reals = ();
279	29					33	$uniq_reals{$_}++ for @{ $q2real->{$q} };
	29					122
280
281	29					71	REAL: for my $real ( keys %uniq_reals ) {
282
283	35					114	$self->_add_hilite_tags( \$text, $q, $real );
284
285							}
286
287							}
288
289	11					96	return $text;
290							}
291
292							sub _add_hilite_tags {
293	35			35		47	my $self = shift;
294	35					50	my $text = shift; # reference
295	35					57	my $query = shift;
296	35					39	my $html = shift;
297
298							# $text is reference to original text
299							# $html is the real html that matched our regexp
300
301							# we still check boundaries just to be safe
302	35					150	my $st_bound = $self->query->qp->start_bound;
303	35					75	my $end_bound = $self->query->qp->end_bound;
304
305	35					67	my $o = $self->open_tag($query);
306	35					70	my $c = $self->close_tag($query);
307
308	35					80	my $safe = quotemeta($html);
309
310							# pre-fix nested tags in match
311	35					45	my $pre_fixed = $html;
312	35					91	my $tag_re = $self->query->qp->tag_re;
313	35					279	my $pre_added = $pre_fixed =~ s(${tag_re}+)$c$1$og;
314	35					80	my $len_added = length( $c . $o ) * $pre_added;
315
316							# should be same as length( $to_hilite) - length( $prefixed );
317	35					73	my $len_diff = ( length($html) - length($pre_fixed) );
318	35	100				68	$len_diff *= -1
319							if $len_diff < 0; # pre_added might be -1 if no subs were made
320	35	50				63	if ( $len_diff != $len_added ) {
321	0					0	carp "length math failed!"
322							. "len_diff = $len_diff\nlen_added = $len_added";
323							}
324
325	35					21255	while ( $$text =~ m/($st_bound)($safe)($end_bound)/g ) {
326	43					327	my $s = $1;
327	43					95	my $m = $2;
328	43					64	my $e = $3;
329	43	50				830	if ( $self->debug > 1 ) {
330	0					0	carp "matched:\n'$s'\n'$m'\n'$e'\n"
331							. "\$1 is "
332							. ord($s)
333							. "\$3 is "
334							. ord($e);
335							}
336
337							# use substr to do what s// would normally do if pos() wasn't an issue
338							# -- is this a big speed hit?
339	43					297	my $len = length( $s . $m . $e );
340	43					140	my $pos = pos($$text);
341	43					101	my $newstring = $s . $o . $pre_fixed . $c . $e;
342	43					265	substr( $$text, $pos - $len, $len, $newstring );
343
344	43					1555	pos($$text) = $pos + length( $o . $c ) + $len_added - 1;
345
346							# adjust for new text added
347							# $pre_fixed is the hard bit, since we must take $len_added into account
348							# move back 1 to reconsider $3 as next $1
349
350							# warn "pos was $pos\nnow ", pos( $html ), "\n";
351							# warn "new: '$html'\n";
352							# warn "new text: '$newstring'\n";
353							# warn "first chars of new pos are '", substr( $html, pos($html), 10 ), "'\n";
354
355							}
356
357	35					120	$self->_clean_up_hilites( $text, $query, $o, $c, $safe );
358
359							}
360
361							# no algorithm is perfect. fix it as best we can.
362							sub _clean_up_hilites {
363
364	35			35		66	my $self = shift;
365	35					77	my ( $text, $query, $o, $c, $safe ) = @_;
366
367							# empty hilites are useless
368	35		100			1536	my $empty = ( $$text =~ s,\Q$o$c\E,,sgi ) \|\| 0;
369
370							#$self->debug and carp "looking for split entities: (&[\\w#])\Q$o\E(?:\Q$c\E)(${safe})\Q$c\E([\\w#];)";
371
372							# to be safe: in some cases we might match against entities or within tag content.
373	35		50			1779	my $ent_split = (
374							$$text
375							=~ s/(&[\w#])\Q$o\E(?:\Q$c\E)?(${safe})\Q$c\E([\w#];)/$1$2$3/igs # is i and s necessary?
376							) \|\| 0;
377
378							#$self->debug and carp "found $ent_split split entities";
379
380	35					81	my $tag_split = 0;
381	35					1441	while (
382							$$text
383							=~ m/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/gxsi # are these xsi flags necessary?
384							)
385							{
386
387	4					10	my $first = $1;
388	4					8	my $second = $2;
389	4					5	my $third = $3;
390	4	50				63	carp "appears to split tag: $first - $second - $third"
391							if $self->debug > 1;
392
393							# TODO this would be one place to highlight text where attributes match
394
395	4					350	$tag_split += (
396							$$text =~ s/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/$1$2$3/gxsi );
397
398							}
399
400							}
401
402							sub html_stemmer {
403	0			0	1	0	my $self = shift;
404	0					0	my $text = shift;
405	0					0	return $self->plain_stemmer($text);
406							}
407
408							sub plain_stemmer {
409	1			1	1	1	my $self = shift;
410	1	50				11	my $text = shift or croak "need text";
411	1					27	my $debug = $self->debug;
412
413	1					8	my @kworder = $self->_kworder;
414
415							# if stemmer is on, we must stem each token to look for a match
416	1					5	my $qre = $self->query->terms_as_regex(1);
417	1					3	$qre =~ s/(\\ )+/\\|/g; # TODO OR phrases together if (0) above?
418
419	1					109	my $re = qr/^$qre$/;
420	1					8	my $stemmer = $self->query->qp->stemmer;
421	1					4	my $qp = $self->query->qp;
422	1					4	my $wildcard = $qp->wildcard;
423							my $heat_seeker = sub {
424	34			34		58	my ($token) = @_;
425	34					65	my $st = $stemmer->( $qp, $token->str );
426	34					403	return $st =~ m/$re/;
427	1					4	};
428
429	1					20	my $tokens = $self->{_tokenizer}->tokenize( $text, $heat_seeker );
430
431							# create a new string
432	1					3	my $buf;
433
434							# iterate over tokens, looking for any hot ones,
435							# and create a new string
436	1					8	TOK: while ( my $tok = $tokens->next ) {
437	69					128	my $str = $tok->str;
438	69	100				108	if ( $tok->is_hot ) {
439
440							# find the matching query term
441
442	3					8	my $stemmed = $stemmer->( $qp, $str );
443	3					20	my $found_match = 0;
444	3					4	Q: for my $query (@kworder) {
445	4					9	my $regex = $self->_regex_for($query);
446	4					4	my @regex_to_try;
447
448							# if it is a phrase, try each term in the phrase
449	4	100				12	if ( $regex->is_phrase ) {
450	3					5	@regex_to_try = @{ $regex->phrase_terms };
	3					7
451							}
452							else {
453	1					3	@regex_to_try = ($regex);
454							}
455	4					7	REGEX: for my $r (@regex_to_try) {
456	6					10	my $term_re = $r->term_re;
457	6	50				11	$debug
458							and warn
459							"testing '$stemmed' against '$query' with '$term_re'";
460	6	100				27	if ( $stemmed =~ m/$term_re/ ) {
461	3					6	my $open = $self->open_tag($query);
462	3					8	my $close = $self->close_tag($query);
463	3	50				10	$debug and warn "$str is hot with match '$query'";
464	3					8	$str = $open . $str . $close;
465	3					6	$found_match = 1;
466	3					5	last Q;
467							}
468
469							}
470							}
471
472	3	50				7	if ( !$found_match ) {
473
474							# common case is phrases?
475	0	0				0	$debug and warn "failed to find match for '$stemmed'";
476
477							}
478							}
479	69					189	$buf .= $str;
480							}
481	1					74	return $buf;
482							}
483
484							# based on HTML::HiLiter plaintext()
485							sub plain {
486	13			13	1	31	my $self = shift;
487	13	50				37	my $text = shift or croak "need text to light()";
488	13					290	my $debug = $self->debug;
489	13					121	my $query_obj = $self->{query};
490	13					46	my @kworder = $self->_kworder;
491
492	13					25	my $i = 0;
493	13					29	my @markers;
494	13					31	Q: for my $query (@kworder) {
495	31					69	my $regex = $self->_regex_for($query);
496	31					96	my $re = $regex->plain;
497	31					64	my $term_re = $regex->term_re;
498	31					64	my $open = $self->open_tag($query);
499	31					69	my $close = $self->close_tag($query);
500
501							# use open/close markers rather than actual html tags
502							# because we do not want to get double matches on text
503							# like 'span' or 'style'
504	31					90	my $o = chr($i) . "\002";
505	31					197	my $c = chr($i) . "\003";
506	31					63	my $length_we_add = length( $o . $c ) - 1;
507	31					72	push @markers, [ $open, $close ];
508
509							# cache this
510	31		66			135	my $query_re = $self->{_compiled_query_regex}->{"$query"}
511							\|\| quotemeta($query);
512	31	100				131	if ( !$self->{_compiled_query_regex}->{"$query"} ) {
513	30					383	$self->{_compiled_query_regex}->{"$query"} = qr/$query_re/;
514							}
515
516	31	50				91	$debug > 1
517							and carp
518							"plain hiliter looking for: $re against '$query' in '$text'";
519
520							# because s/// fails to find duplicate instances like 'foo foo'
521							# we use a while loop and increment pos()
522
523							# this can suck into an infinite loop because increm pos()-- results
524							# in repeated match on nonwordchar: > (since we just added a tag)
525
526	31	50				53	if ($debug) {
527	0	0	0			0	if ( $text =~ m/\b$query_re\b/i && $text !~ m/$re/i ) {
528	0					0	my ($snip) = ( $text =~ m/(.....$query_re.....)/gi );
529	0					0	croak "bad regex for '$query' [$snip]: $re";
530							}
531							}
532
533	31					42	my $found_matches = 0;
534	31					1028	while ( $text =~ m/$re/g ) {
535
536	31		100			99	my $s = $1 \|\| '';
537	31		33			82	my $m = $2 \|\| $query;
538	31		50			66	my $e = $3 \|\| '';
539
540	31					46	$found_matches++;
541
542	31	50				53	$debug > 1 and carp "matched $s $m $e against $re";
543
544							# use substr to do what s/// would normally do
545							# if pos() wasn't an issue -- is this a big speed diff?
546	31					85	my $len = length( $s . $m . $e );
547	31					159	my $pos = pos($text);
548	31	50				83	$debug > 1 and carp "pos==$pos len==$len";
549	31					80	my $newstring = $s . $o . $m . $c . $e;
550	31					262	substr( $text, $pos - $len, $len, $newstring );
551
552	31	50				116	last if $pos == length $text;
553
554							# need to account for all the new chars we just added
555	31					90	pos($text) = $pos + $length_we_add;
556	31	50				1112	$debug > 1
557							and carp "length_we_add==$length_we_add pos==" . pos($text);
558
559							}
560
561	31	50				69	$debug and warn "found $found_matches matches";
562
563							# sanity check similar to Snipper->_re_snip()
564	31	0	33			63	if ( $debug and !$found_matches and $text =~ m/$query_re/ ) {
			33
565	0	0				0	$debug and warn "ERROR: regex failure for '$query'";
566	0					0	$text = $self->html($text);
567							}
568
569							# increment the marker
570	31					59	$i++;
571
572							}
573
574							# now our markers replaced with actual tags
575	13					23	$i = 0;
576	13					26	for my $set (@markers) {
577	31					59	my $ichr = quotemeta( chr($i) );
578	31					322	$text =~ s/$ichr\002/$set->[0]/g;
579	31					293	$text =~ s/$ichr\003/$set->[1]/g;
580	31					73	$i++;
581							}
582
583							#warn "plain done";
584
585	13					91	return $text;
586
587							}
588
589							1;
590							__END__