File Coverage

blib/lib/Search/Tools/HiLiter.pm

Criterion	Covered	Total	%
statement	252	281	89.6
branch	61	98	62.2
condition	21	42	50.0
subroutine	21	26	80.7
pod	10	10	100.0
total	365	457	79.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::HiLiter;
2	16			16		10917	use Moo;
	16					35873
	16					90
3							extends 'Search::Tools::Object';
4							with 'Search::Tools::ArgNormalizer';
5	16			16		6883	use Carp;
	16					22
	16					825
6	16			16		1371	use Search::Tools::Tokenizer;
	16					22
	16					360
7	16			16		1592	use Search::Tools::XML;
	16					23
	16					325
8	16			16		57	use Search::Tools::UTF8;
	16					20
	16					1332
9	16			16		55	use Data::Dump qw( dump );
	16					18
	16					563
10
11	16			16		52	use namespace::autoclean;
	16					20
	16					97
12
13							our $VERSION = '1.004';
14
15							my $XML = Search::Tools::XML->new;
16
17							my @attrs = qw(
18							query
19							tag
20							class
21							style
22							text_color
23							colors
24							tty
25							ttycolors
26							no_html
27							);
28
29							for my $attr (@attrs) {
30							has $attr => ( is => 'rw' );
31							}
32
33							sub BUILD {
34	23			23	1	405	my $self = shift;
35
36	23	50				377	if ( $self->debug ) {
37	0					0	carp "debug level set at " . $self->debug;
38							}
39
40	23					511	$self->{_tokenizer} = Search::Tools::Tokenizer->new(
41							re => $self->query->qp->term_re,
42							debug => $self->debug,
43							);
44
45	23		100			178	$self->{tag} ||= 'span';
46	23		50			122	$self->{colors} ||= [ '#ffff99', '#99ffff', '#ffccff', '#ccccff' ];
47	23		50			117	$self->{ttycolors} ||= [ 'bold blue', 'bold red', 'bold green' ];
48
49	23	100				82	if ( $self->tty ) {
50	7					10	eval { require Term::ANSIColor };
	7					1826
51	7	50				13679	$self->tty(0) if $@;
52							}
53
54	23					82	$self->_build_tags;
55							}
56
57							sub terms {
58	0			0	1	0	return shift->{query}->terms;
59							}
60
61							sub keywords {
62	0			0	1	0	return @{ shift->terms };
	0					0
63							}
64
65							sub _phrases {
66	0			0		0	my $self = shift;
67	0					0	my $q = $self->{query};
68	0					0	return grep { $self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
69							}
70
71							sub _singles {
72	0			0		0	my $self = shift;
73	0					0	my $q = $self->{query};
74	0					0	return grep { !$self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
75							}
76
77							sub _kworder {
78	48			48		51	my $self = shift;
79	48					56	my $q = $self->{query};
80	48					96	my $qstr = $q->str;
81	48	100				136	if ( exists $self->{_kworder_cache}->{$qstr} ) {
82	25					1725	return @{ $self->{_kworder_cache}->{$qstr} };
	25					97
83							}
84
85							# do phrases first so that duplicates privilege phrases
86	23					28	my ( @phrases, @singles );
87
88	23					24	for ( @{ $q->terms } ) {
	23					76
89	60	100				109	if ( $self->_regex_for($_)->is_phrase ) {
90	15					29	push @phrases, $_;
91							}
92							else {
93	45					80	push @singles, $_;
94							}
95							}
96
97	23					70	$self->{_kworder_cache}->{$qstr} = [ @phrases, @singles ];
98
99	23					64	return ( @phrases, @singles );
100							}
101
102							sub _build_tags {
103	23			23		31	my $self = shift;
104
105	23					35	my $t = {};
106	23					33	my @colors = @{ $self->colors };
	23					95
107	23					31	my @ttycolors = @{ $self->ttycolors };
	23					67
108	23					49	my $tag = $self->tag;
109
110	23					28	my $n = 0;
111	23					26	my $m = 0;
112
113	23					65	for my $q ( $self->_kworder ) {
114
115							# if tty flag is on, use ansicolor instead of html
116							# if debug flag is on, use both html and ansicolor
117
118	60					52	my ( %tags, $opener );
119	60					95	$tags{open} = '';
120	60					70	$tags{close} = '';
121	60	100				223	if ( $self->class ) {
		50
		100
122	10					29	$opener = qq/<$tag class='/ . $self->class . qq/'>/;
123							}
124							elsif ( $self->style ) {
125	0					0	$opener = qq/<$tag style='/ . $self->style . qq/'>/;
126							}
127							elsif ( $self->text_color ) {
128	6					11	$opener
129							= qq/<$tag style='color:/
130							. $self->text_color
131							. qq/;background:/
132							. $colors[$n] . qq/'>/;
133							}
134							else {
135	44					100	$opener = qq/<$tag style='background:/ . $colors[$n] . qq/'>/;
136							}
137
138	60	100				104	if ( $self->tty ) {
139	15	50	33			301	$tags{open} .= $opener if $self->debug && !$self->no_html;
140	15					104	$tags{open} .= Term::ANSIColor::color( $ttycolors[$m] );
141	15					271	$tags{close} .= Term::ANSIColor::color('reset');
142	15	50	33			381	$tags{close} .= "</$tag>" if $self->debug && !$self->no_html;
143							}
144							else {
145	45					63	$tags{open} .= $opener;
146	45					67	$tags{close} .= "</$tag>";
147							}
148
149	60					158	$t->{$q} = \%tags;
150
151	60	100				126	$n = 0 if ++$n > $#colors;
152	60	100				150	$m = 0 if ++$m > $#ttycolors;
153							}
154
155	23					408	$self->{_tags} = $t;
156							}
157
158							sub open_tag {
159	69			69	1	58	my $self = shift;
160	69	50				134	my $q = shift or croak "need query to get open_tag";
161	69		50			215	return $self->{_tags}->{$q}->{open} || '';
162							}
163
164							sub close_tag {
165	69			69	1	59	my $self = shift;
166	69	50				107	my $q = shift or croak "need query to get close_tag";
167	69		50			170	return $self->{_tags}->{$q}->{close} || '';
168							}
169
170							sub light {
171	25			25	1	4240	my $self = shift;
172	25	50				75	my $text = shift or return '';
173
174							# force upgrade. this is so regex will match ok.
175	25					85	$text = to_utf8($text);
176
177	25	100	66			115	if ( $XML->looks_like_html($text) && !$self->no_html ) {
178
179							#warn "running ->html";
180	11	50				63	if ( $self->query->qp->stemmer ) {
181	0					0	return $self->html_stemmer($text);
182							}
183	11					177	return $self->html($text);
184							}
185							else {
186
187							#warn "running ->plain";
188	14	100				96	if ( $self->query->qp->stemmer ) {
189	1					2	return $self->plain_stemmer($text);
190							}
191	13					42	return $self->plain($text);
192							}
193							}
194
195							*hilite = \&light;
196
197							sub _get_real_html {
198	29			29		27	my $self = shift;
199	29					19	my $text = shift;
200	29					27	my $re = shift;
201	29					28	my $m = {};
202	29	50				601	my $debug = $self->debug > 1 ? 1 : 0;
203
204							# $1 should be st_bound, $2 should be query, $3 should be end_bound
205							# N.B. The XS version of this algorithm is only a hair faster,
206							# since the $re is the bottleneck.
207	29					33273	while ( $$text =~ m/$re/g ) {
208
209	43					131	my $pos = pos($$text);
210
211	43	50				1848	if ($debug) {
212	0					0	carp "$2 matches $re";
213	0					0	carp "\$1='$1'\n\$2='$2'\n\$3='$3'\npos=$pos";
214							}
215
216	43					117	$m->{$2}++;
217
218							# move back and consider $3 again as possible $1 for next match
219	43	50				98	if ( length($3) ) {
220	43					30916	pos($$text) = $pos - 1;
221							}
222
223							}
224
225	29					57	return $m;
226
227							}
228
229							sub _regex_for {
230	124			124		106	my $self = shift;
231	124	50				239	my $term = shift or croak "term required";
232	124	100				256	if ( exists $self->{_regex_for}->{$term} ) {
233	64					160	return $self->{_regex_for}->{$term};
234							}
235	60					180	$self->{_regex_for}->{$term} = $self->query->regex_for($term);
236	60					184	return $self->{_regex_for}->{$term};
237							}
238
239							# based on HTML::HiLiter hilite()
240							sub html {
241	11			11	1	17	my $self = shift;
242	11	50				27	my $text = shift or croak "need text to light()";
243
244							###################################################################
245							# 1. create hash of query -> [ array of real HTML to hilite ]
246							# using the prebuilt regexp
247							# 2. hilite the real HTML
248							###################################################################
249
250							## 1
251
252	11					15	my $q2real = {};
253
254							# this is going to be query => [ real_html ]
255
256							# if the query text matched in the text, then we need to
257							# use our prebuilt regexp
258	11					23	my @kworder = $self->_kworder;
259
260							# don't consider anything we've marked
261							# with a 'nohiliter' attribute
262	11					19	my $text_copy = $text;
263	11					127	$text_copy =~ s/\002.*?\003//sgi;
264
265	11					21	Q: for my $query (@kworder) {
266	29					63	my $re = $self->_regex_for($query)->html;
267	29					57	my $real = $self->_get_real_html( \$text_copy, $re );
268
269	29					99	R: for my $r ( keys %$real ) {
270	35					87	push( @{ $q2real->{$query} }, $r ) while $real->{$r}--;
	43					199
271							}
272							}
273
274							## 2
275
276	11					22	HILITE: for my $q (@kworder) {
277
278	29					46	my %uniq_reals = ();
279	29					29	$uniq_reals{$_}++ for @{ $q2real->{$q} };
	29					181
280
281	29					61	REAL: for my $real ( keys %uniq_reals ) {
282
283	35					95	$self->_add_hilite_tags( \$text, $q, $real );
284
285							}
286
287							}
288
289	11					102	return $text;
290							}
291
292							sub _add_hilite_tags {
293	35			35		45	my $self = shift;
294	35					32	my $text = shift; # reference
295	35					34	my $query = shift;
296	35					35	my $html = shift;
297
298							# $text is reference to original text
299							# $html is the real html that matched our regexp
300
301							# we still check boundaries just to be safe
302	35					147	my $st_bound = $self->query->qp->start_bound;
303	35					69	my $end_bound = $self->query->qp->end_bound;
304
305	35					68	my $o = $self->open_tag($query);
306	35					64	my $c = $self->close_tag($query);
307
308	35					50	my $safe = quotemeta($html);
309
310							# pre-fix nested tags in match
311	35					34	my $pre_fixed = $html;
312	35					87	my $tag_re = $self->query->qp->tag_re;
313	35					335	my $pre_added = $pre_fixed =~ s/(${tag_re}+)/$c$1$o/g;
314	35					65	my $len_added = length( $c . $o ) * $pre_added;
315
316							# should be same as length( $to_hilite) - length( $prefixed );
317	35					64	my $len_diff = ( length($html) - length($pre_fixed) );
318	35	100				67	$len_diff *= -1
319							if $len_diff < 0; # pre_added might be -1 if no subs were made
320	35	50				70	if ( $len_diff != $len_added ) {
321	0					0	carp "length math failed!"
322							. "len_diff = $len_diff\nlen_added = $len_added";
323							}
324
325	35					25753	while ( $$text =~ m/($st_bound)($safe)($end_bound)/g ) {
326	42					974	my $s = $1;
327	42					58	my $m = $2;
328	42					43	my $e = $3;
329	42	50				1065	if ( $self->debug > 1 ) {
330	0					0	carp "matched:\n'$s'\n'$m'\n'$e'\n"
331							. "\$1 is "
332							. ord($s)
333							. "\$3 is "
334							. ord($e);
335							}
336
337							# use substr to do what s// would normally do if pos() wasn't an issue
338							# -- is this a big speed hit?
339	42					314	my $len = length( $s . $m . $e );
340	42					140	my $pos = pos($$text);
341	42					100	my $newstring = $s . $o . $pre_fixed . $c . $e;
342	42					307	substr( $$text, $pos - $len, $len, $newstring );
343
344	42					2078	pos($$text) = $pos + length( $o . $c ) + $len_added - 1;
345
346							# adjust for new text added
347							# $pre_fixed is the hard bit, since we must take $len_added into account
348							# move back 1 to reconsider $3 as next $1
349
350							# warn "pos was $pos\nnow ", pos( $html ), "\n";
351							# warn "new: '$html'\n";
352							# warn "new text: '$newstring'\n";
353							# warn "first chars of new pos are '", substr( $html, pos($html), 10 ), "'\n";
354
355							}
356
357	35					184	$self->_clean_up_hilites( $text, $query, $o, $c, $safe );
358
359							}
360
361							# no algorithm is perfect. fix it as best we can.
362							sub _clean_up_hilites {
363
364	35			35		40	my $self = shift;
365	35					84	my ( $text, $query, $o, $c, $safe ) = @_;
366
367							# empty hilites are useless
368	35		100			1913	my $empty = ( $$text =~ s,\Q$o$c\E,,sgi ) || 0;
369
370							#$self->debug and carp "looking for split entities: (&[\\w#])\Q$o\E(?:\Q$c\E)(${safe})\Q$c\E([\\w#];)";
371
372							# to be safe: in some cases we might match against entities or within tag content.
373	35		50			1958	my $ent_split = (
374							$$text
375							=~ s/(&[\w#])\Q$o\E(?:\Q$c\E)?(${safe})\Q$c\E([\w#];)/$1$2$3/igs # is i and s necessary?
376							) || 0;
377
378							#$self->debug and carp "found $ent_split split entities";
379
380	35					48	my $tag_split = 0;
381	35					1761	while (
382							$$text
383							=~ m/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/gxsi # are these xsi flags necessary?
384							)
385							{
386
387	4					13	my $first = $1;
388	4					7	my $second = $2;
389	4					7	my $third = $3;
390	4	50				92	carp "appears to split tag: $first - $second - $third"
391							if $self->debug > 1;
392
393							# TODO this would be one place to highlight text where attributes match
394
395	4					472	$tag_split += (
396							$$text =~ s/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/$1$2$3/gxsi );
397
398							}
399
400							}
401
402							sub html_stemmer {
403	0			0	1	0	my $self = shift;
404	0					0	my $text = shift;
405	0					0	return $self->plain_stemmer($text);
406							}
407
408							sub plain_stemmer {
409	1			1	1	1	my $self = shift;
410	1	50				2	my $text = shift or croak "need text";
411	1					21	my $debug = $self->debug;
412
413	1					5	my @kworder = $self->_kworder;
414
415							# if stemmer is on, we must stem each token to look for a match
416	1					3	my $qre = $self->query->terms_as_regex(1);
417	1					3	$qre =~ s/(\\ )+/\|/g; # TODO OR phrases together if (0) above?
418
419	1					77	my $re = qr/^$qre$/;
420	1					6	my $stemmer = $self->query->qp->stemmer;
421	1					3	my $qp = $self->query->qp;
422	1					3	my $wildcard = $qp->wildcard;
423							my $heat_seeker = sub {
424	34			34		25	my ($token) = @_;
425	34					57	my $st = $stemmer->( $qp, $token->str );
426	34					309	return $st =~ m/$re/;
427	1					3	};
428
429	1					14	my $tokens = $self->{_tokenizer}->tokenize( $text, $heat_seeker );
430
431							# create a new string
432	1					1	my $buf;
433
434							# iterate over tokens, looking for any hot ones,
435							# and create a new string
436	1					6	TOK: while ( my $tok = $tokens->next ) {
437	69					72	my $str = $tok->str;
438	69	100				101	if ( $tok->is_hot ) {
439
440							# find the matching query term
441
442	3					5	my $stemmed = $stemmer->( $qp, $str );
443	3					10	my $found_match = 0;
444	3					3	Q: for my $query (@kworder) {
445	4					5	my $regex = $self->_regex_for($query);
446	4					4	my @regex_to_try;
447
448							# if it is a phrase, try each term in the phrase
449	4	100				7	if ( $regex->is_phrase ) {
450	3					3	@regex_to_try = @{ $regex->phrase_terms };
	3					6
451							}
452							else {
453	1					2	@regex_to_try = ($regex);
454							}
455	4					5	REGEX: for my $r (@regex_to_try) {
456	6					8	my $term_re = $r->term_re;
457	6	50				8	$debug
458							and warn
459							"testing '$stemmed' against '$query' with '$term_re'";
460	6	100				24	if ( $stemmed =~ m/$term_re/ ) {
461	3					4	my $open = $self->open_tag($query);
462	3					5	my $close = $self->close_tag($query);
463	3	50				8	$debug and warn "$str is hot with match '$query'";
464	3					6	$str = $open . $str . $close;
465	3					3	$found_match = 1;
466	3					4	last Q;
467							}
468
469							}
470							}
471
472	3	50				4	if ( !$found_match ) {
473
474							# common case is phrases?
475	0	0				0	$debug and warn "failed to find match for '$stemmed'";
476
477							}
478							}
479	69					158	$buf .= $str;
480							}
481	1					66	return $buf;
482							}
483
484							# based on HTML::HiLiter plaintext()
485							sub plain {
486	13			13	1	14	my $self = shift;
487	13	50				34	my $text = shift or croak "need text to light()";
488	13					324	my $debug = $self->debug;
489	13					67	my $query_obj = $self->{query};
490	13					35	my @kworder = $self->_kworder;
491
492	13					16	my $i = 0;
493	13					17	my @markers;
494	13					26	Q: for my $query (@kworder) {
495	31					60	my $regex = $self->_regex_for($query);
496	31					1310	my $re = $regex->plain;
497	31					48	my $term_re = $regex->term_re;
498	31					54	my $open = $self->open_tag($query);
499	31					49	my $close = $self->close_tag($query);
500
501							# use open/close markers rather than actual html tags
502							# because we do not want to get double matches on text
503							# like 'span' or 'style'
504	31					66	my $o = chr($i) . "\002";
505	31					42	my $c = chr($i) . "\003";
506	31					43	my $length_we_add = length( $o . $c ) - 1;
507	31					58	push @markers, [ $open, $close ];
508
509							# cache this
510	31		66			173	my $query_re = $self->{_compiled_query_regex}->{"$query"}
511							|| quotemeta($query);
512	31	100				61	if ( !$self->{_compiled_query_regex}->{"$query"} ) {
513	30					316	$self->{_compiled_query_regex}->{"$query"} = qr/$query_re/;
514							}
515
516	31	50				69	$debug > 1
517							and carp
518							"plain hiliter looking for: $re against '$query' in '$text'";
519
520							# because s/// fails to find duplicate instances like 'foo foo'
521							# we use a while loop and increment pos()
522
523							# this can suck into an infinite loop because increm pos()-- results
524							# in repeated match on nonwordchar: > (since we just added a tag)
525
526	31	50				68	if ($debug) {
527	0	0	0			0	if ( $text =~ m/\b$query_re\b/i && $text !~ m/$re/i ) {
528	0					0	my ($snip) = ( $text =~ m/(.....$query_re.....)/gi );
529	0					0	croak "bad regex for '$query' [$snip]: $re";
530							}
531							}
532
533	31					24	my $found_matches = 0;
534	31					1463	while ( $text =~ m/$re/g ) {
535
536	31		100			84	my $s = $1 || '';
537	31		33			72	my $m = $2 || $query;
538	31		50			60	my $e = $3 || '';
539
540	31					26	$found_matches++;
541
542	31	50				52	$debug > 1 and carp "matched $s $m $e against $re";
543
544							# use substr to do what s/// would normally do
545							# if pos() wasn't an issue -- is this a big speed diff?
546	31					60	my $len = length( $s . $m . $e );
547	31					61	my $pos = pos($text);
548	31	50				51	$debug > 1 and carp "pos==$pos len==$len";
549	31					57	my $newstring = $s . $o . $m . $c . $e;
550	31					96	substr( $text, $pos - $len, $len, $newstring );
551
552	31	50				89	last if $pos == length $text;
553
554							# need to account for all the new chars we just added
555	31					54	pos($text) = $pos + $length_we_add;
556	31	50				1374	$debug > 1
557							and carp "length_we_add==$length_we_add pos==" . pos($text);
558
559							}
560
561	31	50				51	$debug and warn "found $found_matches matches";
562
563							# sanity check similar to Snipper->_re_snip()
564	31	0	33			134	if ( $debug and !$found_matches and $text =~ m/$query_re/ ) {
			33
565	0	0				0	$debug and warn "ERROR: regex failure for '$query'";
566	0					0	$text = $self->html($text);
567							}
568
569							# increment the marker
570	31					50	$i++;
571
572							}
573
574							# now our markers replaced with actual tags
575	13					32	$i = 0;
576	13					23	for my $set (@markers) {
577	31					131	my $ichr = quotemeta( chr($i) );
578	31					294	$text =~ s/$ichr\002/$set->[0]/g;
579	31					243	$text =~ s/$ichr\003/$set->[1]/g;
580	31					48	$i++;
581							}
582
583							#warn "plain done";
584
585	13					87	return $text;
586
587							}
588
589							1;
590							__END__