File Coverage

blib/lib/KinoSearch1/Highlight/Highlighter.pm

Criterion	Covered	Total	%
statement	162	177	91.5
branch	39	56	69.6
condition	4	6	66.6
subroutine	12	12	100.0
pod	1	2	50.0
total	218	253	86.1

line	stmt	bran	cond	sub	pod	time	code
1							package KinoSearch1::Highlight::Highlighter;
2	18			18		103	use strict;
	18					44
	18					667
3	18			18		99	use warnings;
	18					45
	18					453
4	18			18		127	use KinoSearch1::Util::ToolSet;
	18					36
	18					2828
5	18			18		104	use base qw( KinoSearch1::Util::Class );
	18					68
	18					1578
6	18			18		11460	use locale;
	18					2752
	18					126
7
8							BEGIN {
9	18			18		1841	__PACKAGE__->init_instance_vars(
10							# constructor params / members
11							excerpt_field => undef,
12							analyzer => undef,
13							formatter => undef,
14							encoder => undef,
15							terms => undef,
16							excerpt_length => 200,
17							pre_tag => undef, # back compat
18							post_tag => undef, # back compat
19							token_re => qr/\b\w+(?:'\w+)?\b/,
20
21							# members
22							limit => undef,
23							);
24	18					181	__PACKAGE__->ready_get_set(qw( terms ));
25							}
26
27	18			18		10961	use KinoSearch1::Highlight::SimpleHTMLFormatter;
	18					51
	18					463
28	18			18		10697	use KinoSearch1::Highlight::SimpleHTMLEncoder;
	18					52
	18					52856
29
30							sub init_instance {
31	1			1	1	199	my $self = shift;
32	1	50				177	croak("Missing required arg 'excerpt_field'")
33							unless defined $self->{excerpt_field};
34	1					3	$self->{terms} = [];
35
36							# assume HTML
37	1	50				174	if ( !defined $self->{encoder} ) {
38	1					11	$self->{encoder} = KinoSearch1::Highlight::SimpleHTMLEncoder->new;
39							}
40	1	50				7	if ( !defined $self->{formatter} ) {
41	1					3	my ( $pre_tag, $post_tag ) = @{$self}{qw( pre_tag post_tag )};
	1					3
42	1	50				5	$pre_tag = '' unless defined $pre_tag;
43	1	50				4	$post_tag = '' unless defined $post_tag;
44	1					17	$self->{formatter} = KinoSearch1::Highlight::SimpleHTMLFormatter->new(
45							pre_tag => $pre_tag,
46							post_tag => $post_tag,
47							);
48							}
49
50							# scoring window is 1.66 * excerpt_length, with the loc in the middle
51	1					5	$self->{limit} = int( $self->{excerpt_length} / 3 );
52							}
53
54							sub generate_excerpt {
55	4			4	0	8	my ( $self, $doc ) = @_;
56	4					11	my $excerpt_length = $self->{excerpt_length};
57	4					8	my $limit = $self->{limit};
58	4					8	my $token_re = $self->{token_re};
59
60							# retrieve the text from the chosen field
61	4					17	my $field = $doc->get_field( $self->{excerpt_field} );
62	4					13	my $text = $field->get_value;
63	4					18	my $text_length = bytes::length $text;
64	4	50				21	return '' unless $text_length;
65
66							# determine the rough boundaries of the excerpt
67	4					15	my $posits = $self->_starts_and_ends($field);
68	4					14	my $best_location = $self->_calc_best_location($posits);
69	4					10	my $top = $best_location - $limit;
70
71							# expand the excerpt if the best location is near the end
72	4	100				10	$top
73							= $text_length - $excerpt_length < $top
74							? $text_length - $excerpt_length
75							: $top;
76
77							# if the best starting point is the very beginning, cool...
78	4	100				10	if ( $top <= 0 ) {
79	2					4	$top = 0;
80							}
81							# ... otherwise ...
82							else {
83							# lop off $top bytes
84	2					9	$text = bytes::substr( $text, $top );
85
86							# try to start the excerpt at a sentence boundary
87	2	50				74	if ($text =~ s/
88							\A
89							(
90							\C{0,$limit}?
91							\.\s+
92							)
93							//xsm
94							)
95							{
96	0					0	$top += bytes::length($1);
97							}
98							# no sentence boundary, so we'll need an ellipsis
99							else {
100							# skip past possible partial tokens, prepend an ellipsis
101	2	50				54	if ($text =~ s/
102							\A
103							(
104							\C{0,$limit}? # don't go outside the window
105							$token_re # match possible partial token
106							.*? # ... and any junk following that token
107							)
108							(?=$token_re) # just before the start of a full token...
109							/... /xsm # ... insert an ellipsis
110							)
111							{
112	2					7	$top += bytes::length($1);
113	2					11	$top -= 4 # three dots and a space
114							}
115							}
116							}
117
118							# remove possible partial tokens from the end of the excerpt
119	4					16	$text = bytes::substr( $text, 0, $excerpt_length + 1 );
120	4	100				28	if ( bytes::length($text) > $excerpt_length ) {
121	2					10	my $extra_char = chop $text;
122							# if the extra char wasn't part of a token, we aren't splitting one
123	2	50				16	if ( $extra_char =~ $token_re ) {
124	2					93	$text =~ s/$token_re$//; # if this is unsuccessful, that's fine
125							}
126							}
127
128							# if the excerpt doesn't end with a full stop, end with an an ellipsis
129	4	100				40	if ( $text !~ /\.\s*\Z/xsm ) {
130	3					40	$text =~ s/\W+\Z//xsm;
131	3					10	while ( bytes::length($text) + 4 > $excerpt_length ) {
132	4					18	my $extra_char = chop $text;
133	4	50				29	if ( $extra_char =~ $token_re ) {
134	4					151	$text =~ s/\W+$token_re\Z//xsm; # if unsuccessful, that's fine
135							}
136	4					73	$text =~ s/\W+\Z//xsm;
137							}
138	3					17	$text .= ' ...';
139							}
140
141							# remap locations now that we know the starting and ending bytes
142	4					11	$text_length = bytes::length($text);
143	4					17	my @relative_starts = map { $_->[0] - $top } @$posits;
	5					13
144	4					8	my @relative_ends = map { $_->[1] - $top } @$posits;
	5					10
145
146							# get rid of pairs with at least one member outside the text
147	4		66			25	while ( @relative_starts and $relative_starts[0] < 0 ) {
148	0					0	shift @relative_starts;
149	0					0	shift @relative_ends;
150							}
151	4		66			22	while ( @relative_ends and $relative_ends[-1] > $text_length ) {
152	0					0	pop @relative_starts;
153	0					0	pop @relative_ends;
154							}
155
156							# insert highlight tags
157	4					9	my $formatter = $self->{formatter};
158	4					10	my $encoder = $self->{encoder};
159	4					6	my $output_text = '';
160	4					8	my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
161	4					11	while (@relative_starts) {
162	5					7	$end = shift @relative_ends;
163	5					6	$start = shift @relative_starts;
164	5					14	$output_text .= $encoder->encode(
165							bytes::substr( $text, $last_end, $start - $last_end ) );
166	5					18	$output_text
167							.= $formatter->highlight(
168							$encoder->encode( bytes::substr( $text, $start, $end - $start ) )
169							);
170	5					13	$last_end = $end;
171							}
172	4					12	$output_text .= $encoder->encode( bytes::substr( $text, $last_end ) );
173
174	4					25	return $output_text;
175							}
176
177							=for comment
178							Find all points in the text where a relevant term begins and ends. For terms
179							that are part of a phrase, only include points that are part of the phrase.
180
181							=cut
182
183							sub _starts_and_ends {
184	4			4		5	my ( $self, $field ) = @_;
185	4					6	my @posits;
186							my %done;
187
188	4					6	TERM: for my $term ( @{ $self->{terms} } ) {
	4					12
189	16	100				39	if ( a_isa_b( $term, 'KinoSearch1::Index::Term' ) ) {
190	10					33	my $term_text = $term->get_text;
191
192	10	100				33	next TERM if $done{$term_text};
193	5					10	$done{$term_text} = 1;
194
195							# add all starts and ends
196	5					17	my $term_vector = $field->term_vector($term_text);
197	5	100				15	next TERM unless defined $term_vector;
198	4					12	my $starts = $term_vector->get_start_offsets;
199	4					10	my $ends = $term_vector->get_end_offsets;
200	4					10	while (@$starts) {
201	4					22	push @posits, [ shift @$starts, shift @$ends, 1 ];
202							}
203							}
204							# intersect positions for phrase terms
205							else {
206							# if not a Term, it's an array of Terms representing a phrase
207	6					10	my @term_texts = map { $_->get_text } @$term;
	18					40
208
209	6					17	my $phrase_text = join( ' ', @term_texts );
210	6	100				19	next TERM if $done{$phrase_text};
211	3					7	$done{$phrase_text} = 1;
212
213	3					42	my $posit_vec = KinoSearch1::Util::BitVector->new;
214	3					6	my @term_vectors = map { $field->term_vector($_) } @term_texts;
	9					21
215
216							# make sure all terms are present
217	3	100				16	next TERM unless scalar @term_vectors == scalar @term_texts;
218
219	2					4	my $i = 0;
220	2					4	for my $tv (@term_vectors) {
221							# one term missing, ergo no phrase
222	6	50				14	next TERM unless defined $tv;
223	6	100				13	if ( $i == 0 ) {
224	2					5	$posit_vec->set( @{ $tv->get_positions } );
	2					7
225							}
226							else {
227							# filter positions using logical "and"
228	4					27	my $other_posit_vec = KinoSearch1::Util::BitVector->new;
229	4					15	$other_posit_vec->set(
230	4					9	grep { $_ >= 0 }
231	4					10	map { $_ - $i } @{ $tv->get_positions }
	4					11
232							);
233	4					33	$posit_vec->logical_and($other_posit_vec);
234							}
235	6					11	$i++;
236							}
237
238							# add only those starts/ends that belong to a valid position
239	2					8	my $tv_start_positions = $term_vectors[0]->get_positions;
240	2					9	my $tv_starts = $term_vectors[0]->get_start_offsets;
241	2					6	my $tv_end_positions = $term_vectors[-1]->get_positions;
242	2					6	my $tv_ends = $term_vectors[-1]->get_end_offsets;
243	2					5	$i = 0;
244	2					3	my $j = 0;
245	2					3	my $last_token_index = $#term_vectors;
246	2					4	for my $valid_position ( @{ $posit_vec->to_arrayref } ) {
	2					11
247
248	2					9	while ( $i <= $#$tv_start_positions ) {
249	2	50				7	last if ( $tv_start_positions->[$i] >= $valid_position );
250	0					0	$i++;
251							}
252	2					3	$valid_position += $last_token_index;
253	2					5	while ( $j <= $#$tv_end_positions ) {
254	2	50				6	last if ( $tv_end_positions->[$j] >= $valid_position );
255	0					0	$j++;
256							}
257	2					7	push @posits,
258							[ $tv_starts->[$i], $tv_ends->[$j], scalar @$term ];
259	2					4	$i++;
260	2					29	$j++;
261							}
262							}
263							}
264
265							# sort, uniquify and return
266	4	50				13	@posits = sort { $a->[0] <=> $b->[0] \|\| $b->[1] <=> $a->[1] } @posits;
	3					10
267	4					6	my @unique;
268	4					7	my $last = ~0;
269	4					8	for (@posits) {
270	6	100				14	push @unique, $_ if $_->[0] != $last;
271	6					10	$last = $_->[0];
272							}
273	4					12	return \@unique;
274							}
275
276							=for comment
277							Select the byte address representing the greatest keyword density. Because
278							the algorithm counts bytes rather than characters, it will degrade if the
279							number of bytes per character is larger than 1.
280
281							=cut
282
283							sub _calc_best_location {
284	4			4		6	my ( $self, $posits ) = @_;
285	4					6	my $window = $self->{limit} * 2;
286
287							# if there aren't any keywords, take the excerpt from the top of the text
288	4	100				10	return 0 unless @$posits;
289
290	3					3	my %locations = map { ( $_->[0] => 0 ) } @$posits;
	5					19
291
292							# if another keyword is in close proximity, add to the loc's score
293	3					9	for my $loc_index ( 0 .. $#$posits ) {
294							# only score positions that are in range
295	5					8	my $location = $posits->[$loc_index][0];
296	5					7	my $other_loc_index = $loc_index - 1;
297	5					16	while ( $other_loc_index > 0 ) {
298	0					0	my $diff = $location - $posits->[$other_loc_index][0];
299	0	0				0	last if $diff > $window;
300	0					0	my $num_tokens_at_pos = $posits->[$other_loc_index][2];
301	0					0	$locations{$location}
302							+= ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
303	0					0	--$other_loc_index;
304							}
305	5					7	$other_loc_index = $loc_index + 1;
306	5					20	while ( $other_loc_index <= $#$posits ) {
307	2					4	my $diff = $posits->[$other_loc_index] - $location;
308	2	50				9	last if $diff > $window;
309	0					0	my $num_tokens_at_pos = $posits->[$other_loc_index][2];
310	0					0	$locations{$location}
311							+= ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
312	0					0	++$other_loc_index;
313							}
314							}
315
316							# return the highest scoring position
317	3					13	return ( sort { $locations{$b} <=> $locations{$a} } keys %locations )[0];
	2					10
318							}
319
320							1;
321
322							__END__