File Coverage

blib/lib/KSx/Highlight/Summarizer.pm

Criterion	Covered	Total	%
statement	26	153	16.9
branch	2	62	3.2
condition	0	33	0.0
subroutine	7	11	63.6
pod	3	3	100.0
total	38	262	14.5

line	stmt	bran	cond	sub	pod	time	code
1							package KSx::Highlight::Summarizer;
2
3							$VERSION = '0.06';
4
5							@ISA = KinoSearch::Highlight::Highlighter;
6	1			1		1500	use KinoSearch::Highlight::Highlighter;
	1					3
	1					57
7
8	1			1		6	use strict;
	1					2
	1					44
9
10	1			1		24	use List::Util qw 'min';
	1					2
	1					169
11	1			1		1073	use Number::Range;
	1					26897
	1					76
12
13	1			1		1509	use Hash::Util::FieldHash::Compat 'fieldhashes';
	1					25608
	1					10
14							fieldhashes \my( %ellipsis, %summ_len, %page_h, %encoder );
15
16							sub _range_endpoints {
17	0			0		0	my $range = shift;
18	0					0	my @range = $range->range;
19	0					0	my $previous = shift @range;
20	0					0	my $subrange = [($previous) x 2];
21	0					0	my @arrays;
22	0					0	foreach my $current (@range) {
23	0	0				0	if ($current == ($previous + 1)) {
24	0					0	$subrange->[1] = $current;
25							}
26							else {
27	0					0	push @arrays, $subrange;
28	0					0	$subrange = [($current) x 2];
29							}
30	0					0	$previous = $current;
31							}
32	0					0	return @arrays, $subrange; # Make sure the last subrange isn’t left out!
33							}
34
35							sub new {
36	1			1	1	31123	my($pack, %args) = @_;
37	1	50				10	my $ellipsis = exists $args{ellipsis} ? delete $args{ellipsis}
38							: ' ... ';
39	1	50				4	my $summ_len = exists $args{summary_length}
40							? delete $args{summary_length} : 0;
41	1					4	my $page_h = delete $args{page_handler};
42	1					3	my $encoder = delete $args{encoder};
43
44							# accept args that the superclass only allows one to set through
45							# accessor methods:
46	1					3	my $pre_tag = delete $args{pre_tag};
47	1					2	my $post_tag = delete $args{post_tag};
48
49	1					201	my $self = SUPER::new $pack %args;
50
51	0						$ellipsis{$self} = $ellipsis;
52	0						$summ_len{$self} = $summ_len;
53	0						$page_h{$self} = $page_h;
54	0						$encoder{$self} = $encoder;
55
56	0	0					defined $pre_tag and $self->set_pre_tag($pre_tag);
57	0	0					defined $post_tag and $self->set_post_tag($post_tag);
58
59	0						return $self;
60							}
61
62							sub create_excerpt {
63	0			0	1		my ($self, $hitdoc) = @_;
64
65	0						my $field = $self->get_field;
66	0						my $x_len = $self->get_excerpt_length;
67	0						my $limit = int($x_len /3 );
68
69							# retrieve the text from the chosen field
70	0						my $text = $hitdoc->{$field};
71	0	0					return unless defined $text;
72	0						my $text_length = length $text;
73	0	0					return '' unless $text_length;
74
75							# get offsets and weights of words that match
76	0						my $searcher = $self->get_searchable;
77	0						my $posits = $self->get_compiler->highlight_spans(
78							searchable => $searcher,
79							field => $field,
80							doc_vec => $searcher->fetch_doc_vec(
81							$hitdoc->get_doc_id
82							),
83							);
84	0						my @locs = map [$_->get_offset,$_->get_weight], @{
85	0						KinoSearch::Highlight::HeatMap->new(
86							spans => $posits,
87							window => $limit*2
88							)->get_spans
89							};
90	0						@locs = map $$_[0], sort { $$b[1] <=> $$a[1] } @locs;
	0
91
92	0	0					@locs or @locs = 0;
93
94							#warn "@locs" if $summ_len{$self};
95							# determine the rough boundaries of the excerpts
96	0						my $range = new Number::Range;
97	0						my $summ_len = $summ_len{$self};
98	0						for(@locs) {
99	1			1		1316	no warnings; # suppress Number::Range’s nasty warnings
	1					2
	1					1701
100	0						my $start = $_-$limit;
101	0	0					$start = 0 if $start < 0;
102	0						$range->addrange($start . '..' . min($start+$x_len, $text_length));
103	0	0	0				last if !$summ_len \|\| $range->size >= $summ_len;
104							}
105	0						my @excerpt_bounds = _range_endpoints($range);
106							#use DDS; warn Dump \@excerpt_bounds if $summ_len;
107
108							# close small gaps between ranges
109	0						for(my $c = 1; $c < @excerpt_bounds;++$c) {
110	0	0					$excerpt_bounds[$c][0] - $excerpt_bounds[$c-1][1] <= 10 and
111							$excerpt_bounds[$c-1][1] = $excerpt_bounds[$c][1],
112							splice(@excerpt_bounds, $c, 1),
113							--$c;
114							}
115
116							# extract the offsets from the highlight spans
117	0						my(@starts, @ends);
118	0						for(@$posits) {
119	0						push(@starts, my $start = $_->get_offset);
120	0						push(@ends, $start + $_->get_length);
121							}
122
123							# make the summary
124	0						my $summary = '';
125	0						my $ellipsis = $ellipsis{$self};
126	0						my $token_re = qr/\b\w+(?:'\w+)?\b/;
127	0						my $prev_ellipsis; # whether the previous excerpt ended with an ellip.
128	0						my $prev_page = 0; # last page number of previous excerpt
129	0						my $page_h = $page_h{$self};
130	0						for(@excerpt_bounds) {
131							# make the excerpt
132	0						my ($start,$end) = @$_;
133
134							# determine the page number that $start falls within
135	0						my $page_no;
136	0	0					$page_h and $page_no =
137							substr($text, 0,$start) =~ y/\014// + 1;
138
139	0						my $x; # short for x-cerpt
140							my $need_ellipsis;
141
142							#warn "<<".substr($text,$start,$limit).">>";
143							# look for a page break within $limit chars from $start (except we
144							# shouldn’t do it if $start is 0 because there’s a good chance
145							# we’ll go past the very word for whose sake this excerpt exists)
146							# ~~~ What about a case in which a page break plus maybe a few
147							# spaces occur just before $start. That shouldn’t get an
148							# ellipsis (as in the elsif block below), should it?
149	0	0	0				if($page_h && $start &&
		0	0
150							substr($text,$start,$limit) =~ /^(.*)\014/s) {
151	0						$start += length($1) + 1;
152	0						$page_no += 1 + $1 =~ y/\014//;
153	0						$x = substr $text, $start;
154							}
155							elsif( $start ) { # if this is not the beginning of the doc
156	0						my $sb = $self->find_sentences(
157							text => $text, offset => $start, length => $limit
158							);
159	0	0					if(@$sb) {
160	0						$start = $$sb[0];
161							}
162	0						else { ++ $need_ellipsis }
163	0						$x = substr $text, $start;
164	0	0					if($need_ellipsis) {
165							# skip past possible partial tokens, but don’t insert an
166							# ellipsis yet, because it might need to come after a
167							# page marker
168	0	0					if ($x =~ s/
169							\A
170							(
171							.{1,$limit}? # don't go outside the window
172							)
173							(?=$token_re) # just b4 the start of a full token
174							//xsm
175							)
176							{
177	0						$start += length($1);
178							}
179							}
180							}
181	0						else { $x = substr $text, $start }
182
183							# trim unwanted text from the end of the excerpt
184	0						$x = substr $x, 0, $end-$start+1; # +1 ’cos we need that extra
185							# char later
186	0						my $end_with_ellipsis = 0;
187
188							# if we’ve trimmed the end of the text
189	0	0					if ( $end < $text_length) {{ # doubled so ‘last’ will work
190							# check to see whether there are page breaks after the high-
191							# lighted word, and stop at the first one if so
192	0	0	0				if ($page_h and substr($x, $limit-2) =~ s/(\014[^\014])//) {
	0
193	0						$end -= length $1; last;
	0
194							}
195
196							# remove possible partial tokens from the end of the excerpt
197	0						my $extra_char = chop $x; # the char we left dangling earlier
198							# if the extra char wasn't part of a token, then we’re not
199							# splitting one
200	0	0					if ( $extra_char =~ $token_re ) {
201	0						$x =~ s/$token_re$//; # if this fails, that's fine
202							}
203
204							# if the excerpt doesn't end with a full stop, end with
205							# an ellipsis
206	0	0					if ( $x !~ /\.\s*\Z/xsm ) {
207	0						$x =~ s/\W+\Z//xsm;
208	0						$x .= $ellipsis;
209	0						++$end_with_ellipsis;
210							}
211							}}
212							#warn $x if $page_h;
213
214							# get the offsets that are within range for the excerpt, and make
215							# them relative to $start
216	0						my @relative_starts = map $_-$start, @starts;
217	0						my @relative_ends = map $_-$start, @ends;
218	0						my $this_x_len = $end - $start;
219	0		0				while ( @relative_starts and $relative_starts[0] < 0 ) {
220	0						shift @relative_starts;
221	0						shift @relative_ends;
222							}
223	0		0				while ( @relative_ends and $relative_ends[-1] > $this_x_len ) {
224	0						pop @relative_starts;
225	0						pop @relative_ends;
226							}
227
228							# insert highlight tags and page break markers
229							# sstart and send stand for span start and end
230	0						my ( $sstart, $send, $last_sstart, $last_send ) =
231							( undef, undef, 0, 0 );
232	0	0					if($page_h) { # Some of this code is repeated redundantly, but it
233							# should theoretically run faster since the
234							# if($page_h) check doesn’t have to be made every
235							# time through the loop.
236	0	0	0				$prev_page != $page_no
			0
237							? (
238							$summary .= &$page_h($hitdoc, $page_no),
239							$need_ellipsis && ($summary .= $ellipsis)
240							) : $need_ellipsis && !$prev_ellipsis &&
241							($summary .= $ellipsis)
242							;
243	0						while (@relative_starts) {
244	0						$send = shift @relative_ends;
245	0						$sstart = shift @relative_starts;
246	0	0	0				$summary .= _encode_with_pb( $self,
247							substr( $x, $last_send, $sstart - $last_send ),
248							$page_h, \$page_no, $hitdoc
249							) unless !$last_send && !$sstart;
250	0						$summary .= $self->highlight(
251							_encode_with_pb( $self,
252							substr( $x, $sstart, $send - $sstart ),
253							$page_h, \$page_no, $hitdoc
254							)
255							);
256	0						$last_send = $send;
257							}
258	0	0					$summary .= _encode_with_pb( $self,
259							substr( $x, $last_send ),
260							$page_h, \$page_no, $hitdoc
261							) unless $last_send == length $x;
262	0						$prev_page = $page_no;
263							}
264							else {
265	0	0	0				$need_ellipsis and !$prev_ellipsis and $summary .= $ellipsis;
266	0						while (@relative_starts) {
267	0						$send = shift @relative_ends;
268	0						$sstart = shift @relative_starts;
269	0	0	0				$summary .= $self->encode(
270							substr( $x, $last_send, $sstart - $last_send ) )
271							unless !$last_send && !$sstart;
272	0						$summary .= $self->highlight(
273							$self->encode(
274							substr( $x, $sstart, $send - $sstart )
275							)
276							);
277	0						$last_send = $send;
278							}
279	0	0					$summary .= $self->encode( substr( $x, $last_send ) )
280							unless $last_send == length $x;
281							}
282
283	0						$prev_ellipsis = $end_with_ellipsis;
284
285							}
286
287	0						return $summary;
288							}
289
290							# This is not called as a method above, because it’s a private routine that
291							# should not be overridden (it is not guaranteed to exist in future ver-
292							# sions), and it’s faster to call it as a function.
293							sub _encode_with_pb { # w/page breaks
294	0			0			my ($self, $text, $page_h, $page_no_ref, $hitdoc) = @_;
295	0						my @to_encode = split /\014/, $text, -1; # -1 to allow trailing
296	0						my $ret = ''; # null fields
297	0	0					$ret .= $self->encode(shift @to_encode) if length $to_encode[0];
298	0						for(@to_encode) {
299	0						$ret .= &$page_h($hitdoc, ++$$page_no_ref);
300	0	0					$ret .= $self->encode($_) if length;
301							}
302	0						$ret;
303							}
304
305							sub encode {
306	0			0	1		my @__ = @_; # workaround for perl5.8.8 bug
307							&{
308	0	0					$encoder{$__[0]} or return shift(@__)->SUPER::encode(@__)
	0
309							}($__[1])
310							}
311
312							1;
313
314							__END__