line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::Snipper; |
2
|
16
|
|
|
16
|
|
135963
|
use Moo; |
|
16
|
|
|
|
|
43933
|
|
|
16
|
|
|
|
|
78
|
|
3
|
|
|
|
|
|
|
extends 'Search::Tools::Object'; |
4
|
|
|
|
|
|
|
with 'Search::Tools::ArgNormalizer'; |
5
|
16
|
|
|
16
|
|
8860
|
use Carp; |
|
16
|
|
|
|
|
31
|
|
|
16
|
|
|
|
|
742
|
|
6
|
16
|
|
|
16
|
|
798
|
use Data::Dump qw( dump ); |
|
16
|
|
|
|
|
9171
|
|
|
16
|
|
|
|
|
514
|
|
7
|
16
|
|
|
16
|
|
2456
|
use Search::Tools::XML; |
|
16
|
|
|
|
|
30
|
|
|
16
|
|
|
|
|
411
|
|
8
|
16
|
|
|
16
|
|
84
|
use Search::Tools::UTF8; |
|
16
|
|
|
|
|
24
|
|
|
16
|
|
|
|
|
1509
|
|
9
|
16
|
|
|
16
|
|
3965
|
use Search::Tools::Tokenizer; |
|
16
|
|
|
|
|
39
|
|
|
16
|
|
|
|
|
472
|
|
10
|
16
|
|
|
16
|
|
6349
|
use Search::Tools::HeatMap; |
|
16
|
|
|
|
|
41
|
|
|
16
|
|
|
|
|
438
|
|
11
|
|
|
|
|
|
|
|
12
|
16
|
|
|
16
|
|
97
|
use namespace::autoclean; |
|
16
|
|
|
|
|
26
|
|
|
16
|
|
|
|
|
85
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# extra space here so pmvers works against $VERSION |
17
|
|
|
|
|
|
|
our $ellip = ' ... '; |
18
|
|
|
|
|
|
|
our $DefaultSnipper = 'offset'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# |
21
|
|
|
|
|
|
|
# TODO allow for returning an array ref of |
22
|
|
|
|
|
|
|
# extracts instead of joining them all with $ellip |
23
|
|
|
|
|
|
|
# |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my @attrs = qw( |
26
|
|
|
|
|
|
|
as_sentences |
27
|
|
|
|
|
|
|
collapse_whitespace |
28
|
|
|
|
|
|
|
context |
29
|
|
|
|
|
|
|
count |
30
|
|
|
|
|
|
|
escape |
31
|
|
|
|
|
|
|
force |
32
|
|
|
|
|
|
|
ignore_length |
33
|
|
|
|
|
|
|
max_chars |
34
|
|
|
|
|
|
|
occur |
35
|
|
|
|
|
|
|
query |
36
|
|
|
|
|
|
|
show |
37
|
|
|
|
|
|
|
snipper |
38
|
|
|
|
|
|
|
strip_markup |
39
|
|
|
|
|
|
|
treat_phrases_as_singles |
40
|
|
|
|
|
|
|
type |
41
|
|
|
|
|
|
|
type_used |
42
|
|
|
|
|
|
|
use_pp |
43
|
|
|
|
|
|
|
word_len |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my %Defaults = ( |
47
|
|
|
|
|
|
|
type => $DefaultSnipper, |
48
|
|
|
|
|
|
|
occur => 5, |
49
|
|
|
|
|
|
|
max_chars => 300, |
50
|
|
|
|
|
|
|
context => 8, |
51
|
|
|
|
|
|
|
word_len => 4, # TODO still used? |
52
|
|
|
|
|
|
|
show => 1, |
53
|
|
|
|
|
|
|
collapse_whitespace => 1, |
54
|
|
|
|
|
|
|
escape => 0, |
55
|
|
|
|
|
|
|
force => 0, |
56
|
|
|
|
|
|
|
as_sentences => 0, |
57
|
|
|
|
|
|
|
ignore_length => 0, |
58
|
|
|
|
|
|
|
treat_phrases_as_singles => 1, |
59
|
|
|
|
|
|
|
strip_markup => 0, |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
for my $attr (@attrs) { |
63
|
|
|
|
|
|
|
my $def = $Defaults{$attr} || undef; |
64
|
|
|
|
|
|
|
if ( defined $def ) { |
65
|
|
|
|
|
|
|
has( $attr => ( is => 'rw', default => sub {$def} ) ); |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
else { |
68
|
|
|
|
|
|
|
has( $attr => ( is => 'rw' ) ); |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub BUILD { |
73
|
30
|
|
|
30
|
1
|
132
|
my $self = shift; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
#dump $self; |
76
|
|
|
|
|
|
|
|
77
|
30
|
|
|
|
|
563
|
$self->{_tokenizer} = Search::Tools::Tokenizer->new( |
78
|
|
|
|
|
|
|
re => $self->query->qp->term_re, |
79
|
|
|
|
|
|
|
debug => $self->debug, |
80
|
|
|
|
|
|
|
); |
81
|
|
|
|
|
|
|
|
82
|
30
|
|
|
|
|
136
|
my $wc = $self->query->qp->word_characters; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# regexp for splitting into terms in _re() |
85
|
30
|
|
|
|
|
745
|
$self->{_wc_regexp} = qr/[^$wc]+/io; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
$self->{_qre} |
88
|
30
|
|
|
|
|
191
|
= $self->query->terms_as_regex( $self->treat_phrases_as_singles ); |
89
|
|
|
|
|
|
|
|
90
|
30
|
|
|
|
|
133
|
$self->count(0); |
91
|
|
|
|
|
|
|
|
92
|
30
|
|
|
|
|
1301
|
return $self; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# I tried Text::Context but that was too slow. |
96
|
|
|
|
|
|
|
# Here are several different models. |
97
|
|
|
|
|
|
|
# I have found that _loop() is faster for single-word queries, |
98
|
|
|
|
|
|
|
# while _re() seems to be the best compromise between speed and accuracy. |
99
|
|
|
|
|
|
|
# New in version 0.24 is _token() which is mostly XS and should be best. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub _pick_snipper { |
102
|
31
|
|
|
31
|
|
144
|
my ( $self, $text ) = @_; |
103
|
31
|
|
66
|
|
|
131
|
my $snipper_name = $self->type || $DefaultSnipper; |
104
|
31
|
100
|
|
|
|
175
|
if ( $self->query->qp->stemmer ) { |
105
|
5
|
|
|
|
|
10
|
$snipper_name = 'token'; |
106
|
|
|
|
|
|
|
} |
107
|
31
|
|
|
|
|
75
|
my $method_name = '_' . $snipper_name; |
108
|
31
|
|
|
|
|
95
|
$self->type_used($snipper_name); |
109
|
31
|
|
|
31
|
|
120
|
my $func = sub { shift->$method_name(@_) }; |
|
31
|
|
|
|
|
113
|
|
110
|
31
|
|
|
|
|
112
|
return $func; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# 2 passes, excluding ' ' in the first one, |
114
|
|
|
|
|
|
|
# is 60% faster than a single pass including ' '. |
115
|
|
|
|
|
|
|
# likely because there are far fewer matches |
116
|
|
|
|
|
|
|
# in either of the 2 than the 1. |
117
|
|
|
|
|
|
|
sub _normalize_whitespace { |
118
|
62
|
|
|
62
|
|
2242
|
$_[0] =~ s,[\n\r\t\xa0]+,\ ,go; |
119
|
62
|
|
|
|
|
4911
|
$_[0] =~ s,\ +, ,go; # \ \ + was 16x slower on bigfile!! |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
sub snip { |
123
|
33
|
|
|
33
|
1
|
107
|
my $self = shift; |
124
|
33
|
|
|
|
|
59
|
my $text = shift; |
125
|
33
|
50
|
|
|
|
92
|
if ( !defined $text ) { |
126
|
0
|
|
|
|
|
0
|
croak "text required to snip"; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# normalize encoding, esp for regular expressions. |
130
|
33
|
|
|
|
|
121
|
$text = to_utf8($text); |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# don't snip if we're less than the threshold |
133
|
33
|
100
|
100
|
|
|
639
|
if ( length($text) < $self->max_chars && !$self->ignore_length ) { |
134
|
2
|
50
|
|
|
|
7
|
if ( $self->show ) { |
135
|
2
|
50
|
|
|
|
6
|
if ( $self->strip_markup ) { |
136
|
0
|
|
|
|
|
0
|
return Search::Tools::XML->no_html($text); |
137
|
|
|
|
|
|
|
} |
138
|
2
|
|
|
|
|
10
|
return $text; |
139
|
|
|
|
|
|
|
} |
140
|
0
|
|
|
|
|
0
|
return ''; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
31
|
100
|
|
|
|
98
|
if ( $self->strip_markup ) { |
144
|
1
|
|
|
|
|
8
|
$text = Search::Tools::XML->no_html($text); |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
31
|
50
|
|
|
|
89
|
if ( $self->collapse_whitespace ) { |
148
|
31
|
|
|
|
|
91
|
_normalize_whitespace($text); |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# we calculate the snipper each time since caller |
152
|
|
|
|
|
|
|
# may set type() or snipper() between calls to snip(). |
153
|
31
|
|
33
|
|
|
193
|
my $func = $self->snipper || $self->_pick_snipper($text); |
154
|
|
|
|
|
|
|
|
155
|
31
|
|
|
|
|
72
|
my $s = $func->( $self, $text ); |
156
|
|
|
|
|
|
|
|
157
|
31
|
50
|
|
|
|
626
|
$self->debug and warn "snipped: '$s'\n"; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# sanity check |
160
|
31
|
100
|
100
|
|
|
454
|
if ( length($s) > ( $self->max_chars * 4 ) && !$self->ignore_length ) { |
|
|
100
|
66
|
|
|
|
|
161
|
1
|
|
|
|
|
5
|
$s = $self->_dumb($s); |
162
|
1
|
50
|
|
|
|
23
|
$self->debug and warn "too long. dumb snip: '$s'\n"; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
elsif ( !length($s) && !$self->ignore_length ) { |
165
|
1
|
|
|
|
|
4
|
$s = $self->_dumb($text); |
166
|
1
|
50
|
|
|
|
14
|
$self->debug and warn "too short. dumb snip: '$s'\n"; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# escape entities before collapsing whitespace. |
170
|
31
|
|
|
|
|
100
|
$s = $self->_escape($s); |
171
|
|
|
|
|
|
|
|
172
|
31
|
50
|
|
|
|
88
|
if ( $self->collapse_whitespace ) { |
173
|
31
|
|
|
|
|
62
|
_normalize_whitespace($s); |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
31
|
|
|
|
|
277
|
return $s; |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub _token { |
181
|
30
|
|
|
30
|
|
50
|
my $self = shift; |
182
|
30
|
|
|
|
|
62
|
my $qre = $self->{_qre}; |
183
|
30
|
50
|
|
|
|
610
|
$self->debug and warn "\$qre: $qre"; |
184
|
|
|
|
|
|
|
|
185
|
30
|
100
|
|
|
|
237
|
my $method = ( $self->{use_pp} ) ? 'tokenize_pp' : 'tokenize'; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# must split phrases into OR'd regex or else no heat is generated. |
188
|
30
|
|
|
|
|
52
|
my $qre_ORd = $qre; |
189
|
30
|
|
|
|
|
122
|
$qre_ORd =~ s/(\\ )+/\|/g; |
190
|
30
|
|
|
|
|
964
|
my $heat_seeker = qr/^$qre_ORd$/; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# if stemmer is on, we must stem each token to look for a match |
193
|
30
|
100
|
|
|
|
190
|
if ( $self->query->qp->stemmer ) { |
194
|
5
|
|
|
|
|
16
|
my $stemmer = $self->query->qp->stemmer; |
195
|
5
|
|
|
|
|
10
|
my $qp = $self->query->qp; |
196
|
5
|
|
|
|
|
9
|
my $re = $heat_seeker; |
197
|
|
|
|
|
|
|
$heat_seeker = sub { |
198
|
486
|
|
|
486
|
|
737
|
my ($token) = @_; |
199
|
486
|
|
|
|
|
900
|
my $st = $stemmer->( $qp, $token->str ); |
200
|
486
|
|
|
|
|
5017
|
return $st =~ m/$re/; |
201
|
5
|
|
|
|
|
20
|
}; |
202
|
|
|
|
|
|
|
} |
203
|
30
|
|
|
|
|
22071
|
my $tokens = $self->{_tokenizer}->$method( $_[0], $heat_seeker ); |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
#$self->debug and $tokens->dump; |
206
|
|
|
|
|
|
|
|
207
|
30
|
50
|
|
|
|
75
|
return $self->_dumb( $_[0] ) unless scalar @{ $tokens->get_heat }; |
|
30
|
|
|
|
|
177
|
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
my $heatmap = Search::Tools::HeatMap->new( |
210
|
|
|
|
|
|
|
tokens => $tokens, |
211
|
|
|
|
|
|
|
window_size => $self->{context}, |
212
|
|
|
|
|
|
|
as_sentences => $self->{as_sentences}, |
213
|
|
|
|
|
|
|
debug => $self->debug, |
214
|
|
|
|
|
|
|
_query => $self->query, |
215
|
|
|
|
|
|
|
_qre => $qre, |
216
|
|
|
|
|
|
|
_treat_phrases_as_singles => $self->{treat_phrases_as_singles}, |
217
|
30
|
|
|
|
|
656
|
_stemmer => $self->query->qp->stemmer, |
218
|
|
|
|
|
|
|
); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# reduce noise in debug |
221
|
30
|
|
|
|
|
71
|
delete $heatmap->{_query}; |
222
|
|
|
|
|
|
|
|
223
|
30
|
50
|
|
|
|
655
|
$self->debug and warn "heatmap: " . dump $heatmap; |
224
|
|
|
|
|
|
|
|
225
|
30
|
|
|
|
|
256
|
my $tokens_arr = $tokens->as_array; |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
#warn "snips: " . dump $heatmap->spans; |
228
|
30
|
100
|
|
|
|
91
|
if ( $heatmap->has_spans ) { |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# stringify positions |
231
|
29
|
|
|
|
|
50
|
my @snips; |
232
|
29
|
|
|
|
|
42
|
for my $span ( @{ $heatmap->spans } ) { |
|
29
|
|
|
|
|
120
|
|
233
|
|
|
|
|
|
|
|
234
|
58
|
50
|
|
|
|
824
|
$self->debug and warn '>>>' . $span->{str_w_pos} . '<<<'; |
235
|
58
|
|
|
|
|
329
|
push( @snips, $span->{str} ); |
236
|
|
|
|
|
|
|
} |
237
|
29
|
|
|
|
|
89
|
my $occur_index = $self->occur - 1; |
238
|
29
|
100
|
|
|
|
91
|
if ( $#snips > $occur_index ) { |
239
|
5
|
|
|
|
|
18
|
@snips = @snips[ 0 .. $occur_index ]; |
240
|
|
|
|
|
|
|
} |
241
|
29
|
|
|
|
|
154
|
my $snip = join( $ellip, @snips ); |
242
|
29
|
|
|
|
|
857
|
my $snips_start_with_query = $_[0] =~ m/^\Q$snip\E/; |
243
|
29
|
|
|
|
|
794
|
my $snips_end_with_query = $_[0] =~ m/\Q$snip\E$/; |
244
|
29
|
100
|
|
|
|
115
|
if ( $self->{as_sentences} ) { |
245
|
13
|
|
|
|
|
21
|
$snips_start_with_query = 1; |
246
|
13
|
|
|
|
|
117
|
$snips_end_with_query = $snip =~ m/[\.\?\!]\s*$/; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# if we are pulling out something less than the entire |
250
|
|
|
|
|
|
|
# text, insert ellipses... |
251
|
29
|
100
|
|
|
|
86
|
if ( $_[0] ne $snip ) { |
252
|
25
|
50
|
|
|
|
482
|
$self->debug and warn "extract is smaller than snip"; |
253
|
25
|
100
|
|
|
|
241
|
my $extract = join( '', |
|
|
100
|
|
|
|
|
|
254
|
|
|
|
|
|
|
( $snips_start_with_query ? '' : $ellip ), |
255
|
|
|
|
|
|
|
$snip, ( $snips_end_with_query ? '' : $ellip ) ); |
256
|
25
|
|
|
|
|
9384
|
return $extract; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
else { |
259
|
4
|
|
|
|
|
396
|
return $snip; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
else { |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
#warn "no spans. using dumb snip"; |
265
|
1
|
|
|
|
|
5
|
return $self->_dumb( $_[0] ); |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub _get_offsets { |
271
|
25
|
|
|
25
|
|
54
|
my $self = shift; |
272
|
25
|
|
|
|
|
3434
|
return $self->{_tokenizer}->get_offsets( @_, $self->{_qre} ); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub _offset { |
276
|
25
|
|
|
25
|
|
43
|
my $self = shift; |
277
|
25
|
|
|
|
|
87
|
my $txt = shift; |
278
|
25
|
|
|
|
|
77
|
my $offsets = $self->_get_offsets($txt); |
279
|
25
|
|
|
|
|
103
|
my $snips = $self->_get_offset_snips( $txt, $offsets ); |
280
|
25
|
|
|
|
|
156
|
return $self->_token( join( '', @$snips ) ); |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
sub _get_offset_snips { |
284
|
25
|
|
|
25
|
|
40
|
my $self = shift; |
285
|
25
|
|
|
|
|
44
|
my $txt = shift; |
286
|
25
|
|
|
|
|
49
|
my $offsets = shift; |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# grab $size chars on either side of each offset |
289
|
|
|
|
|
|
|
# and tokenize each. |
290
|
|
|
|
|
|
|
# $size should be nice and wide to minimize the substr() calls. |
291
|
25
|
|
|
|
|
72
|
my $size = $self->max_chars * 10; |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
#warn "window size $size"; |
294
|
|
|
|
|
|
|
|
295
|
25
|
|
|
|
|
38
|
my @buf; |
296
|
25
|
|
|
|
|
436
|
my $len = length($txt); |
297
|
25
|
100
|
|
|
|
84
|
if ( $size > $len ) { |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
#warn "window bigger than document"; |
300
|
20
|
|
|
|
|
83
|
return [$txt]; |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
5
|
|
|
|
|
11
|
my ( $seen_start, $seen_end ); |
304
|
5
|
|
|
|
|
6
|
my $last_ending = 0; |
305
|
5
|
|
|
|
|
14
|
for my $pos (@$offsets) { |
306
|
|
|
|
|
|
|
|
307
|
34
|
|
|
|
|
34
|
my $tmp; |
308
|
|
|
|
|
|
|
|
309
|
34
|
|
|
|
|
50
|
my $start = $pos - int( $size / 2 ); |
310
|
34
|
|
|
|
|
44
|
my $end = $pos + int( $size / 2 ); |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# avoid overlaps |
313
|
34
|
100
|
100
|
|
|
77
|
if ( $last_ending && $start < $last_ending ) { |
314
|
26
|
|
|
|
|
30
|
$start = $last_ending + 1; |
315
|
26
|
|
|
|
|
33
|
$end = $start + $size; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
#warn "$start .. $pos .. $end"; |
319
|
|
|
|
|
|
|
|
320
|
34
|
100
|
66
|
|
|
87
|
if ( $pos > $end or $pos < $start ) { |
321
|
23
|
|
|
|
|
34
|
next; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
11
|
|
|
|
|
19
|
$last_ending = $end; |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
#warn "$start .. $end"; |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# if $pos is close to the front of $txt |
329
|
11
|
100
|
|
|
|
28
|
if ( $start <= 0 ) { |
|
|
100
|
|
|
|
|
|
330
|
1
|
50
|
|
|
|
3
|
next if $seen_start++; |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
#warn "start"; |
333
|
1
|
|
|
|
|
3
|
$tmp = substr( $txt, 0, $size ); |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# if $pos is somewhere near the end |
337
|
|
|
|
|
|
|
elsif ( $end > $len ) { |
338
|
2
|
50
|
|
|
|
11
|
next if $seen_end++; |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
#warn "end"; |
341
|
2
|
|
|
|
|
16
|
$tmp = substr( $txt, ( $len - $size ) ); |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
# default is somewhere in the ripe middle. |
345
|
|
|
|
|
|
|
else { |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
#warn "middle"; |
348
|
8
|
|
|
|
|
152
|
$tmp = substr( $txt, $start, $size ); |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
11
|
|
|
|
|
28
|
push @buf, $tmp; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
|
354
|
5
|
|
|
|
|
17
|
return \@buf; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
sub _loop { |
358
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
359
|
0
|
|
|
|
|
0
|
my $txt = shift; |
360
|
0
|
|
|
|
|
0
|
my $regexp = $self->{_qre}; |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
#carp "loop snip: $txt"; |
363
|
|
|
|
|
|
|
|
364
|
0
|
0
|
|
|
|
0
|
$self->debug and carp "loop snip regexp: $regexp"; |
365
|
|
|
|
|
|
|
|
366
|
0
|
|
0
|
|
|
0
|
my $debug = $self->debug || 0; |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# no matches |
369
|
0
|
0
|
|
|
|
0
|
return $self->_dumb($txt) unless $txt =~ m/$regexp/; |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
#carp "loop snip: $txt"; |
372
|
|
|
|
|
|
|
|
373
|
0
|
|
|
|
|
0
|
my $context = $self->context - 1; |
374
|
0
|
|
0
|
|
|
0
|
my $occur = $self->occur || 1; |
375
|
0
|
|
|
|
|
0
|
my @snips; |
376
|
|
|
|
|
|
|
|
377
|
0
|
|
|
|
|
0
|
my $notwc = $self->{_wc_regexp}; |
378
|
|
|
|
|
|
|
|
379
|
0
|
|
|
|
|
0
|
my @words = split( /($notwc)/, $txt ); |
380
|
0
|
|
|
|
|
0
|
my $count = -1; |
381
|
0
|
|
|
|
|
0
|
my $start_again = $count; |
382
|
0
|
|
|
|
|
0
|
my $total = 0; |
383
|
0
|
|
|
|
|
0
|
my $first_match = 0; |
384
|
|
|
|
|
|
|
|
385
|
0
|
|
|
|
|
0
|
WORD: for my $w (@words) { |
386
|
|
|
|
|
|
|
|
387
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
388
|
0
|
0
|
|
|
|
0
|
warn ">>\n" if $count % 2; |
389
|
0
|
|
|
|
|
0
|
warn "word: '$w'\n"; |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
0
|
$count++; |
393
|
0
|
0
|
|
|
|
0
|
next WORD if $count < $start_again; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
# the next WORD lets us skip past the last frag we excerpted |
396
|
|
|
|
|
|
|
|
397
|
0
|
|
|
|
|
0
|
my $last = $count - 1; |
398
|
0
|
|
|
|
|
0
|
my $next = $count + 1; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
#warn '-' x 30 . "\n"; |
401
|
0
|
0
|
|
|
|
0
|
if ( $w =~ m/^$regexp$/ ) { |
402
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
404
|
0
|
|
|
|
|
0
|
warn "w: '$w' match: '$1'\n"; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
0
|
|
|
|
|
0
|
$first_match = $count; |
408
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
0
|
my $before = $last - $context; |
410
|
0
|
0
|
|
|
|
0
|
$before = 0 if $before < 0; |
411
|
0
|
|
|
|
|
0
|
my $after = $next + $context; |
412
|
0
|
0
|
|
|
|
0
|
$after = $#words if $after > $#words; |
413
|
|
|
|
|
|
|
|
414
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
415
|
0
|
|
|
|
|
0
|
warn "$before .. $last, $count, $next .. $after\n"; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
0
|
my @before = @words[ $before .. $last ]; |
419
|
0
|
|
|
|
|
0
|
my @after = @words[ $next .. $after ]; |
420
|
|
|
|
|
|
|
|
421
|
0
|
|
|
|
|
0
|
my $this_snip_matches = grep {m/^$regexp$/i} ( @before, @after ); |
|
0
|
|
|
|
|
0
|
|
422
|
0
|
0
|
|
|
|
0
|
if ($this_snip_matches) { |
423
|
0
|
|
|
|
|
0
|
$after += $this_snip_matches; |
424
|
0
|
|
|
|
|
0
|
@after = @words[ $next .. $after ]; |
425
|
|
|
|
|
|
|
} |
426
|
0
|
|
|
|
|
0
|
$total += $this_snip_matches; |
427
|
0
|
|
|
|
|
0
|
$total++; # for current $w |
428
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
0
|
my $t = join( '', @before, $w, @after ); |
430
|
|
|
|
|
|
|
|
431
|
0
|
0
|
|
|
|
0
|
$t .= $ellip unless $count == $#words; |
432
|
|
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
434
|
0
|
|
|
|
|
0
|
warn "t: $t\n"; |
435
|
0
|
|
|
|
|
0
|
warn "this_snip_matches: $this_snip_matches\n"; |
436
|
0
|
|
|
|
|
0
|
warn "total: $total\n"; |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
|
439
|
0
|
|
|
|
|
0
|
push( @snips, [ $t, $this_snip_matches + 1 ] ); # +1 for $w |
440
|
0
|
|
|
|
|
0
|
$start_again = $after; |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# sort by match density. |
446
|
|
|
|
|
|
|
# consistent with HeatMap and lets us find |
447
|
|
|
|
|
|
|
# the *best* match, including phrases. |
448
|
0
|
|
|
|
|
0
|
@snips = map { $_->[0] } sort { $b->[1] <=> $a->[1] } @snips; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
449
|
|
|
|
|
|
|
|
450
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
451
|
0
|
|
|
|
|
0
|
carp "snips: " . scalar @snips; |
452
|
0
|
|
|
|
|
0
|
carp "words: $count\n"; |
453
|
0
|
|
|
|
|
0
|
carp "grandtotal: $total\n"; |
454
|
0
|
|
|
|
|
0
|
carp "occur: $occur\n"; |
455
|
0
|
|
|
|
|
0
|
carp '-' x 50 . "\n"; |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
0
|
|
|
|
|
0
|
$self->count( scalar(@snips) + $self->count ); |
460
|
0
|
|
|
|
|
0
|
my $last_snip = $occur - 1; |
461
|
0
|
0
|
|
|
|
0
|
if ( $last_snip > $#snips ) { |
462
|
0
|
|
|
|
|
0
|
$last_snip = $#snips; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
#warn dump \@snips; |
466
|
0
|
|
|
|
|
0
|
my $snippet = join( '', @snips[ 0 .. $last_snip ] ); |
467
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "before no_start_partial: '$snippet'\n"; |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
#_no_start_partial($snippet); |
470
|
0
|
0
|
|
|
|
0
|
$snippet = $ellip . $snippet if $first_match; |
471
|
|
|
|
|
|
|
|
472
|
0
|
|
|
|
|
0
|
return $snippet; |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub _re { |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# get first N matches for each q, then take one of each till we have $occur |
478
|
|
|
|
|
|
|
|
479
|
1
|
|
|
1
|
|
3
|
my $self = shift; |
480
|
1
|
|
|
|
|
2
|
my $text = shift; |
481
|
1
|
|
|
|
|
2
|
my @q = @{ $self->query->terms }; |
|
1
|
|
|
|
|
5
|
|
482
|
1
|
|
|
|
|
3
|
my $occur = $self->occur; |
483
|
1
|
|
|
|
|
5
|
my $Nchar = $self->context * $self->word_len; |
484
|
1
|
|
|
|
|
2
|
my $total = 0; |
485
|
1
|
|
|
|
|
2
|
my $notwc = $self->{_wc_regexp}; |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# get minimum number of snips necessary to meet $occur |
488
|
1
|
|
|
|
|
3
|
my $snip_per_q = int( $occur / scalar(@q) ); |
489
|
1
|
|
50
|
|
|
3
|
$snip_per_q ||= 1; |
490
|
|
|
|
|
|
|
|
491
|
1
|
|
|
|
|
20
|
my ( %snips, @snips, %ranges, $snip_starts_with_query ); |
492
|
1
|
|
|
|
|
2
|
$snip_starts_with_query = 0; |
493
|
|
|
|
|
|
|
|
494
|
1
|
|
|
|
|
2
|
Q: for my $q (@q) { |
495
|
1
|
|
|
|
|
6
|
$snips{$q} = { t => [], offset => [] }; |
496
|
|
|
|
|
|
|
|
497
|
1
|
50
|
|
|
|
48
|
$self->debug and warn "$q : $snip_starts_with_query"; |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# try simple regexp first, then more complex if we don't match |
500
|
|
|
|
|
|
|
next Q |
501
|
|
|
|
|
|
|
if $self->_re_match( \$text, $self->query->regex_for($q)->plain, |
502
|
1
|
50
|
|
|
|
34
|
\$total, $snips{$q}, \%ranges, $Nchar, $snip_per_q, |
503
|
|
|
|
|
|
|
\$snip_starts_with_query ); |
504
|
|
|
|
|
|
|
|
505
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "failed match on plain regexp"; |
506
|
|
|
|
|
|
|
|
507
|
0
|
|
|
|
|
0
|
pos $text = 0; # do we really need to reset this? |
508
|
|
|
|
|
|
|
|
509
|
0
|
0
|
|
|
|
0
|
unless ( |
510
|
|
|
|
|
|
|
$self->_re_match( |
511
|
|
|
|
|
|
|
\$text, $self->query->regex_for($q)->html, |
512
|
|
|
|
|
|
|
\$total, $snips{$q}, |
513
|
|
|
|
|
|
|
\%ranges, $Nchar, |
514
|
|
|
|
|
|
|
$snip_per_q, \$snip_starts_with_query |
515
|
|
|
|
|
|
|
) |
516
|
|
|
|
|
|
|
) |
517
|
|
|
|
|
|
|
{ |
518
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "failed match on html regexp"; |
519
|
|
|
|
|
|
|
} |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
1
|
50
|
|
|
|
3
|
return $self->_dumb($text) unless $total; |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# get all snips into one array in order they appeared in $text |
526
|
|
|
|
|
|
|
# should be a max of $snip_per_q in any one $q snip array |
527
|
|
|
|
|
|
|
# so we should have at least $occur in total, |
528
|
|
|
|
|
|
|
# which we'll splice() if need be. |
529
|
|
|
|
|
|
|
|
530
|
1
|
|
|
|
|
2
|
my %offsets; |
531
|
1
|
|
|
|
|
3
|
for my $q ( keys %snips ) { |
532
|
1
|
|
|
|
|
2
|
my @s = @{ $snips{$q}->{t} }; |
|
1
|
|
|
|
|
4
|
|
533
|
1
|
|
|
|
|
1
|
my @o = @{ $snips{$q}->{offset} }; |
|
1
|
|
|
|
|
3
|
|
534
|
|
|
|
|
|
|
|
535
|
1
|
|
|
|
|
1
|
my $i = 0; |
536
|
1
|
|
|
|
|
2
|
for (@s) { |
537
|
1
|
|
|
|
|
4
|
$offsets{$_} = $o[$i]; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
} |
540
|
1
|
|
|
|
|
4
|
@snips = sort { $offsets{$a} <=> $offsets{$b} } keys %offsets; |
|
0
|
|
|
|
|
0
|
|
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# max = $occur |
543
|
1
|
|
|
|
|
4
|
@snips = splice @snips, 0, $occur; |
544
|
|
|
|
|
|
|
|
545
|
1
|
50
|
|
|
|
16
|
$self->debug and warn dump( \@snips ); |
546
|
|
|
|
|
|
|
|
547
|
1
|
|
|
|
|
8
|
my $snip = join( $ellip, @snips ); |
548
|
1
|
50
|
|
|
|
5
|
_no_start_partial($snip) unless $snip_starts_with_query; |
549
|
1
|
50
|
|
|
|
23
|
$snip = $ellip . $snip unless $text =~ m/^\Q$snips[0]/i; |
550
|
1
|
50
|
|
|
|
31
|
$snip .= $ellip unless $text =~ m/\Q$snips[-1]$/i; |
551
|
|
|
|
|
|
|
|
552
|
1
|
|
|
|
|
5
|
$self->count( scalar(@snips) + $self->count ); |
553
|
|
|
|
|
|
|
|
554
|
1
|
|
|
|
|
21
|
return $snip; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
sub _re_match { |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
# the .{0,$Nchar} regexp slows things WAY down. so just match, |
561
|
|
|
|
|
|
|
# then use pos() to get chars before and after. |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
# if escape = 0 and if prefix or suffix contains a < or >, |
564
|
|
|
|
|
|
|
# try to include entire tagset. |
565
|
|
|
|
|
|
|
|
566
|
1
|
|
|
1
|
|
4
|
my ( $self, $text, $re, $total, $snips, $ranges, $Nchar, $max_snips, |
567
|
|
|
|
|
|
|
$snip_starts_with_query ) |
568
|
|
|
|
|
|
|
= @_; |
569
|
|
|
|
|
|
|
|
570
|
1
|
|
|
|
|
4
|
my $t_len = length $$text; |
571
|
|
|
|
|
|
|
|
572
|
1
|
|
|
|
|
2
|
my $cnt = 0; |
573
|
|
|
|
|
|
|
|
574
|
1
|
50
|
|
|
|
16
|
if ( $self->debug ) { |
575
|
0
|
|
|
|
|
0
|
warn "re_match regexp: >$re<\n"; |
576
|
0
|
|
|
|
|
0
|
warn "max_snips: $max_snips\n"; |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
|
579
|
1
|
|
|
|
|
72
|
RE: while ( $$text =~ m/$re/g ) { |
580
|
|
|
|
|
|
|
|
581
|
1
|
|
|
|
|
4
|
my $pos = pos $$text; |
582
|
1
|
|
|
|
|
2
|
my $before_match = $1; |
583
|
1
|
|
|
|
|
3
|
my $match = $2; |
584
|
1
|
|
|
|
|
2
|
my $after_match = $3; |
585
|
1
|
|
|
|
|
3
|
$cnt++; |
586
|
1
|
|
|
|
|
1
|
my $len = length $match; |
587
|
1
|
|
|
|
|
3
|
my $blen = length $before_match; |
588
|
1
|
50
|
|
|
|
14
|
if ( $self->debug ) { |
589
|
0
|
|
|
|
|
0
|
warn "re: '$re'\n"; |
590
|
0
|
|
|
|
|
0
|
warn "\$1 = '$before_match' = ", ord($before_match), "\n"; |
591
|
0
|
|
|
|
|
0
|
warn "\$2 = '$match'\n"; |
592
|
0
|
|
|
|
|
0
|
warn "\$3 = '$after_match' = ", ord($after_match), "\n"; |
593
|
0
|
|
|
|
|
0
|
warn "pos = $pos\n"; |
594
|
0
|
|
|
|
|
0
|
warn "len = $len\n"; |
595
|
0
|
|
|
|
|
0
|
warn "blen= $blen\n"; |
596
|
|
|
|
|
|
|
} |
597
|
|
|
|
|
|
|
|
598
|
1
|
0
|
33
|
|
|
18
|
if ( $self->debug && exists $ranges->{$pos} ) { |
599
|
0
|
|
|
|
|
0
|
warn "already found $pos\n"; |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
|
602
|
1
|
50
|
|
|
|
7
|
next RE if exists $ranges->{$pos}; |
603
|
|
|
|
|
|
|
|
604
|
1
|
|
50
|
|
|
4
|
my $start_match = $pos - $len - ( $blen || 1 ); |
605
|
1
|
50
|
|
|
|
2
|
$start_match = 0 if $start_match < 0; |
606
|
|
|
|
|
|
|
|
607
|
1
|
50
|
|
|
|
3
|
$$snip_starts_with_query = 1 if $start_match == 0; |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# sanity |
610
|
1
|
50
|
|
|
|
25
|
$self->debug |
611
|
|
|
|
|
|
|
and warn "match should be [$start_match $len]: '", |
612
|
|
|
|
|
|
|
substr( $$text, $start_match, $len ), "'\n"; |
613
|
|
|
|
|
|
|
|
614
|
1
|
50
|
|
|
|
8
|
my $prefix_start |
615
|
|
|
|
|
|
|
= $start_match < $Nchar |
616
|
|
|
|
|
|
|
? 0 |
617
|
|
|
|
|
|
|
: $start_match - $Nchar; |
618
|
|
|
|
|
|
|
|
619
|
1
|
|
|
|
|
1
|
my $prefix_len = $start_match - $prefix_start; |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
#$prefix_len++; $prefix_len++; |
622
|
|
|
|
|
|
|
|
623
|
1
|
|
|
|
|
2
|
my $suffix_start = $pos - length($after_match); |
624
|
1
|
|
|
|
|
2
|
my $suffix_len = $Nchar; |
625
|
1
|
|
|
|
|
1
|
my $end = $suffix_start + $suffix_len; |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# if $end extends beyond, that's ok, substr compensates |
628
|
|
|
|
|
|
|
|
629
|
1
|
|
|
|
|
108
|
$ranges->{$_}++ for ( $prefix_start .. $end ); |
630
|
1
|
|
|
|
|
3
|
my $prefix = substr( $$text, $prefix_start, $prefix_len ); |
631
|
1
|
|
|
|
|
3
|
my $suffix = substr( $$text, $suffix_start, $suffix_len ); |
632
|
|
|
|
|
|
|
|
633
|
1
|
50
|
|
|
|
16
|
if ( $self->debug ) { |
634
|
0
|
|
|
|
|
0
|
warn "prefix_start = $prefix_start\n"; |
635
|
0
|
|
|
|
|
0
|
warn "prefix_len = $prefix_len\n"; |
636
|
0
|
|
|
|
|
0
|
warn "start_match = $start_match\n"; |
637
|
0
|
|
|
|
|
0
|
warn "len = $len\n"; |
638
|
0
|
|
|
|
|
0
|
warn "pos = $pos\n"; |
639
|
0
|
|
|
|
|
0
|
warn "char = $Nchar\n"; |
640
|
0
|
|
|
|
|
0
|
warn "suffix_start = $suffix_start\n"; |
641
|
0
|
|
|
|
|
0
|
warn "suffix_len = $suffix_len\n"; |
642
|
0
|
|
|
|
|
0
|
warn "end = $end\n"; |
643
|
0
|
|
|
|
|
0
|
warn "prefix: '$prefix'\n"; |
644
|
0
|
|
|
|
|
0
|
warn "match: '$match'\n"; |
645
|
0
|
|
|
|
|
0
|
warn "suffix: '$suffix'\n"; |
646
|
|
|
|
|
|
|
} |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# try and get whole words if we split one up |
649
|
|
|
|
|
|
|
# _no_*_partial does this more rudely |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# might be faster to do m/(\S)*$prefix/i |
652
|
|
|
|
|
|
|
# but we couldn't guarantee position accuracy |
653
|
|
|
|
|
|
|
# e.g. if $prefix matched more than once in $$text, |
654
|
|
|
|
|
|
|
# we might pull the wrong \S* |
655
|
|
|
|
|
|
|
|
656
|
1
|
50
|
33
|
|
|
8
|
unless ( $prefix =~ m/^\s/ |
657
|
|
|
|
|
|
|
or substr( $$text, $prefix_start - 1, 1 ) =~ m/(\s)/ ) |
658
|
|
|
|
|
|
|
{ |
659
|
0
|
|
0
|
|
|
0
|
while ( --$prefix_start >= 0 |
660
|
|
|
|
|
|
|
and substr( $$text, $prefix_start, 1 ) =~ m/(\S)/ ) |
661
|
|
|
|
|
|
|
{ |
662
|
0
|
|
|
|
|
0
|
my $onemorechar = $1; |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
#warn "adding $onemorechar to prefix\n"; |
665
|
0
|
|
|
|
|
0
|
$prefix = $onemorechar . $prefix; |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
#last if $prefix_start <= 0 or $onemorechar !~ /\S/; |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# do same for suffix |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# We get error here under -w |
674
|
|
|
|
|
|
|
# about substr outside of string -- is $end undefined sometimes?? |
675
|
|
|
|
|
|
|
|
676
|
1
|
50
|
33
|
|
|
9
|
unless ( $suffix =~ m/\s$/ or substr( $$text, $end, 1 ) =~ m/(\s)/ ) { |
677
|
1
|
|
66
|
|
|
8
|
while ( $end <= $t_len |
678
|
|
|
|
|
|
|
and substr( $$text, $end++, 1 ) =~ m/(\S)/ ) |
679
|
|
|
|
|
|
|
{ |
680
|
|
|
|
|
|
|
|
681
|
3
|
|
|
|
|
4
|
my $onemore = $1; |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
#warn "adding $onemore to suffix\n"; |
684
|
|
|
|
|
|
|
#warn "before '$suffix'\n"; |
685
|
3
|
|
|
|
|
27
|
$suffix .= $onemore; |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
#warn "after '$suffix'\n"; |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
} |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# will likely fail to include one half of tagset if other is complete |
692
|
1
|
50
|
|
|
|
4
|
unless ( $self->escape ) { |
693
|
1
|
|
|
|
|
2
|
my $sanity = 0; |
694
|
1
|
|
|
|
|
3
|
my @l = ( $prefix =~ /(<)/g ); |
695
|
1
|
|
|
|
|
3
|
my @r = ( $prefix =~ /(>)/g ); |
696
|
1
|
|
|
|
|
4
|
while ( scalar @l != scalar @r ) { |
697
|
|
|
|
|
|
|
|
698
|
0
|
|
|
|
|
0
|
@l = ( $prefix =~ /(<)/g ); |
699
|
0
|
|
|
|
|
0
|
@r = ( $prefix =~ /(>)/g ); |
700
|
|
|
|
|
|
|
last |
701
|
|
|
|
|
|
|
if scalar @l |
702
|
0
|
0
|
|
|
|
0
|
== scalar @r; # don't take any more than we need to |
703
|
|
|
|
|
|
|
|
704
|
0
|
|
|
|
|
0
|
my $onemorechar = substr( $$text, $prefix_start--, 1 ); |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
#warn "tagfix: adding $onemorechar to prefix\n"; |
707
|
0
|
|
|
|
|
0
|
$prefix = $onemorechar . $prefix; |
708
|
0
|
0
|
|
|
|
0
|
last if $prefix_start <= 0; |
709
|
0
|
0
|
|
|
|
0
|
last if $sanity++ > 100; |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
|
713
|
1
|
|
|
|
|
2
|
$sanity = 0; |
714
|
1
|
|
33
|
|
|
4
|
while ( $suffix =~ /<(\w+)/ && $suffix !~ /<\/$1>/ ) { |
715
|
|
|
|
|
|
|
|
716
|
0
|
|
|
|
|
0
|
my $onemorechar = substr( $$text, $end, 1 ); |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
#warn "tagfix: adding $onemorechar to suffix\n"; |
719
|
0
|
|
|
|
|
0
|
$suffix .= $onemorechar; |
720
|
0
|
0
|
|
|
|
0
|
last if ++$end > $t_len; |
721
|
0
|
0
|
|
|
|
0
|
last if $sanity++ > 100; |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
} |
724
|
|
|
|
|
|
|
} |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# warn "prefix: '$prefix'\n"; |
727
|
|
|
|
|
|
|
# warn "match: '$match'\n"; |
728
|
|
|
|
|
|
|
# warn "suffix: '$suffix'\n"; |
729
|
|
|
|
|
|
|
|
730
|
1
|
|
|
|
|
5
|
my $context = join( '', $prefix, $match, $suffix ); |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
#warn "context is '$context'\n"; |
733
|
|
|
|
|
|
|
|
734
|
1
|
|
|
|
|
2
|
push( @{ $snips->{t} }, $context ); |
|
1
|
|
|
|
|
3
|
|
735
|
1
|
|
|
|
|
2
|
push( @{ $snips->{offset} }, $prefix_start ); |
|
1
|
|
|
|
|
3
|
|
736
|
|
|
|
|
|
|
|
737
|
1
|
|
|
|
|
2
|
$$total++; |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
# warn '-' x 40, "\n"; |
740
|
|
|
|
|
|
|
|
741
|
1
|
50
|
|
|
|
4
|
last if $cnt >= $max_snips; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
|
744
|
1
|
|
|
|
|
15
|
return $cnt; |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
sub _dumb { |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# just grap the first X chars and return |
750
|
|
|
|
|
|
|
|
751
|
3
|
|
|
3
|
|
7
|
my $self = shift; |
752
|
3
|
100
|
|
|
|
131
|
return '' unless $self->show; |
753
|
|
|
|
|
|
|
|
754
|
1
|
|
|
|
|
3
|
my $txt = shift; |
755
|
1
|
|
|
|
|
4
|
my $max = $self->max_chars; |
756
|
1
|
|
|
|
|
5
|
$self->type_used('dumb'); |
757
|
|
|
|
|
|
|
|
758
|
1
|
|
|
|
|
3
|
my $show = substr( $txt, 0, $max ); |
759
|
1
|
|
|
|
|
5
|
_no_end_partial($show); |
760
|
1
|
|
|
|
|
3
|
$show .= $ellip; |
761
|
|
|
|
|
|
|
|
762
|
1
|
|
|
|
|
5
|
$self->count( 1 + $self->count ); |
763
|
|
|
|
|
|
|
|
764
|
1
|
|
|
|
|
4
|
return $show; |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
} |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
sub _no_start_partial { |
769
|
1
|
|
|
1
|
|
4
|
$_[0] =~ s/^\S+\s+//gs; |
770
|
|
|
|
|
|
|
} |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
sub _no_end_partial { |
773
|
1
|
|
|
1
|
|
4
|
$_[0] =~ s/\s+\S+$//gs; |
774
|
|
|
|
|
|
|
} |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
sub _escape { |
777
|
31
|
50
|
|
31
|
|
100
|
if ( $_[0]->escape ) { |
778
|
0
|
|
|
|
|
0
|
return Search::Tools::XML->escape( $_[1] ); |
779
|
|
|
|
|
|
|
} |
780
|
|
|
|
|
|
|
else { |
781
|
31
|
|
|
|
|
80
|
return $_[1]; |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
1; |
786
|
|
|
|
|
|
|
__END__ |