| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Search::Tools::Snipper; |
|
2
|
16
|
|
|
16
|
|
135963
|
use Moo; |
|
|
16
|
|
|
|
|
43933
|
|
|
|
16
|
|
|
|
|
78
|
|
|
3
|
|
|
|
|
|
|
extends 'Search::Tools::Object'; |
|
4
|
|
|
|
|
|
|
with 'Search::Tools::ArgNormalizer'; |
|
5
|
16
|
|
|
16
|
|
8860
|
use Carp; |
|
|
16
|
|
|
|
|
31
|
|
|
|
16
|
|
|
|
|
742
|
|
|
6
|
16
|
|
|
16
|
|
798
|
use Data::Dump qw( dump ); |
|
|
16
|
|
|
|
|
9171
|
|
|
|
16
|
|
|
|
|
514
|
|
|
7
|
16
|
|
|
16
|
|
2456
|
use Search::Tools::XML; |
|
|
16
|
|
|
|
|
30
|
|
|
|
16
|
|
|
|
|
411
|
|
|
8
|
16
|
|
|
16
|
|
84
|
use Search::Tools::UTF8; |
|
|
16
|
|
|
|
|
24
|
|
|
|
16
|
|
|
|
|
1509
|
|
|
9
|
16
|
|
|
16
|
|
3965
|
use Search::Tools::Tokenizer; |
|
|
16
|
|
|
|
|
39
|
|
|
|
16
|
|
|
|
|
472
|
|
|
10
|
16
|
|
|
16
|
|
6349
|
use Search::Tools::HeatMap; |
|
|
16
|
|
|
|
|
41
|
|
|
|
16
|
|
|
|
|
438
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
16
|
|
|
16
|
|
97
|
use namespace::autoclean; |
|
|
16
|
|
|
|
|
26
|
|
|
|
16
|
|
|
|
|
85
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# extra space here so pmvers works against $VERSION |
|
17
|
|
|
|
|
|
|
our $ellip = ' ... '; |
|
18
|
|
|
|
|
|
|
our $DefaultSnipper = 'offset'; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# |
|
21
|
|
|
|
|
|
|
# TODO allow for returning an array ref of |
|
22
|
|
|
|
|
|
|
# extracts instead of joining them all with $ellip |
|
23
|
|
|
|
|
|
|
# |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my @attrs = qw( |
|
26
|
|
|
|
|
|
|
as_sentences |
|
27
|
|
|
|
|
|
|
collapse_whitespace |
|
28
|
|
|
|
|
|
|
context |
|
29
|
|
|
|
|
|
|
count |
|
30
|
|
|
|
|
|
|
escape |
|
31
|
|
|
|
|
|
|
force |
|
32
|
|
|
|
|
|
|
ignore_length |
|
33
|
|
|
|
|
|
|
max_chars |
|
34
|
|
|
|
|
|
|
occur |
|
35
|
|
|
|
|
|
|
query |
|
36
|
|
|
|
|
|
|
show |
|
37
|
|
|
|
|
|
|
snipper |
|
38
|
|
|
|
|
|
|
strip_markup |
|
39
|
|
|
|
|
|
|
treat_phrases_as_singles |
|
40
|
|
|
|
|
|
|
type |
|
41
|
|
|
|
|
|
|
type_used |
|
42
|
|
|
|
|
|
|
use_pp |
|
43
|
|
|
|
|
|
|
word_len |
|
44
|
|
|
|
|
|
|
); |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my %Defaults = ( |
|
47
|
|
|
|
|
|
|
type => $DefaultSnipper, |
|
48
|
|
|
|
|
|
|
occur => 5, |
|
49
|
|
|
|
|
|
|
max_chars => 300, |
|
50
|
|
|
|
|
|
|
context => 8, |
|
51
|
|
|
|
|
|
|
word_len => 4, # TODO still used? |
|
52
|
|
|
|
|
|
|
show => 1, |
|
53
|
|
|
|
|
|
|
collapse_whitespace => 1, |
|
54
|
|
|
|
|
|
|
escape => 0, |
|
55
|
|
|
|
|
|
|
force => 0, |
|
56
|
|
|
|
|
|
|
as_sentences => 0, |
|
57
|
|
|
|
|
|
|
ignore_length => 0, |
|
58
|
|
|
|
|
|
|
treat_phrases_as_singles => 1, |
|
59
|
|
|
|
|
|
|
strip_markup => 0, |
|
60
|
|
|
|
|
|
|
); |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
for my $attr (@attrs) { |
|
63
|
|
|
|
|
|
|
my $def = $Defaults{$attr} || undef; |
|
64
|
|
|
|
|
|
|
if ( defined $def ) { |
|
65
|
|
|
|
|
|
|
has( $attr => ( is => 'rw', default => sub {$def} ) ); |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
else { |
|
68
|
|
|
|
|
|
|
has( $attr => ( is => 'rw' ) ); |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub BUILD { |
|
73
|
30
|
|
|
30
|
1
|
132
|
my $self = shift; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
#dump $self; |
|
76
|
|
|
|
|
|
|
|
|
77
|
30
|
|
|
|
|
563
|
$self->{_tokenizer} = Search::Tools::Tokenizer->new( |
|
78
|
|
|
|
|
|
|
re => $self->query->qp->term_re, |
|
79
|
|
|
|
|
|
|
debug => $self->debug, |
|
80
|
|
|
|
|
|
|
); |
|
81
|
|
|
|
|
|
|
|
|
82
|
30
|
|
|
|
|
136
|
my $wc = $self->query->qp->word_characters; |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# regexp for splitting into terms in _re() |
|
85
|
30
|
|
|
|
|
745
|
$self->{_wc_regexp} = qr/[^$wc]+/io; |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
$self->{_qre} |
|
88
|
30
|
|
|
|
|
191
|
= $self->query->terms_as_regex( $self->treat_phrases_as_singles ); |
|
89
|
|
|
|
|
|
|
|
|
90
|
30
|
|
|
|
|
133
|
$self->count(0); |
|
91
|
|
|
|
|
|
|
|
|
92
|
30
|
|
|
|
|
1301
|
return $self; |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# I tried Text::Context but that was too slow. |
|
96
|
|
|
|
|
|
|
# Here are several different models. |
|
97
|
|
|
|
|
|
|
# I have found that _loop() is faster for single-word queries, |
|
98
|
|
|
|
|
|
|
# while _re() seems to be the best compromise between speed and accuracy. |
|
99
|
|
|
|
|
|
|
# New in version 0.24 is _token() which is mostly XS and should be best. |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub _pick_snipper { |
|
102
|
31
|
|
|
31
|
|
144
|
my ( $self, $text ) = @_; |
|
103
|
31
|
|
66
|
|
|
131
|
my $snipper_name = $self->type || $DefaultSnipper; |
|
104
|
31
|
100
|
|
|
|
175
|
if ( $self->query->qp->stemmer ) { |
|
105
|
5
|
|
|
|
|
10
|
$snipper_name = 'token'; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
31
|
|
|
|
|
75
|
my $method_name = '_' . $snipper_name; |
|
108
|
31
|
|
|
|
|
95
|
$self->type_used($snipper_name); |
|
109
|
31
|
|
|
31
|
|
120
|
my $func = sub { shift->$method_name(@_) }; |
|
|
31
|
|
|
|
|
113
|
|
|
110
|
31
|
|
|
|
|
112
|
return $func; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# 2 passes, excluding ' ' in the first one, |
|
114
|
|
|
|
|
|
|
# is 60% faster than a single pass including ' '. |
|
115
|
|
|
|
|
|
|
# likely because there are far fewer matches |
|
116
|
|
|
|
|
|
|
# in either of the 2 than the 1. |
|
117
|
|
|
|
|
|
|
sub _normalize_whitespace { |
|
118
|
62
|
|
|
62
|
|
2242
|
$_[0] =~ s,[\n\r\t\xa0]+,\ ,go; |
|
119
|
62
|
|
|
|
|
4911
|
$_[0] =~ s,\ +, ,go; # \ \ + was 16x slower on bigfile!! |
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
sub snip { |
|
123
|
33
|
|
|
33
|
1
|
107
|
my $self = shift; |
|
124
|
33
|
|
|
|
|
59
|
my $text = shift; |
|
125
|
33
|
50
|
|
|
|
92
|
if ( !defined $text ) { |
|
126
|
0
|
|
|
|
|
0
|
croak "text required to snip"; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# normalize encoding, esp for regular expressions. |
|
130
|
33
|
|
|
|
|
121
|
$text = to_utf8($text); |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# don't snip if we're less than the threshold |
|
133
|
33
|
100
|
100
|
|
|
639
|
if ( length($text) < $self->max_chars && !$self->ignore_length ) { |
|
134
|
2
|
50
|
|
|
|
7
|
if ( $self->show ) { |
|
135
|
2
|
50
|
|
|
|
6
|
if ( $self->strip_markup ) { |
|
136
|
0
|
|
|
|
|
0
|
return Search::Tools::XML->no_html($text); |
|
137
|
|
|
|
|
|
|
} |
|
138
|
2
|
|
|
|
|
10
|
return $text; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
0
|
|
|
|
|
0
|
return ''; |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
|
|
143
|
31
|
100
|
|
|
|
98
|
if ( $self->strip_markup ) { |
|
144
|
1
|
|
|
|
|
8
|
$text = Search::Tools::XML->no_html($text); |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
|
|
147
|
31
|
50
|
|
|
|
89
|
if ( $self->collapse_whitespace ) { |
|
148
|
31
|
|
|
|
|
91
|
_normalize_whitespace($text); |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# we calculate the snipper each time since caller |
|
152
|
|
|
|
|
|
|
# may set type() or snipper() between calls to snip(). |
|
153
|
31
|
|
33
|
|
|
193
|
my $func = $self->snipper || $self->_pick_snipper($text); |
|
154
|
|
|
|
|
|
|
|
|
155
|
31
|
|
|
|
|
72
|
my $s = $func->( $self, $text ); |
|
156
|
|
|
|
|
|
|
|
|
157
|
31
|
50
|
|
|
|
626
|
$self->debug and warn "snipped: '$s'\n"; |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# sanity check |
|
160
|
31
|
100
|
100
|
|
|
454
|
if ( length($s) > ( $self->max_chars * 4 ) && !$self->ignore_length ) { |
|
|
|
100
|
66
|
|
|
|
|
|
161
|
1
|
|
|
|
|
5
|
$s = $self->_dumb($s); |
|
162
|
1
|
50
|
|
|
|
23
|
$self->debug and warn "too long. dumb snip: '$s'\n"; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
elsif ( !length($s) && !$self->ignore_length ) { |
|
165
|
1
|
|
|
|
|
4
|
$s = $self->_dumb($text); |
|
166
|
1
|
50
|
|
|
|
14
|
$self->debug and warn "too short. dumb snip: '$s'\n"; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# escape entities before collapsing whitespace. |
|
170
|
31
|
|
|
|
|
100
|
$s = $self->_escape($s); |
|
171
|
|
|
|
|
|
|
|
|
172
|
31
|
50
|
|
|
|
88
|
if ( $self->collapse_whitespace ) { |
|
173
|
31
|
|
|
|
|
62
|
_normalize_whitespace($s); |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
|
|
176
|
31
|
|
|
|
|
277
|
return $s; |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub _token { |
|
181
|
30
|
|
|
30
|
|
50
|
my $self = shift; |
|
182
|
30
|
|
|
|
|
62
|
my $qre = $self->{_qre}; |
|
183
|
30
|
50
|
|
|
|
610
|
$self->debug and warn "\$qre: $qre"; |
|
184
|
|
|
|
|
|
|
|
|
185
|
30
|
100
|
|
|
|
237
|
my $method = ( $self->{use_pp} ) ? 'tokenize_pp' : 'tokenize'; |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# must split phrases into OR'd regex or else no heat is generated. |
|
188
|
30
|
|
|
|
|
52
|
my $qre_ORd = $qre; |
|
189
|
30
|
|
|
|
|
122
|
$qre_ORd =~ s/(\\ )+/\|/g; |
|
190
|
30
|
|
|
|
|
964
|
my $heat_seeker = qr/^$qre_ORd$/; |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# if stemmer is on, we must stem each token to look for a match |
|
193
|
30
|
100
|
|
|
|
190
|
if ( $self->query->qp->stemmer ) { |
|
194
|
5
|
|
|
|
|
16
|
my $stemmer = $self->query->qp->stemmer; |
|
195
|
5
|
|
|
|
|
10
|
my $qp = $self->query->qp; |
|
196
|
5
|
|
|
|
|
9
|
my $re = $heat_seeker; |
|
197
|
|
|
|
|
|
|
$heat_seeker = sub { |
|
198
|
486
|
|
|
486
|
|
737
|
my ($token) = @_; |
|
199
|
486
|
|
|
|
|
900
|
my $st = $stemmer->( $qp, $token->str ); |
|
200
|
486
|
|
|
|
|
5017
|
return $st =~ m/$re/; |
|
201
|
5
|
|
|
|
|
20
|
}; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
30
|
|
|
|
|
22071
|
my $tokens = $self->{_tokenizer}->$method( $_[0], $heat_seeker ); |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
#$self->debug and $tokens->dump; |
|
206
|
|
|
|
|
|
|
|
|
207
|
30
|
50
|
|
|
|
75
|
return $self->_dumb( $_[0] ) unless scalar @{ $tokens->get_heat }; |
|
|
30
|
|
|
|
|
177
|
|
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
my $heatmap = Search::Tools::HeatMap->new( |
|
210
|
|
|
|
|
|
|
tokens => $tokens, |
|
211
|
|
|
|
|
|
|
window_size => $self->{context}, |
|
212
|
|
|
|
|
|
|
as_sentences => $self->{as_sentences}, |
|
213
|
|
|
|
|
|
|
debug => $self->debug, |
|
214
|
|
|
|
|
|
|
_query => $self->query, |
|
215
|
|
|
|
|
|
|
_qre => $qre, |
|
216
|
|
|
|
|
|
|
_treat_phrases_as_singles => $self->{treat_phrases_as_singles}, |
|
217
|
30
|
|
|
|
|
656
|
_stemmer => $self->query->qp->stemmer, |
|
218
|
|
|
|
|
|
|
); |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# reduce noise in debug |
|
221
|
30
|
|
|
|
|
71
|
delete $heatmap->{_query}; |
|
222
|
|
|
|
|
|
|
|
|
223
|
30
|
50
|
|
|
|
655
|
$self->debug and warn "heatmap: " . dump $heatmap; |
|
224
|
|
|
|
|
|
|
|
|
225
|
30
|
|
|
|
|
256
|
my $tokens_arr = $tokens->as_array; |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
#warn "snips: " . dump $heatmap->spans; |
|
228
|
30
|
100
|
|
|
|
91
|
if ( $heatmap->has_spans ) { |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# stringify positions |
|
231
|
29
|
|
|
|
|
50
|
my @snips; |
|
232
|
29
|
|
|
|
|
42
|
for my $span ( @{ $heatmap->spans } ) { |
|
|
29
|
|
|
|
|
120
|
|
|
233
|
|
|
|
|
|
|
|
|
234
|
58
|
50
|
|
|
|
824
|
$self->debug and warn '>>>' . $span->{str_w_pos} . '<<<'; |
|
235
|
58
|
|
|
|
|
329
|
push( @snips, $span->{str} ); |
|
236
|
|
|
|
|
|
|
} |
|
237
|
29
|
|
|
|
|
89
|
my $occur_index = $self->occur - 1; |
|
238
|
29
|
100
|
|
|
|
91
|
if ( $#snips > $occur_index ) { |
|
239
|
5
|
|
|
|
|
18
|
@snips = @snips[ 0 .. $occur_index ]; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
29
|
|
|
|
|
154
|
my $snip = join( $ellip, @snips ); |
|
242
|
29
|
|
|
|
|
857
|
my $snips_start_with_query = $_[0] =~ m/^\Q$snip\E/; |
|
243
|
29
|
|
|
|
|
794
|
my $snips_end_with_query = $_[0] =~ m/\Q$snip\E$/; |
|
244
|
29
|
100
|
|
|
|
115
|
if ( $self->{as_sentences} ) { |
|
245
|
13
|
|
|
|
|
21
|
$snips_start_with_query = 1; |
|
246
|
13
|
|
|
|
|
117
|
$snips_end_with_query = $snip =~ m/[\.\?\!]\s*$/; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# if we are pulling out something less than the entire |
|
250
|
|
|
|
|
|
|
# text, insert ellipses... |
|
251
|
29
|
100
|
|
|
|
86
|
if ( $_[0] ne $snip ) { |
|
252
|
25
|
50
|
|
|
|
482
|
$self->debug and warn "extract is smaller than snip"; |
|
253
|
25
|
100
|
|
|
|
241
|
my $extract = join( '', |
|
|
|
100
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
( $snips_start_with_query ? '' : $ellip ), |
|
255
|
|
|
|
|
|
|
$snip, ( $snips_end_with_query ? '' : $ellip ) ); |
|
256
|
25
|
|
|
|
|
9384
|
return $extract; |
|
257
|
|
|
|
|
|
|
} |
|
258
|
|
|
|
|
|
|
else { |
|
259
|
4
|
|
|
|
|
396
|
return $snip; |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
else { |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
#warn "no spans. using dumb snip"; |
|
265
|
1
|
|
|
|
|
5
|
return $self->_dumb( $_[0] ); |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
} |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub _get_offsets { |
|
271
|
25
|
|
|
25
|
|
54
|
my $self = shift; |
|
272
|
25
|
|
|
|
|
3434
|
return $self->{_tokenizer}->get_offsets( @_, $self->{_qre} ); |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub _offset { |
|
276
|
25
|
|
|
25
|
|
43
|
my $self = shift; |
|
277
|
25
|
|
|
|
|
87
|
my $txt = shift; |
|
278
|
25
|
|
|
|
|
77
|
my $offsets = $self->_get_offsets($txt); |
|
279
|
25
|
|
|
|
|
103
|
my $snips = $self->_get_offset_snips( $txt, $offsets ); |
|
280
|
25
|
|
|
|
|
156
|
return $self->_token( join( '', @$snips ) ); |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
sub _get_offset_snips { |
|
284
|
25
|
|
|
25
|
|
40
|
my $self = shift; |
|
285
|
25
|
|
|
|
|
44
|
my $txt = shift; |
|
286
|
25
|
|
|
|
|
49
|
my $offsets = shift; |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# grab $size chars on either side of each offset |
|
289
|
|
|
|
|
|
|
# and tokenize each. |
|
290
|
|
|
|
|
|
|
# $size should be nice and wide to minimize the substr() calls. |
|
291
|
25
|
|
|
|
|
72
|
my $size = $self->max_chars * 10; |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
#warn "window size $size"; |
|
294
|
|
|
|
|
|
|
|
|
295
|
25
|
|
|
|
|
38
|
my @buf; |
|
296
|
25
|
|
|
|
|
436
|
my $len = length($txt); |
|
297
|
25
|
100
|
|
|
|
84
|
if ( $size > $len ) { |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
#warn "window bigger than document"; |
|
300
|
20
|
|
|
|
|
83
|
return [$txt]; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
|
|
303
|
5
|
|
|
|
|
11
|
my ( $seen_start, $seen_end ); |
|
304
|
5
|
|
|
|
|
6
|
my $last_ending = 0; |
|
305
|
5
|
|
|
|
|
14
|
for my $pos (@$offsets) { |
|
306
|
|
|
|
|
|
|
|
|
307
|
34
|
|
|
|
|
34
|
my $tmp; |
|
308
|
|
|
|
|
|
|
|
|
309
|
34
|
|
|
|
|
50
|
my $start = $pos - int( $size / 2 ); |
|
310
|
34
|
|
|
|
|
44
|
my $end = $pos + int( $size / 2 ); |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# avoid overlaps |
|
313
|
34
|
100
|
100
|
|
|
77
|
if ( $last_ending && $start < $last_ending ) { |
|
314
|
26
|
|
|
|
|
30
|
$start = $last_ending + 1; |
|
315
|
26
|
|
|
|
|
33
|
$end = $start + $size; |
|
316
|
|
|
|
|
|
|
} |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
#warn "$start .. $pos .. $end"; |
|
319
|
|
|
|
|
|
|
|
|
320
|
34
|
100
|
66
|
|
|
87
|
if ( $pos > $end or $pos < $start ) { |
|
321
|
23
|
|
|
|
|
34
|
next; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
|
|
|
|
|
|
|
|
324
|
11
|
|
|
|
|
19
|
$last_ending = $end; |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
#warn "$start .. $end"; |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# if $pos is close to the front of $txt |
|
329
|
11
|
100
|
|
|
|
28
|
if ( $start <= 0 ) { |
|
|
|
100
|
|
|
|
|
|
|
330
|
1
|
50
|
|
|
|
3
|
next if $seen_start++; |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
#warn "start"; |
|
333
|
1
|
|
|
|
|
3
|
$tmp = substr( $txt, 0, $size ); |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# if $pos is somewhere near the end |
|
337
|
|
|
|
|
|
|
elsif ( $end > $len ) { |
|
338
|
2
|
50
|
|
|
|
11
|
next if $seen_end++; |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
#warn "end"; |
|
341
|
2
|
|
|
|
|
16
|
$tmp = substr( $txt, ( $len - $size ) ); |
|
342
|
|
|
|
|
|
|
} |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
# default is somewhere in the ripe middle. |
|
345
|
|
|
|
|
|
|
else { |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
#warn "middle"; |
|
348
|
8
|
|
|
|
|
152
|
$tmp = substr( $txt, $start, $size ); |
|
349
|
|
|
|
|
|
|
} |
|
350
|
|
|
|
|
|
|
|
|
351
|
11
|
|
|
|
|
28
|
push @buf, $tmp; |
|
352
|
|
|
|
|
|
|
} |
|
353
|
|
|
|
|
|
|
|
|
354
|
5
|
|
|
|
|
17
|
return \@buf; |
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
sub _loop { |
|
358
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
359
|
0
|
|
|
|
|
0
|
my $txt = shift; |
|
360
|
0
|
|
|
|
|
0
|
my $regexp = $self->{_qre}; |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
#carp "loop snip: $txt"; |
|
363
|
|
|
|
|
|
|
|
|
364
|
0
|
0
|
|
|
|
0
|
$self->debug and carp "loop snip regexp: $regexp"; |
|
365
|
|
|
|
|
|
|
|
|
366
|
0
|
|
0
|
|
|
0
|
my $debug = $self->debug || 0; |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# no matches |
|
369
|
0
|
0
|
|
|
|
0
|
return $self->_dumb($txt) unless $txt =~ m/$regexp/; |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
#carp "loop snip: $txt"; |
|
372
|
|
|
|
|
|
|
|
|
373
|
0
|
|
|
|
|
0
|
my $context = $self->context - 1; |
|
374
|
0
|
|
0
|
|
|
0
|
my $occur = $self->occur || 1; |
|
375
|
0
|
|
|
|
|
0
|
my @snips; |
|
376
|
|
|
|
|
|
|
|
|
377
|
0
|
|
|
|
|
0
|
my $notwc = $self->{_wc_regexp}; |
|
378
|
|
|
|
|
|
|
|
|
379
|
0
|
|
|
|
|
0
|
my @words = split( /($notwc)/, $txt ); |
|
380
|
0
|
|
|
|
|
0
|
my $count = -1; |
|
381
|
0
|
|
|
|
|
0
|
my $start_again = $count; |
|
382
|
0
|
|
|
|
|
0
|
my $total = 0; |
|
383
|
0
|
|
|
|
|
0
|
my $first_match = 0; |
|
384
|
|
|
|
|
|
|
|
|
385
|
0
|
|
|
|
|
0
|
WORD: for my $w (@words) { |
|
386
|
|
|
|
|
|
|
|
|
387
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
|
388
|
0
|
0
|
|
|
|
0
|
warn ">>\n" if $count % 2; |
|
389
|
0
|
|
|
|
|
0
|
warn "word: '$w'\n"; |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
0
|
$count++; |
|
393
|
0
|
0
|
|
|
|
0
|
next WORD if $count < $start_again; |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
# the next WORD lets us skip past the last frag we excerpted |
|
396
|
|
|
|
|
|
|
|
|
397
|
0
|
|
|
|
|
0
|
my $last = $count - 1; |
|
398
|
0
|
|
|
|
|
0
|
my $next = $count + 1; |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
#warn '-' x 30 . "\n"; |
|
401
|
0
|
0
|
|
|
|
0
|
if ( $w =~ m/^$regexp$/ ) { |
|
402
|
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
|
404
|
0
|
|
|
|
|
0
|
warn "w: '$w' match: '$1'\n"; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
|
|
407
|
0
|
|
|
|
|
0
|
$first_match = $count; |
|
408
|
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
0
|
my $before = $last - $context; |
|
410
|
0
|
0
|
|
|
|
0
|
$before = 0 if $before < 0; |
|
411
|
0
|
|
|
|
|
0
|
my $after = $next + $context; |
|
412
|
0
|
0
|
|
|
|
0
|
$after = $#words if $after > $#words; |
|
413
|
|
|
|
|
|
|
|
|
414
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
|
415
|
0
|
|
|
|
|
0
|
warn "$before .. $last, $count, $next .. $after\n"; |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
0
|
my @before = @words[ $before .. $last ]; |
|
419
|
0
|
|
|
|
|
0
|
my @after = @words[ $next .. $after ]; |
|
420
|
|
|
|
|
|
|
|
|
421
|
0
|
|
|
|
|
0
|
my $this_snip_matches = grep {m/^$regexp$/i} ( @before, @after ); |
|
|
0
|
|
|
|
|
0
|
|
|
422
|
0
|
0
|
|
|
|
0
|
if ($this_snip_matches) { |
|
423
|
0
|
|
|
|
|
0
|
$after += $this_snip_matches; |
|
424
|
0
|
|
|
|
|
0
|
@after = @words[ $next .. $after ]; |
|
425
|
|
|
|
|
|
|
} |
|
426
|
0
|
|
|
|
|
0
|
$total += $this_snip_matches; |
|
427
|
0
|
|
|
|
|
0
|
$total++; # for current $w |
|
428
|
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
0
|
my $t = join( '', @before, $w, @after ); |
|
430
|
|
|
|
|
|
|
|
|
431
|
0
|
0
|
|
|
|
0
|
$t .= $ellip unless $count == $#words; |
|
432
|
|
|
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
|
434
|
0
|
|
|
|
|
0
|
warn "t: $t\n"; |
|
435
|
0
|
|
|
|
|
0
|
warn "this_snip_matches: $this_snip_matches\n"; |
|
436
|
0
|
|
|
|
|
0
|
warn "total: $total\n"; |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
|
|
439
|
0
|
|
|
|
|
0
|
push( @snips, [ $t, $this_snip_matches + 1 ] ); # +1 for $w |
|
440
|
0
|
|
|
|
|
0
|
$start_again = $after; |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# sort by match density. |
|
446
|
|
|
|
|
|
|
# consistent with HeatMap and lets us find |
|
447
|
|
|
|
|
|
|
# the *best* match, including phrases. |
|
448
|
0
|
|
|
|
|
0
|
@snips = map { $_->[0] } sort { $b->[1] <=> $a->[1] } @snips; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
449
|
|
|
|
|
|
|
|
|
450
|
0
|
0
|
|
|
|
0
|
if ( $debug > 1 ) { |
|
451
|
0
|
|
|
|
|
0
|
carp "snips: " . scalar @snips; |
|
452
|
0
|
|
|
|
|
0
|
carp "words: $count\n"; |
|
453
|
0
|
|
|
|
|
0
|
carp "grandtotal: $total\n"; |
|
454
|
0
|
|
|
|
|
0
|
carp "occur: $occur\n"; |
|
455
|
0
|
|
|
|
|
0
|
carp '-' x 50 . "\n"; |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
|
|
459
|
0
|
|
|
|
|
0
|
$self->count( scalar(@snips) + $self->count ); |
|
460
|
0
|
|
|
|
|
0
|
my $last_snip = $occur - 1; |
|
461
|
0
|
0
|
|
|
|
0
|
if ( $last_snip > $#snips ) { |
|
462
|
0
|
|
|
|
|
0
|
$last_snip = $#snips; |
|
463
|
|
|
|
|
|
|
} |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
#warn dump \@snips; |
|
466
|
0
|
|
|
|
|
0
|
my $snippet = join( '', @snips[ 0 .. $last_snip ] ); |
|
467
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "before no_start_partial: '$snippet'\n"; |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
#_no_start_partial($snippet); |
|
470
|
0
|
0
|
|
|
|
0
|
$snippet = $ellip . $snippet if $first_match; |
|
471
|
|
|
|
|
|
|
|
|
472
|
0
|
|
|
|
|
0
|
return $snippet; |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub _re { |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# get first N matches for each q, then take one of each till we have $occur |
|
478
|
|
|
|
|
|
|
|
|
479
|
1
|
|
|
1
|
|
3
|
my $self = shift; |
|
480
|
1
|
|
|
|
|
2
|
my $text = shift; |
|
481
|
1
|
|
|
|
|
2
|
my @q = @{ $self->query->terms }; |
|
|
1
|
|
|
|
|
5
|
|
|
482
|
1
|
|
|
|
|
3
|
my $occur = $self->occur; |
|
483
|
1
|
|
|
|
|
5
|
my $Nchar = $self->context * $self->word_len; |
|
484
|
1
|
|
|
|
|
2
|
my $total = 0; |
|
485
|
1
|
|
|
|
|
2
|
my $notwc = $self->{_wc_regexp}; |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# get minimum number of snips necessary to meet $occur |
|
488
|
1
|
|
|
|
|
3
|
my $snip_per_q = int( $occur / scalar(@q) ); |
|
489
|
1
|
|
50
|
|
|
3
|
$snip_per_q ||= 1; |
|
490
|
|
|
|
|
|
|
|
|
491
|
1
|
|
|
|
|
20
|
my ( %snips, @snips, %ranges, $snip_starts_with_query ); |
|
492
|
1
|
|
|
|
|
2
|
$snip_starts_with_query = 0; |
|
493
|
|
|
|
|
|
|
|
|
494
|
1
|
|
|
|
|
2
|
Q: for my $q (@q) { |
|
495
|
1
|
|
|
|
|
6
|
$snips{$q} = { t => [], offset => [] }; |
|
496
|
|
|
|
|
|
|
|
|
497
|
1
|
50
|
|
|
|
48
|
$self->debug and warn "$q : $snip_starts_with_query"; |
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# try simple regexp first, then more complex if we don't match |
|
500
|
|
|
|
|
|
|
next Q |
|
501
|
|
|
|
|
|
|
if $self->_re_match( \$text, $self->query->regex_for($q)->plain, |
|
502
|
1
|
50
|
|
|
|
34
|
\$total, $snips{$q}, \%ranges, $Nchar, $snip_per_q, |
|
503
|
|
|
|
|
|
|
\$snip_starts_with_query ); |
|
504
|
|
|
|
|
|
|
|
|
505
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "failed match on plain regexp"; |
|
506
|
|
|
|
|
|
|
|
|
507
|
0
|
|
|
|
|
0
|
pos $text = 0; # do we really need to reset this? |
|
508
|
|
|
|
|
|
|
|
|
509
|
0
|
0
|
|
|
|
0
|
unless ( |
|
510
|
|
|
|
|
|
|
$self->_re_match( |
|
511
|
|
|
|
|
|
|
\$text, $self->query->regex_for($q)->html, |
|
512
|
|
|
|
|
|
|
\$total, $snips{$q}, |
|
513
|
|
|
|
|
|
|
\%ranges, $Nchar, |
|
514
|
|
|
|
|
|
|
$snip_per_q, \$snip_starts_with_query |
|
515
|
|
|
|
|
|
|
) |
|
516
|
|
|
|
|
|
|
) |
|
517
|
|
|
|
|
|
|
{ |
|
518
|
0
|
0
|
|
|
|
0
|
$self->debug and warn "failed match on html regexp"; |
|
519
|
|
|
|
|
|
|
} |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
|
|
523
|
1
|
50
|
|
|
|
3
|
return $self->_dumb($text) unless $total; |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# get all snips into one array in order they appeared in $text |
|
526
|
|
|
|
|
|
|
# should be a max of $snip_per_q in any one $q snip array |
|
527
|
|
|
|
|
|
|
# so we should have at least $occur in total, |
|
528
|
|
|
|
|
|
|
# which we'll splice() if need be. |
|
529
|
|
|
|
|
|
|
|
|
530
|
1
|
|
|
|
|
2
|
my %offsets; |
|
531
|
1
|
|
|
|
|
3
|
for my $q ( keys %snips ) { |
|
532
|
1
|
|
|
|
|
2
|
my @s = @{ $snips{$q}->{t} }; |
|
|
1
|
|
|
|
|
4
|
|
|
533
|
1
|
|
|
|
|
1
|
my @o = @{ $snips{$q}->{offset} }; |
|
|
1
|
|
|
|
|
3
|
|
|
534
|
|
|
|
|
|
|
|
|
535
|
1
|
|
|
|
|
1
|
my $i = 0; |
|
536
|
1
|
|
|
|
|
2
|
for (@s) { |
|
537
|
1
|
|
|
|
|
4
|
$offsets{$_} = $o[$i]; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
} |
|
540
|
1
|
|
|
|
|
4
|
@snips = sort { $offsets{$a} <=> $offsets{$b} } keys %offsets; |
|
|
0
|
|
|
|
|
0
|
|
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# max = $occur |
|
543
|
1
|
|
|
|
|
4
|
@snips = splice @snips, 0, $occur; |
|
544
|
|
|
|
|
|
|
|
|
545
|
1
|
50
|
|
|
|
16
|
$self->debug and warn dump( \@snips ); |
|
546
|
|
|
|
|
|
|
|
|
547
|
1
|
|
|
|
|
8
|
my $snip = join( $ellip, @snips ); |
|
548
|
1
|
50
|
|
|
|
5
|
_no_start_partial($snip) unless $snip_starts_with_query; |
|
549
|
1
|
50
|
|
|
|
23
|
$snip = $ellip . $snip unless $text =~ m/^\Q$snips[0]/i; |
|
550
|
1
|
50
|
|
|
|
31
|
$snip .= $ellip unless $text =~ m/\Q$snips[-1]$/i; |
|
551
|
|
|
|
|
|
|
|
|
552
|
1
|
|
|
|
|
5
|
$self->count( scalar(@snips) + $self->count ); |
|
553
|
|
|
|
|
|
|
|
|
554
|
1
|
|
|
|
|
21
|
return $snip; |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
} |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
sub _re_match { |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
# the .{0,$Nchar} regexp slows things WAY down. so just match, |
|
561
|
|
|
|
|
|
|
# then use pos() to get chars before and after. |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
# if escape = 0 and if prefix or suffix contains a < or >, |
|
564
|
|
|
|
|
|
|
# try to include entire tagset. |
|
565
|
|
|
|
|
|
|
|
|
566
|
1
|
|
|
1
|
|
4
|
my ( $self, $text, $re, $total, $snips, $ranges, $Nchar, $max_snips, |
|
567
|
|
|
|
|
|
|
$snip_starts_with_query ) |
|
568
|
|
|
|
|
|
|
= @_; |
|
569
|
|
|
|
|
|
|
|
|
570
|
1
|
|
|
|
|
4
|
my $t_len = length $$text; |
|
571
|
|
|
|
|
|
|
|
|
572
|
1
|
|
|
|
|
2
|
my $cnt = 0; |
|
573
|
|
|
|
|
|
|
|
|
574
|
1
|
50
|
|
|
|
16
|
if ( $self->debug ) { |
|
575
|
0
|
|
|
|
|
0
|
warn "re_match regexp: >$re<\n"; |
|
576
|
0
|
|
|
|
|
0
|
warn "max_snips: $max_snips\n"; |
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
|
|
579
|
1
|
|
|
|
|
72
|
RE: while ( $$text =~ m/$re/g ) { |
|
580
|
|
|
|
|
|
|
|
|
581
|
1
|
|
|
|
|
4
|
my $pos = pos $$text; |
|
582
|
1
|
|
|
|
|
2
|
my $before_match = $1; |
|
583
|
1
|
|
|
|
|
3
|
my $match = $2; |
|
584
|
1
|
|
|
|
|
2
|
my $after_match = $3; |
|
585
|
1
|
|
|
|
|
3
|
$cnt++; |
|
586
|
1
|
|
|
|
|
1
|
my $len = length $match; |
|
587
|
1
|
|
|
|
|
3
|
my $blen = length $before_match; |
|
588
|
1
|
50
|
|
|
|
14
|
if ( $self->debug ) { |
|
589
|
0
|
|
|
|
|
0
|
warn "re: '$re'\n"; |
|
590
|
0
|
|
|
|
|
0
|
warn "\$1 = '$before_match' = ", ord($before_match), "\n"; |
|
591
|
0
|
|
|
|
|
0
|
warn "\$2 = '$match'\n"; |
|
592
|
0
|
|
|
|
|
0
|
warn "\$3 = '$after_match' = ", ord($after_match), "\n"; |
|
593
|
0
|
|
|
|
|
0
|
warn "pos = $pos\n"; |
|
594
|
0
|
|
|
|
|
0
|
warn "len = $len\n"; |
|
595
|
0
|
|
|
|
|
0
|
warn "blen= $blen\n"; |
|
596
|
|
|
|
|
|
|
} |
|
597
|
|
|
|
|
|
|
|
|
598
|
1
|
0
|
33
|
|
|
18
|
if ( $self->debug && exists $ranges->{$pos} ) { |
|
599
|
0
|
|
|
|
|
0
|
warn "already found $pos\n"; |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
|
|
602
|
1
|
50
|
|
|
|
7
|
next RE if exists $ranges->{$pos}; |
|
603
|
|
|
|
|
|
|
|
|
604
|
1
|
|
50
|
|
|
4
|
my $start_match = $pos - $len - ( $blen || 1 ); |
|
605
|
1
|
50
|
|
|
|
2
|
$start_match = 0 if $start_match < 0; |
|
606
|
|
|
|
|
|
|
|
|
607
|
1
|
50
|
|
|
|
3
|
$$snip_starts_with_query = 1 if $start_match == 0; |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# sanity |
|
610
|
1
|
50
|
|
|
|
25
|
$self->debug |
|
611
|
|
|
|
|
|
|
and warn "match should be [$start_match $len]: '", |
|
612
|
|
|
|
|
|
|
substr( $$text, $start_match, $len ), "'\n"; |
|
613
|
|
|
|
|
|
|
|
|
614
|
1
|
50
|
|
|
|
8
|
my $prefix_start |
|
615
|
|
|
|
|
|
|
= $start_match < $Nchar |
|
616
|
|
|
|
|
|
|
? 0 |
|
617
|
|
|
|
|
|
|
: $start_match - $Nchar; |
|
618
|
|
|
|
|
|
|
|
|
619
|
1
|
|
|
|
|
1
|
my $prefix_len = $start_match - $prefix_start; |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
#$prefix_len++; $prefix_len++; |
|
622
|
|
|
|
|
|
|
|
|
623
|
1
|
|
|
|
|
2
|
my $suffix_start = $pos - length($after_match); |
|
624
|
1
|
|
|
|
|
2
|
my $suffix_len = $Nchar; |
|
625
|
1
|
|
|
|
|
1
|
my $end = $suffix_start + $suffix_len; |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# if $end extends beyond, that's ok, substr compensates |
|
628
|
|
|
|
|
|
|
|
|
629
|
1
|
|
|
|
|
108
|
$ranges->{$_}++ for ( $prefix_start .. $end ); |
|
630
|
1
|
|
|
|
|
3
|
my $prefix = substr( $$text, $prefix_start, $prefix_len ); |
|
631
|
1
|
|
|
|
|
3
|
my $suffix = substr( $$text, $suffix_start, $suffix_len ); |
|
632
|
|
|
|
|
|
|
|
|
633
|
1
|
50
|
|
|
|
16
|
if ( $self->debug ) { |
|
634
|
0
|
|
|
|
|
0
|
warn "prefix_start = $prefix_start\n"; |
|
635
|
0
|
|
|
|
|
0
|
warn "prefix_len = $prefix_len\n"; |
|
636
|
0
|
|
|
|
|
0
|
warn "start_match = $start_match\n"; |
|
637
|
0
|
|
|
|
|
0
|
warn "len = $len\n"; |
|
638
|
0
|
|
|
|
|
0
|
warn "pos = $pos\n"; |
|
639
|
0
|
|
|
|
|
0
|
warn "char = $Nchar\n"; |
|
640
|
0
|
|
|
|
|
0
|
warn "suffix_start = $suffix_start\n"; |
|
641
|
0
|
|
|
|
|
0
|
warn "suffix_len = $suffix_len\n"; |
|
642
|
0
|
|
|
|
|
0
|
warn "end = $end\n"; |
|
643
|
0
|
|
|
|
|
0
|
warn "prefix: '$prefix'\n"; |
|
644
|
0
|
|
|
|
|
0
|
warn "match: '$match'\n"; |
|
645
|
0
|
|
|
|
|
0
|
warn "suffix: '$suffix'\n"; |
|
646
|
|
|
|
|
|
|
} |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# try and get whole words if we split one up |
|
649
|
|
|
|
|
|
|
# _no_*_partial does this more rudely |
|
650
|
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# might be faster to do m/(\S)*$prefix/i |
|
652
|
|
|
|
|
|
|
# but we couldn't guarantee position accuracy |
|
653
|
|
|
|
|
|
|
# e.g. if $prefix matched more than once in $$text, |
|
654
|
|
|
|
|
|
|
# we might pull the wrong \S* |
|
655
|
|
|
|
|
|
|
|
|
656
|
1
|
50
|
33
|
|
|
8
|
unless ( $prefix =~ m/^\s/ |
|
657
|
|
|
|
|
|
|
or substr( $$text, $prefix_start - 1, 1 ) =~ m/(\s)/ ) |
|
658
|
|
|
|
|
|
|
{ |
|
659
|
0
|
|
0
|
|
|
0
|
while ( --$prefix_start >= 0 |
|
660
|
|
|
|
|
|
|
and substr( $$text, $prefix_start, 1 ) =~ m/(\S)/ ) |
|
661
|
|
|
|
|
|
|
{ |
|
662
|
0
|
|
|
|
|
0
|
my $onemorechar = $1; |
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
#warn "adding $onemorechar to prefix\n"; |
|
665
|
0
|
|
|
|
|
0
|
$prefix = $onemorechar . $prefix; |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
#last if $prefix_start <= 0 or $onemorechar !~ /\S/; |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
} |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# do same for suffix |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# We get error here under -w |
|
674
|
|
|
|
|
|
|
# about substr outside of string -- is $end undefined sometimes?? |
|
675
|
|
|
|
|
|
|
|
|
676
|
1
|
50
|
33
|
|
|
9
|
unless ( $suffix =~ m/\s$/ or substr( $$text, $end, 1 ) =~ m/(\s)/ ) { |
|
677
|
1
|
|
66
|
|
|
8
|
while ( $end <= $t_len |
|
678
|
|
|
|
|
|
|
and substr( $$text, $end++, 1 ) =~ m/(\S)/ ) |
|
679
|
|
|
|
|
|
|
{ |
|
680
|
|
|
|
|
|
|
|
|
681
|
3
|
|
|
|
|
4
|
my $onemore = $1; |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
#warn "adding $onemore to suffix\n"; |
|
684
|
|
|
|
|
|
|
#warn "before '$suffix'\n"; |
|
685
|
3
|
|
|
|
|
27
|
$suffix .= $onemore; |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
#warn "after '$suffix'\n"; |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
} |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# will likely fail to include one half of tagset if other is complete |
|
692
|
1
|
50
|
|
|
|
4
|
unless ( $self->escape ) { |
|
693
|
1
|
|
|
|
|
2
|
my $sanity = 0; |
|
694
|
1
|
|
|
|
|
3
|
my @l = ( $prefix =~ /(<)/g ); |
|
695
|
1
|
|
|
|
|
3
|
my @r = ( $prefix =~ /(>)/g ); |
|
696
|
1
|
|
|
|
|
4
|
while ( scalar @l != scalar @r ) { |
|
697
|
|
|
|
|
|
|
|
|
698
|
0
|
|
|
|
|
0
|
@l = ( $prefix =~ /(<)/g ); |
|
699
|
0
|
|
|
|
|
0
|
@r = ( $prefix =~ /(>)/g ); |
|
700
|
|
|
|
|
|
|
last |
|
701
|
|
|
|
|
|
|
if scalar @l |
|
702
|
0
|
0
|
|
|
|
0
|
== scalar @r; # don't take any more than we need to |
|
703
|
|
|
|
|
|
|
|
|
704
|
0
|
|
|
|
|
0
|
my $onemorechar = substr( $$text, $prefix_start--, 1 ); |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
#warn "tagfix: adding $onemorechar to prefix\n"; |
|
707
|
0
|
|
|
|
|
0
|
$prefix = $onemorechar . $prefix; |
|
708
|
0
|
0
|
|
|
|
0
|
last if $prefix_start <= 0; |
|
709
|
0
|
0
|
|
|
|
0
|
last if $sanity++ > 100; |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
} |
|
712
|
|
|
|
|
|
|
|
|
713
|
1
|
|
|
|
|
2
|
$sanity = 0; |
|
714
|
1
|
|
33
|
|
|
4
|
while ( $suffix =~ /<(\w+)/ && $suffix !~ /<\/$1>/ ) { |
|
715
|
|
|
|
|
|
|
|
|
716
|
0
|
|
|
|
|
0
|
my $onemorechar = substr( $$text, $end, 1 ); |
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
#warn "tagfix: adding $onemorechar to suffix\n"; |
|
719
|
0
|
|
|
|
|
0
|
$suffix .= $onemorechar; |
|
720
|
0
|
0
|
|
|
|
0
|
last if ++$end > $t_len; |
|
721
|
0
|
0
|
|
|
|
0
|
last if $sanity++ > 100; |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
} |
|
724
|
|
|
|
|
|
|
} |
|
725
|
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# warn "prefix: '$prefix'\n"; |
|
727
|
|
|
|
|
|
|
# warn "match: '$match'\n"; |
|
728
|
|
|
|
|
|
|
# warn "suffix: '$suffix'\n"; |
|
729
|
|
|
|
|
|
|
|
|
730
|
1
|
|
|
|
|
5
|
my $context = join( '', $prefix, $match, $suffix ); |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
#warn "context is '$context'\n"; |
|
733
|
|
|
|
|
|
|
|
|
734
|
1
|
|
|
|
|
2
|
push( @{ $snips->{t} }, $context ); |
|
|
1
|
|
|
|
|
3
|
|
|
735
|
1
|
|
|
|
|
2
|
push( @{ $snips->{offset} }, $prefix_start ); |
|
|
1
|
|
|
|
|
3
|
|
|
736
|
|
|
|
|
|
|
|
|
737
|
1
|
|
|
|
|
2
|
$$total++; |
|
738
|
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
# warn '-' x 40, "\n"; |
|
740
|
|
|
|
|
|
|
|
|
741
|
1
|
50
|
|
|
|
4
|
last if $cnt >= $max_snips; |
|
742
|
|
|
|
|
|
|
} |
|
743
|
|
|
|
|
|
|
|
|
744
|
1
|
|
|
|
|
15
|
return $cnt; |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
sub _dumb { |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# just grap the first X chars and return |
|
750
|
|
|
|
|
|
|
|
|
751
|
3
|
|
|
3
|
|
7
|
my $self = shift; |
|
752
|
3
|
100
|
|
|
|
131
|
return '' unless $self->show; |
|
753
|
|
|
|
|
|
|
|
|
754
|
1
|
|
|
|
|
3
|
my $txt = shift; |
|
755
|
1
|
|
|
|
|
4
|
my $max = $self->max_chars; |
|
756
|
1
|
|
|
|
|
5
|
$self->type_used('dumb'); |
|
757
|
|
|
|
|
|
|
|
|
758
|
1
|
|
|
|
|
3
|
my $show = substr( $txt, 0, $max ); |
|
759
|
1
|
|
|
|
|
5
|
_no_end_partial($show); |
|
760
|
1
|
|
|
|
|
3
|
$show .= $ellip; |
|
761
|
|
|
|
|
|
|
|
|
762
|
1
|
|
|
|
|
5
|
$self->count( 1 + $self->count ); |
|
763
|
|
|
|
|
|
|
|
|
764
|
1
|
|
|
|
|
4
|
return $show; |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
sub _no_start_partial { |
|
769
|
1
|
|
|
1
|
|
4
|
$_[0] =~ s/^\S+\s+//gs; |
|
770
|
|
|
|
|
|
|
} |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
sub _no_end_partial { |
|
773
|
1
|
|
|
1
|
|
4
|
$_[0] =~ s/\s+\S+$//gs; |
|
774
|
|
|
|
|
|
|
} |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
sub _escape { |
|
777
|
31
|
50
|
|
31
|
|
100
|
if ( $_[0]->escape ) { |
|
778
|
0
|
|
|
|
|
0
|
return Search::Tools::XML->escape( $_[1] ); |
|
779
|
|
|
|
|
|
|
} |
|
780
|
|
|
|
|
|
|
else { |
|
781
|
31
|
|
|
|
|
80
|
return $_[1]; |
|
782
|
|
|
|
|
|
|
} |
|
783
|
|
|
|
|
|
|
} |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
1; |
|
786
|
|
|
|
|
|
|
__END__ |