line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::TokenListUtils; |
2
|
32
|
|
|
32
|
|
13319
|
use Moo::Role; |
|
32
|
|
|
|
|
37
|
|
|
32
|
|
|
|
|
173
|
|
3
|
32
|
|
|
32
|
|
6704
|
use Carp; |
|
32
|
|
|
|
|
41
|
|
|
32
|
|
|
|
|
16561
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = '1.004'; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Search::Tools::TokenListUtils - mixin methods for TokenList and TokenListPP |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my $tokens = $tokenizer->tokenize( $string ); |
14
|
|
|
|
|
|
|
if ( $tokens->str eq $string) { |
15
|
|
|
|
|
|
|
print "string is same, before and after tokenize()\n"; |
16
|
|
|
|
|
|
|
} |
17
|
|
|
|
|
|
|
else { |
18
|
|
|
|
|
|
|
warn "I'm filing a bug report against Search::Tools right away!\n"; |
19
|
|
|
|
|
|
|
} |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my ($start_pos, $end_pos) = $tokens->get_window( 5, 20 ); |
22
|
|
|
|
|
|
|
# $start_pos probably == 0 |
23
|
|
|
|
|
|
|
# $end_pos probably == 25 |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $slice = $tokens->get_window_pos( 5, 20 ); |
26
|
|
|
|
|
|
|
for my $token (@$slice) { |
27
|
|
|
|
|
|
|
print "token = $token\n"; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 DESCRIPTION |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
Search::Tools::TokenListUtils contains pure-Perl methods inhertited |
33
|
|
|
|
|
|
|
by both Search::Tools::TokenList and Search::Tools::TokenListPP. |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 METHODS |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head2 str |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Returns a serialized version of the TokenList. If you haven't |
40
|
|
|
|
|
|
|
altered the TokenList since you got it from tokenize(), |
41
|
|
|
|
|
|
|
then str() returns a scalar string identical to (but not the same as) |
42
|
|
|
|
|
|
|
the string you passed to tokenize(). |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Both Search::Tools::TokenList and TokenListPP are overloaded |
45
|
|
|
|
|
|
|
to stringify to the str() value. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=cut |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub str { |
50
|
12
|
|
|
12
|
1
|
1190
|
my $self = shift; |
51
|
12
|
|
|
|
|
22
|
my $joiner = shift(@_); |
52
|
12
|
50
|
|
|
|
38
|
if ( !defined $joiner ) { |
53
|
12
|
|
|
|
|
17
|
$joiner = ''; |
54
|
|
|
|
|
|
|
} |
55
|
12
|
|
|
|
|
19
|
return join( $joiner, map {"$_"} @{ $self->as_array } ); |
|
376
|
|
|
|
|
430
|
|
|
12
|
|
|
|
|
32
|
|
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head2 get_window( I [, I, I] ) |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Returns array with two values: I and I positions |
61
|
|
|
|
|
|
|
for the array of length I on either side of I. |
62
|
|
|
|
|
|
|
Like taking a slice of the TokenList. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Note that I is the number of B not B. |
65
|
|
|
|
|
|
|
So if you're looking for the number of "words", think about |
66
|
|
|
|
|
|
|
I*2. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Note too that I is the number of B on B |
69
|
|
|
|
|
|
|
side of I. So the entire window width (length of the returned |
70
|
|
|
|
|
|
|
slice) is I*2 +/-1. The window is guaranteed to be bounded |
71
|
|
|
|
|
|
|
by B. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
If I is true, the window is shifted to try and match |
74
|
|
|
|
|
|
|
the first token prior to I that returns true for is_sentence_start(). |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub get_window { |
79
|
77
|
|
|
77
|
1
|
70
|
my $self = shift; |
80
|
77
|
|
|
|
|
61
|
my $pos = shift; |
81
|
77
|
50
|
|
|
|
125
|
if ( !defined $pos ) { |
82
|
0
|
|
|
|
|
0
|
croak "pos required"; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
77
|
|
50
|
|
|
123
|
my $size = int(shift) || 20; |
86
|
77
|
|
50
|
|
|
184
|
my $as_sentence = shift || 0; |
87
|
77
|
|
|
|
|
128
|
my $max_index = $self->len - 1; |
88
|
|
|
|
|
|
|
|
89
|
77
|
50
|
33
|
|
|
238
|
if ( $pos > $max_index or $pos < 0 ) { |
90
|
0
|
|
|
|
|
0
|
croak "illegal pos value: no such index in TokenList"; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
#warn "window size $size for pos $pos"; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# get the $size tokens on either side of $tok |
96
|
77
|
|
|
|
|
58
|
my ( $start, $end ); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# is token too close to the top of the stack? |
99
|
77
|
100
|
|
|
|
107
|
if ( $pos > $size ) { |
100
|
62
|
|
|
|
|
53
|
$start = $pos - $size; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# is token too close to the bottom of the stack? |
104
|
77
|
100
|
|
|
|
107
|
if ( $pos < ( $max_index - $size ) ) { |
105
|
76
|
|
|
|
|
70
|
$end = $pos + $size; |
106
|
|
|
|
|
|
|
} |
107
|
77
|
|
100
|
|
|
117
|
$start ||= 0; |
108
|
77
|
|
66
|
|
|
103
|
$end ||= $max_index; |
109
|
|
|
|
|
|
|
|
110
|
77
|
50
|
|
|
|
97
|
if ($as_sentence) { |
111
|
0
|
|
|
|
|
0
|
my $sentence_starts = $self->get_sentence_starts; |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# default to what we have. |
114
|
0
|
|
|
|
|
0
|
my $start_for_pos = $start; |
115
|
0
|
|
|
|
|
0
|
my $i = 0; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#warn "looking for sentence_start for start = $start end = $end\n"; |
118
|
0
|
|
|
|
|
0
|
for (@$sentence_starts) { |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
#warn " $_ [$i]\n"; |
121
|
0
|
0
|
|
|
|
0
|
if ( $_ >= $pos ) { |
122
|
0
|
|
|
|
|
0
|
$start_for_pos = $sentence_starts->[$i]; |
123
|
0
|
|
|
|
|
0
|
last; |
124
|
|
|
|
|
|
|
} |
125
|
0
|
|
|
|
|
0
|
$i++; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
#warn "found $start_for_pos (start = $start end = $end)\n"; |
129
|
0
|
0
|
|
|
|
0
|
if ( $start_for_pos != $start ) { |
130
|
0
|
0
|
|
|
|
0
|
if ( $start_for_pos < $start ) { |
131
|
0
|
|
|
|
|
0
|
$end -= ( $start - $start_for_pos ); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
else { |
134
|
0
|
|
|
|
|
0
|
$end += ( $start_for_pos - $start ); |
135
|
|
|
|
|
|
|
} |
136
|
0
|
|
|
|
|
0
|
$start = $start_for_pos; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
#warn "now $start_for_pos (start = $start end = $end)\n"; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
else { |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# make sure window starts and ends with is_match |
144
|
77
|
|
|
|
|
216
|
while ( !$self->get_token($start)->is_match ) { |
145
|
31
|
|
|
|
|
65
|
$start++; |
146
|
|
|
|
|
|
|
} |
147
|
77
|
|
|
|
|
193
|
while ( !$self->get_token($end)->is_match ) { |
148
|
42
|
|
|
|
|
87
|
$end--; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
#warn "return $start .. $end"; |
153
|
|
|
|
|
|
|
#warn "$size ~~ " . ( $end - $start ); |
154
|
|
|
|
|
|
|
|
155
|
77
|
|
|
|
|
155
|
return ( $start, $end ); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head2 get_window_tokens( I [, I] ) |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Like get_window() but returns an array ref of a slice |
161
|
|
|
|
|
|
|
of the TokenList containing Tokens. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=cut |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub get_window_tokens { |
166
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
167
|
0
|
|
|
|
|
0
|
my ( $start, $end ) = $self->get_window(@_); |
168
|
0
|
|
|
|
|
0
|
my @slice = (); |
169
|
0
|
|
|
|
|
0
|
for ( $start .. $end ) { |
170
|
0
|
|
|
|
|
0
|
push( @slice, $self->get_token($_) ); |
171
|
|
|
|
|
|
|
} |
172
|
0
|
|
|
|
|
0
|
return \@slice; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head2 as_sentences([I]) |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Returns a reference to an array of arrays, |
178
|
|
|
|
|
|
|
where each child array is a "sentence" worth of Token objects. |
179
|
|
|
|
|
|
|
You can stringify each sentence array like: |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
my $sentences = $tokenlist->as_sentences; |
182
|
|
|
|
|
|
|
for my $s (@$sentences) { |
183
|
|
|
|
|
|
|
printf("sentence: %s\n", join("", map {"$_"} @$s)); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
If you pass a single true value to as_sentences(), |
187
|
|
|
|
|
|
|
then the array returned will consist of plain scalar strings |
188
|
|
|
|
|
|
|
with whitespace normalized. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=cut |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub as_sentences { |
193
|
2
|
|
|
2
|
1
|
375
|
my $self = shift; |
194
|
2
|
|
100
|
|
|
11
|
my $stringed = shift || 0; |
195
|
2
|
|
|
|
|
3
|
my @sents; |
196
|
|
|
|
|
|
|
my @s; |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# use array method since we do not know the iterator position |
199
|
2
|
|
|
|
|
3
|
for my $t ( @{ $self->as_array } ) { |
|
2
|
|
|
|
|
9
|
|
200
|
144
|
100
|
|
|
|
255
|
if ( $t->is_sentence_start ) { |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# if has any, add anonymous copy to master |
203
|
8
|
100
|
|
|
|
15
|
if (@s) { |
204
|
6
|
|
|
|
|
17
|
push @sents, [@s]; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# reset |
208
|
8
|
|
|
|
|
16
|
@s = (); |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# add |
212
|
144
|
|
|
|
|
138
|
push @s, $t; |
213
|
|
|
|
|
|
|
} |
214
|
2
|
50
|
|
|
|
7
|
if (@s) { |
215
|
2
|
|
|
|
|
7
|
push @sents, [@s]; |
216
|
|
|
|
|
|
|
} |
217
|
2
|
100
|
|
|
|
8
|
if ($stringed) { |
218
|
1
|
|
|
|
|
2
|
my @stringed; |
219
|
1
|
|
|
|
|
3
|
for my $s (@sents) { |
220
|
4
|
|
|
|
|
7
|
my $str = join( "", map {"$_"} @$s ); |
|
72
|
|
|
|
|
136
|
|
221
|
4
|
|
|
|
|
28
|
$str =~ s/\s\s+/\ /g; |
222
|
4
|
|
|
|
|
24
|
$str =~ s/\s+$//; |
223
|
4
|
|
|
|
|
11
|
push @stringed, $str; |
224
|
|
|
|
|
|
|
} |
225
|
1
|
|
|
|
|
11
|
return \@stringed; |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
1
|
|
|
|
|
5
|
return \@sents; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
1; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
__END__ |