File Coverage

blib/lib/Search/Tools/TokenListUtils.pm

Criterion	Covered	Total	%
statement	55	76	72.3
branch	15	26	57.6
condition	9	14	64.2
subroutine	5	6	83.3
pod	4	4	100.0
total	88	126	69.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::TokenListUtils;
2	32			32		13319	use Moo::Role;
	32					37
	32					173
3	32			32		6704	use Carp;
	32					41
	32					16561
4
5							our $VERSION = '1.004';
6
7							=head1 NAME
8
9							Search::Tools::TokenListUtils - mixin methods for TokenList and TokenListPP
10
11							=head1 SYNOPSIS
12
13							my $tokens = $tokenizer->tokenize( $string );
14							if ( $tokens->str eq $string) {
15							print "string is same, before and after tokenize()\n";
16							}
17							else {
18							warn "I'm filing a bug report against Search::Tools right away!\n";
19							}
20
21							my ($start_pos, $end_pos) = $tokens->get_window( 5, 20 );
22							# $start_pos probably == 0
23							# $end_pos probably == 25
24
25							my $slice = $tokens->get_window_pos( 5, 20 );
26							for my $token (@$slice) {
27							print "token = $token\n";
28							}
29
30							=head1 DESCRIPTION
31
32							Search::Tools::TokenListUtils contains pure-Perl methods inhertited
33							by both Search::Tools::TokenList and Search::Tools::TokenListPP.
34
35							=head1 METHODS
36
37							=head2 str
38
39							Returns a serialized version of the TokenList. If you haven't
40							altered the TokenList since you got it from tokenize(),
41							then str() returns a scalar string identical to (but not the same as)
42							the string you passed to tokenize().
43
44							Both Search::Tools::TokenList and TokenListPP are overloaded
45							to stringify to the str() value.
46
47							=cut
48
49							sub str {
50	12			12	1	1190	my $self = shift;
51	12					22	my $joiner = shift(@_);
52	12	50				38	if ( !defined $joiner ) {
53	12					17	$joiner = '';
54							}
55	12					19	return join( $joiner, map {"$_"} @{ $self->as_array } );
	376					430
	12					32
56							}
57
58							=head2 get_window( I [, I, I] )
59
60							Returns array with two values: I and I positions
61							for the array of length I on either side of I.
62							Like taking a slice of the TokenList.
63
64							Note that I is the number of B not B.
65							So if you're looking for the number of "words", think about
66							I*2.
67
68							Note too that I is the number of B on B
69							side of I. So the entire window width (length of the returned
70							slice) is I*2 +/-1. The window is guaranteed to be bounded
71							by B.
72
73							If I is true, the window is shifted to try and match
74							the first token prior to I that returns true for is_sentence_start().
75
76							=cut
77
78							sub get_window {
79	77			77	1	70	my $self = shift;
80	77					61	my $pos = shift;
81	77	50				125	if ( !defined $pos ) {
82	0					0	croak "pos required";
83							}
84
85	77		50			123	my $size = int(shift) \|\| 20;
86	77		50			184	my $as_sentence = shift \|\| 0;
87	77					128	my $max_index = $self->len - 1;
88
89	77	50	33			238	if ( $pos > $max_index or $pos < 0 ) {
90	0					0	croak "illegal pos value: no such index in TokenList";
91							}
92
93							#warn "window size $size for pos $pos";
94
95							# get the $size tokens on either side of $tok
96	77					58	my ( $start, $end );
97
98							# is token too close to the top of the stack?
99	77	100				107	if ( $pos > $size ) {
100	62					53	$start = $pos - $size;
101							}
102
103							# is token too close to the bottom of the stack?
104	77	100				107	if ( $pos < ( $max_index - $size ) ) {
105	76					70	$end = $pos + $size;
106							}
107	77		100			117	$start \|\|= 0;
108	77		66			103	$end \|\|= $max_index;
109
110	77	50				97	if ($as_sentence) {
111	0					0	my $sentence_starts = $self->get_sentence_starts;
112
113							# default to what we have.
114	0					0	my $start_for_pos = $start;
115	0					0	my $i = 0;
116
117							#warn "looking for sentence_start for start = $start end = $end\n";
118	0					0	for (@$sentence_starts) {
119
120							#warn " $_ [$i]\n";
121	0	0				0	if ( $_ >= $pos ) {
122	0					0	$start_for_pos = $sentence_starts->[$i];
123	0					0	last;
124							}
125	0					0	$i++;
126							}
127
128							#warn "found $start_for_pos (start = $start end = $end)\n";
129	0	0				0	if ( $start_for_pos != $start ) {
130	0	0				0	if ( $start_for_pos < $start ) {
131	0					0	$end -= ( $start - $start_for_pos );
132							}
133							else {
134	0					0	$end += ( $start_for_pos - $start );
135							}
136	0					0	$start = $start_for_pos;
137							}
138
139							#warn "now $start_for_pos (start = $start end = $end)\n";
140							}
141							else {
142
143							# make sure window starts and ends with is_match
144	77					216	while ( !$self->get_token($start)->is_match ) {
145	31					65	$start++;
146							}
147	77					193	while ( !$self->get_token($end)->is_match ) {
148	42					87	$end--;
149							}
150							}
151
152							#warn "return $start .. $end";
153							#warn "$size ~~ " . ( $end - $start );
154
155	77					155	return ( $start, $end );
156							}
157
158							=head2 get_window_tokens( I [, I] )
159
160							Like get_window() but returns an array ref of a slice
161							of the TokenList containing Tokens.
162
163							=cut
164
165							sub get_window_tokens {
166	0			0	1	0	my $self = shift;
167	0					0	my ( $start, $end ) = $self->get_window(@_);
168	0					0	my @slice = ();
169	0					0	for ( $start .. $end ) {
170	0					0	push( @slice, $self->get_token($_) );
171							}
172	0					0	return \@slice;
173							}
174
175							=head2 as_sentences([I])
176
177							Returns a reference to an array of arrays,
178							where each child array is a "sentence" worth of Token objects.
179							You can stringify each sentence array like:
180
181							my $sentences = $tokenlist->as_sentences;
182							for my $s (@$sentences) {
183							printf("sentence: %s\n", join("", map {"$_"} @$s));
184							}
185
186							If you pass a single true value to as_sentences(),
187							then the array returned will consist of plain scalar strings
188							with whitespace normalized.
189
190							=cut
191
192							sub as_sentences {
193	2			2	1	375	my $self = shift;
194	2		100			11	my $stringed = shift \|\| 0;
195	2					3	my @sents;
196							my @s;
197
198							# use array method since we do not know the iterator position
199	2					3	for my $t ( @{ $self->as_array } ) {
	2					9
200	144	100				255	if ( $t->is_sentence_start ) {
201
202							# if has any, add anonymous copy to master
203	8	100				15	if (@s) {
204	6					17	push @sents, [@s];
205							}
206
207							# reset
208	8					16	@s = ();
209							}
210
211							# add
212	144					138	push @s, $t;
213							}
214	2	50				7	if (@s) {
215	2					7	push @sents, [@s];
216							}
217	2	100				8	if ($stringed) {
218	1					2	my @stringed;
219	1					3	for my $s (@sents) {
220	4					7	my $str = join( "", map {"$_"} @$s );
	72					136
221	4					28	$str =~ s/\s\s+/\ /g;
222	4					24	$str =~ s/\s+$//;
223	4					11	push @stringed, $str;
224							}
225	1					11	return \@stringed;
226							}
227
228	1					5	return \@sents;
229							}
230
231							1;
232
233							__END__