| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Search::Tools::TokenListUtils; | 
| 2 | 32 |  |  | 32 |  | 13319 | use Moo::Role; | 
|  | 32 |  |  |  |  | 37 |  | 
|  | 32 |  |  |  |  | 173 |  | 
| 3 | 32 |  |  | 32 |  | 6704 | use Carp; | 
|  | 32 |  |  |  |  | 41 |  | 
|  | 32 |  |  |  |  | 16561 |  | 
| 4 |  |  |  |  |  |  |  | 
| 5 |  |  |  |  |  |  | our $VERSION = '1.004'; | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | =head1 NAME | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | Search::Tools::TokenListUtils - mixin methods for TokenList and TokenListPP | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | my $tokens = $tokenizer->tokenize( $string ); | 
| 14 |  |  |  |  |  |  | if ( $tokens->str eq $string) { | 
| 15 |  |  |  |  |  |  | print "string is same, before and after tokenize()\n"; | 
| 16 |  |  |  |  |  |  | } | 
| 17 |  |  |  |  |  |  | else { | 
| 18 |  |  |  |  |  |  | warn "I'm filing a bug report against Search::Tools right away!\n"; | 
| 19 |  |  |  |  |  |  | } | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | my ($start_pos, $end_pos) = $tokens->get_window( 5, 20 ); | 
| 22 |  |  |  |  |  |  | # $start_pos probably == 0 | 
| 23 |  |  |  |  |  |  | # $end_pos probably   == 25 | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | my $slice = $tokens->get_window_pos( 5, 20 ); | 
| 26 |  |  |  |  |  |  | for my $token (@$slice) { | 
| 27 |  |  |  |  |  |  | print "token = $token\n"; | 
| 28 |  |  |  |  |  |  | } | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | Search::Tools::TokenListUtils contains pure-Perl methods inhertited | 
| 33 |  |  |  |  |  |  | by both Search::Tools::TokenList and Search::Tools::TokenListPP. | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | =head1 METHODS | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =head2 str | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | Returns a serialized version of the TokenList. If you haven't | 
| 40 |  |  |  |  |  |  | altered the TokenList since you got it from tokenize(), | 
| 41 |  |  |  |  |  |  | then str() returns a scalar string identical to (but not the same as) | 
| 42 |  |  |  |  |  |  | the string you passed to tokenize(). | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | Both Search::Tools::TokenList and TokenListPP are overloaded | 
| 45 |  |  |  |  |  |  | to stringify to the str() value. | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | =cut | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | sub str { | 
| 50 | 12 |  |  | 12 | 1 | 1190 | my $self   = shift; | 
| 51 | 12 |  |  |  |  | 22 | my $joiner = shift(@_); | 
| 52 | 12 | 50 |  |  |  | 38 | if ( !defined $joiner ) { | 
| 53 | 12 |  |  |  |  | 17 | $joiner = ''; | 
| 54 |  |  |  |  |  |  | } | 
| 55 | 12 |  |  |  |  | 19 | return join( $joiner, map {"$_"} @{ $self->as_array } ); | 
|  | 376 |  |  |  |  | 430 |  | 
|  | 12 |  |  |  |  | 32 |  | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | =head2 get_window( I [, I, I] ) | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | Returns array with two values: I and I positions | 
| 61 |  |  |  |  |  |  | for the array of length I on either side of I. | 
| 62 |  |  |  |  |  |  | Like taking a slice of the TokenList. | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | Note that I is the number of B not B. | 
| 65 |  |  |  |  |  |  | So if you're looking for the number of "words", think about | 
| 66 |  |  |  |  |  |  | I*2. | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | Note too that I is the number of B on B | 
| 69 |  |  |  |  |  |  | side of I. So the entire window width (length of the returned | 
| 70 |  |  |  |  |  |  | slice) is I*2 +/-1. The window is guaranteed to be bounded | 
| 71 |  |  |  |  |  |  | by B. | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | If I is true, the window is shifted to try and match | 
| 74 |  |  |  |  |  |  | the first token prior to I that returns true for is_sentence_start(). | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | =cut | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | sub get_window { | 
| 79 | 77 |  |  | 77 | 1 | 70 | my $self = shift; | 
| 80 | 77 |  |  |  |  | 61 | my $pos  = shift; | 
| 81 | 77 | 50 |  |  |  | 125 | if ( !defined $pos ) { | 
| 82 | 0 |  |  |  |  | 0 | croak "pos required"; | 
| 83 |  |  |  |  |  |  | } | 
| 84 |  |  |  |  |  |  |  | 
| 85 | 77 |  | 50 |  |  | 123 | my $size        = int(shift) || 20; | 
| 86 | 77 |  | 50 |  |  | 184 | my $as_sentence = shift      || 0; | 
| 87 | 77 |  |  |  |  | 128 | my $max_index   = $self->len - 1; | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 77 | 50 | 33 |  |  | 238 | if ( $pos > $max_index or $pos < 0 ) { | 
| 90 | 0 |  |  |  |  | 0 | croak "illegal pos value: no such index in TokenList"; | 
| 91 |  |  |  |  |  |  | } | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | #warn "window size $size for pos $pos"; | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | # get the $size tokens on either side of $tok | 
| 96 | 77 |  |  |  |  | 58 | my ( $start, $end ); | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | # is token too close to the top of the stack? | 
| 99 | 77 | 100 |  |  |  | 107 | if ( $pos > $size ) { | 
| 100 | 62 |  |  |  |  | 53 | $start = $pos - $size; | 
| 101 |  |  |  |  |  |  | } | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | # is token too close to the bottom of the stack? | 
| 104 | 77 | 100 |  |  |  | 107 | if ( $pos < ( $max_index - $size ) ) { | 
| 105 | 76 |  |  |  |  | 70 | $end = $pos + $size; | 
| 106 |  |  |  |  |  |  | } | 
| 107 | 77 |  | 100 |  |  | 117 | $start ||= 0; | 
| 108 | 77 |  | 66 |  |  | 103 | $end   ||= $max_index; | 
| 109 |  |  |  |  |  |  |  | 
| 110 | 77 | 50 |  |  |  | 97 | if ($as_sentence) { | 
| 111 | 0 |  |  |  |  | 0 | my $sentence_starts = $self->get_sentence_starts; | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | # default to what we have. | 
| 114 | 0 |  |  |  |  | 0 | my $start_for_pos = $start; | 
| 115 | 0 |  |  |  |  | 0 | my $i             = 0; | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | #warn "looking for sentence_start for start = $start end = $end\n"; | 
| 118 | 0 |  |  |  |  | 0 | for (@$sentence_starts) { | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  | #warn " $_ [$i]\n"; | 
| 121 | 0 | 0 |  |  |  | 0 | if ( $_ >= $pos ) { | 
| 122 | 0 |  |  |  |  | 0 | $start_for_pos = $sentence_starts->[$i]; | 
| 123 | 0 |  |  |  |  | 0 | last; | 
| 124 |  |  |  |  |  |  | } | 
| 125 | 0 |  |  |  |  | 0 | $i++; | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  | #warn "found $start_for_pos (start = $start end = $end)\n"; | 
| 129 | 0 | 0 |  |  |  | 0 | if ( $start_for_pos != $start ) { | 
| 130 | 0 | 0 |  |  |  | 0 | if ( $start_for_pos < $start ) { | 
| 131 | 0 |  |  |  |  | 0 | $end -= ( $start - $start_for_pos ); | 
| 132 |  |  |  |  |  |  | } | 
| 133 |  |  |  |  |  |  | else { | 
| 134 | 0 |  |  |  |  | 0 | $end += ( $start_for_pos - $start ); | 
| 135 |  |  |  |  |  |  | } | 
| 136 | 0 |  |  |  |  | 0 | $start = $start_for_pos; | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | #warn "now $start_for_pos (start = $start end = $end)\n"; | 
| 140 |  |  |  |  |  |  | } | 
| 141 |  |  |  |  |  |  | else { | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | # make sure window starts and ends with is_match | 
| 144 | 77 |  |  |  |  | 216 | while ( !$self->get_token($start)->is_match ) { | 
| 145 | 31 |  |  |  |  | 65 | $start++; | 
| 146 |  |  |  |  |  |  | } | 
| 147 | 77 |  |  |  |  | 193 | while ( !$self->get_token($end)->is_match ) { | 
| 148 | 42 |  |  |  |  | 87 | $end--; | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | #warn "return $start .. $end"; | 
| 153 |  |  |  |  |  |  | #warn "$size ~~ " . ( $end - $start ); | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 77 |  |  |  |  | 155 | return ( $start, $end ); | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | =head2 get_window_tokens( I [, I] ) | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | Like get_window() but returns an array ref of a slice | 
| 161 |  |  |  |  |  |  | of the TokenList containing Tokens. | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | =cut | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | sub get_window_tokens { | 
| 166 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 167 | 0 |  |  |  |  | 0 | my ( $start, $end ) = $self->get_window(@_); | 
| 168 | 0 |  |  |  |  | 0 | my @slice = (); | 
| 169 | 0 |  |  |  |  | 0 | for ( $start .. $end ) { | 
| 170 | 0 |  |  |  |  | 0 | push( @slice, $self->get_token($_) ); | 
| 171 |  |  |  |  |  |  | } | 
| 172 | 0 |  |  |  |  | 0 | return \@slice; | 
| 173 |  |  |  |  |  |  | } | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | =head2 as_sentences([I]) | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | Returns a reference to an array of arrays, | 
| 178 |  |  |  |  |  |  | where each child array is a "sentence" worth of Token objects. | 
| 179 |  |  |  |  |  |  | You can stringify each sentence array like: | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | my $sentences = $tokenlist->as_sentences; | 
| 182 |  |  |  |  |  |  | for my $s (@$sentences) { | 
| 183 |  |  |  |  |  |  | printf("sentence: %s\n", join("", map {"$_"} @$s)); | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | If you pass a single true value to as_sentences(), | 
| 187 |  |  |  |  |  |  | then the array returned will consist of plain scalar strings | 
| 188 |  |  |  |  |  |  | with whitespace normalized. | 
| 189 |  |  |  |  |  |  |  | 
| 190 |  |  |  |  |  |  | =cut | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | sub as_sentences { | 
| 193 | 2 |  |  | 2 | 1 | 375 | my $self = shift; | 
| 194 | 2 |  | 100 |  |  | 11 | my $stringed = shift || 0; | 
| 195 | 2 |  |  |  |  | 3 | my @sents; | 
| 196 |  |  |  |  |  |  | my @s; | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  | # use array method since we do not know the iterator position | 
| 199 | 2 |  |  |  |  | 3 | for my $t ( @{ $self->as_array } ) { | 
|  | 2 |  |  |  |  | 9 |  | 
| 200 | 144 | 100 |  |  |  | 255 | if ( $t->is_sentence_start ) { | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | # if has any, add anonymous copy to master | 
| 203 | 8 | 100 |  |  |  | 15 | if (@s) { | 
| 204 | 6 |  |  |  |  | 17 | push @sents, [@s]; | 
| 205 |  |  |  |  |  |  | } | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | # reset | 
| 208 | 8 |  |  |  |  | 16 | @s = (); | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | # add | 
| 212 | 144 |  |  |  |  | 138 | push @s, $t; | 
| 213 |  |  |  |  |  |  | } | 
| 214 | 2 | 50 |  |  |  | 7 | if (@s) { | 
| 215 | 2 |  |  |  |  | 7 | push @sents, [@s]; | 
| 216 |  |  |  |  |  |  | } | 
| 217 | 2 | 100 |  |  |  | 8 | if ($stringed) { | 
| 218 | 1 |  |  |  |  | 2 | my @stringed; | 
| 219 | 1 |  |  |  |  | 3 | for my $s (@sents) { | 
| 220 | 4 |  |  |  |  | 7 | my $str = join( "", map {"$_"} @$s ); | 
|  | 72 |  |  |  |  | 136 |  | 
| 221 | 4 |  |  |  |  | 28 | $str =~ s/\s\s+/\ /g; | 
| 222 | 4 |  |  |  |  | 24 | $str =~ s/\s+$//; | 
| 223 | 4 |  |  |  |  | 11 | push @stringed, $str; | 
| 224 |  |  |  |  |  |  | } | 
| 225 | 1 |  |  |  |  | 11 | return \@stringed; | 
| 226 |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  |  | 
| 228 | 1 |  |  |  |  | 5 | return \@sents; | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | 1; | 
| 232 |  |  |  |  |  |  |  | 
| 233 |  |  |  |  |  |  | __END__ |