line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Fulltext::Tokenizer::Ngram; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# ABSTRACT: Character n-gram tokenizer for Search::Fulltext |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
688
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
41
|
|
6
|
1
|
|
|
1
|
|
25
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
7
|
1
|
|
|
1
|
|
5
|
use Carp (); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
27
|
|
8
|
1
|
|
|
1
|
|
5
|
use Scalar::Util qw/looks_like_number/; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
428
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = 0.01; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub new { |
13
|
32
|
|
|
32
|
0
|
66
|
my ($class, $token_length) = @_; |
14
|
|
|
|
|
|
|
|
15
|
32
|
50
|
33
|
|
|
285
|
unless (looks_like_number $token_length and $token_length > 0) { |
16
|
0
|
|
|
|
|
0
|
Carp::croak('Token length must be 1+.'); |
17
|
|
|
|
|
|
|
} |
18
|
|
|
|
|
|
|
|
19
|
32
|
|
|
|
|
223
|
bless +{ token_length => $token_length } => $class; |
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub create_token_iterator { |
23
|
32
|
|
|
32
|
0
|
66
|
my ($self, $text) = @_; |
24
|
|
|
|
|
|
|
|
25
|
32
|
|
|
|
|
41
|
my $token_index = -1; |
26
|
32
|
|
|
|
|
68
|
my $n = $self->token_length; |
27
|
|
|
|
|
|
|
return sub { |
28
|
242
|
|
|
|
|
267
|
GET_NEXT_TOKEN: |
29
|
|
|
|
|
|
|
{ |
30
|
218
|
|
|
218
|
|
291
|
++$token_index; |
31
|
242
|
100
|
|
|
|
1877
|
return if $token_index + $n > length($text); |
32
|
221
|
|
|
|
|
409
|
my $token = substr $text, $token_index, $n; |
33
|
221
|
100
|
|
|
|
632
|
redo GET_NEXT_TOKEN if $token =~ /\s/; |
34
|
197
|
|
|
|
|
1248
|
return ($token, $n, $token_index, $token_index + $n, $token_index); |
35
|
|
|
|
|
|
|
} |
36
|
32
|
|
|
|
|
614
|
}; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
32
|
|
|
32
|
0
|
102
|
sub token_length { $_[0]->{token_length} } |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
1; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
__END__ |