line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::Tokenizer; |
2
|
31
|
|
|
31
|
|
69770
|
use Moo; |
|
31
|
|
|
|
|
46470
|
|
|
31
|
|
|
|
|
171
|
|
3
|
|
|
|
|
|
|
extends 'Search::Tools::Object'; |
4
|
31
|
|
|
31
|
|
14484
|
use Search::Tools; # XS package required |
|
31
|
|
|
|
|
72
|
|
|
31
|
|
|
|
|
548
|
|
5
|
31
|
|
|
31
|
|
10318
|
use Search::Tools::Token; |
|
31
|
|
|
|
|
64
|
|
|
31
|
|
|
|
|
606
|
|
6
|
31
|
|
|
31
|
|
10164
|
use Search::Tools::TokenList; |
|
31
|
|
|
|
|
78
|
|
|
31
|
|
|
|
|
820
|
|
7
|
31
|
|
|
31
|
|
2559
|
use Search::Tools::UTF8; |
|
31
|
|
|
|
|
61
|
|
|
31
|
|
|
|
|
2412
|
|
8
|
31
|
|
|
31
|
|
156
|
use Carp; |
|
31
|
|
|
|
|
74
|
|
|
31
|
|
|
|
|
12148
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
has 're' => ( is => 'rw', default => sub {qr/\w+(?:[\'\-\.]\w+)*/} ); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub BUILD { |
15
|
65
|
|
|
65
|
1
|
650
|
my $self = shift; |
16
|
65
|
50
|
|
|
|
967
|
if ( $self->debug ) { |
17
|
0
|
|
|
|
|
0
|
$self->set_debug( $self->debug - 1 ); # XS debug a level higher |
18
|
|
|
|
|
|
|
} |
19
|
65
|
|
|
|
|
540
|
return $self; |
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub tokenize_pp { |
23
|
7
|
|
|
7
|
1
|
802
|
require Search::Tools::TokenPP; |
24
|
7
|
|
|
|
|
704
|
require Search::Tools::TokenListPP; |
25
|
|
|
|
|
|
|
|
26
|
7
|
|
|
|
|
14
|
my $self = shift; |
27
|
7
|
50
|
|
|
|
21
|
if ( !defined $_[0] ) { |
28
|
0
|
|
|
|
|
0
|
croak "str required"; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# XS modifies the original arg, so we do too. |
32
|
|
|
|
|
|
|
# this is same slight optimization XS does. ~5% |
33
|
7
|
100
|
|
|
|
46
|
if ( !is_ascii( $_[0] ) ) { |
34
|
2
|
|
|
|
|
10
|
$_[0] = to_utf8( $_[0] ); |
35
|
|
|
|
|
|
|
} |
36
|
7
|
|
|
|
|
14
|
my $heat_seeker = $_[1]; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# match_num ($_[2]) not supported in PP |
39
|
|
|
|
|
|
|
|
40
|
7
|
|
|
|
|
15
|
my @heat = (); |
41
|
7
|
|
|
|
|
10
|
my @tokens = (); |
42
|
7
|
|
|
|
|
9
|
my $i = 0; |
43
|
7
|
|
|
|
|
12
|
my $re = $self->{re}; |
44
|
7
|
100
|
100
|
|
|
30
|
my $heat_seeker_is_coderef |
45
|
|
|
|
|
|
|
= ( defined $heat_seeker and ref($heat_seeker) eq 'CODE' ) ? 1 : 0; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# TODO is_sentence_* logic |
48
|
7
|
|
|
|
|
242
|
for ( split( m/($re)/, $_[0] ) ) { |
49
|
343
|
100
|
|
|
|
573
|
next unless length($_); |
50
|
326
|
|
|
|
|
1084
|
my $tok = bless( |
51
|
|
|
|
|
|
|
{ 'pos' => $i++, |
52
|
|
|
|
|
|
|
str => $_, |
53
|
|
|
|
|
|
|
is_hot => 0, |
54
|
|
|
|
|
|
|
is_match => 0, |
55
|
|
|
|
|
|
|
len => byte_length($_), |
56
|
|
|
|
|
|
|
u8len => length($_), |
57
|
|
|
|
|
|
|
}, |
58
|
|
|
|
|
|
|
'Search::Tools::TokenPP' |
59
|
|
|
|
|
|
|
); |
60
|
326
|
100
|
|
|
|
1274
|
if ( $_ =~ m/^$re$/ ) { |
61
|
170
|
|
|
|
|
371
|
$tok->{is_match} = 1; |
62
|
170
|
100
|
|
|
|
292
|
if ($heat_seeker_is_coderef) { |
|
|
100
|
|
|
|
|
|
63
|
13
|
|
|
|
|
29
|
$heat_seeker->($tok); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
elsif ( defined $heat_seeker ) { |
66
|
69
|
|
|
|
|
211
|
$tok->{is_hot} = $_ =~ m/$heat_seeker/; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
} |
69
|
326
|
100
|
|
|
|
2388
|
push( @heat, $tok->{pos} ) if $tok->{is_hot}; |
70
|
326
|
|
|
|
|
571
|
push @tokens, $tok; |
71
|
|
|
|
|
|
|
} |
72
|
7
|
|
|
|
|
86
|
return bless( |
73
|
|
|
|
|
|
|
{ tokens => \@tokens, |
74
|
|
|
|
|
|
|
num => $i, |
75
|
|
|
|
|
|
|
'pos' => 0, |
76
|
|
|
|
|
|
|
heat => \@heat, |
77
|
|
|
|
|
|
|
}, |
78
|
|
|
|
|
|
|
'Search::Tools::TokenListPP' |
79
|
|
|
|
|
|
|
); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
1; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
__END__ |