|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Search::Tools::Tokenizer;  | 
| 
2
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
69770
 | 
 use Moo;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46470
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
171
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 extends 'Search::Tools::Object';  | 
| 
4
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
14484
 | 
 use Search::Tools;    # XS package required  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
72
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
548
 | 
    | 
| 
5
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
10318
 | 
 use Search::Tools::Token;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
606
 | 
    | 
| 
6
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
10164
 | 
 use Search::Tools::TokenList;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
78
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
820
 | 
    | 
| 
7
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
2559
 | 
 use Search::Tools::UTF8;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2412
 | 
    | 
| 
8
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
 
 | 
156
 | 
 use Carp;  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
74
 | 
    | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12148
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '1.006';  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 're' => ( is => 'rw', default => sub {qr/\w+(?:[\'\-\.]\w+)*/} );  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub BUILD {  | 
| 
15
 | 
65
 | 
 
 | 
 
 | 
  
65
  
 | 
  
1
  
 | 
650
 | 
     my $self = shift;  | 
| 
16
 | 
65
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
967
 | 
     if ( $self->debug ) {  | 
| 
17
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->set_debug( $self->debug - 1 );    # XS debug a level higher  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
19
 | 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
540
 | 
     return $self;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub tokenize_pp {  | 
| 
23
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
  
1
  
 | 
802
 | 
     require Search::Tools::TokenPP;  | 
| 
24
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
704
 | 
     require Search::Tools::TokenListPP;  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     my $self = shift;  | 
| 
27
 | 
7
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
21
 | 
     if ( !defined $_[0] ) {  | 
| 
28
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         croak "str required";  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # XS modifies the original arg, so we do too.  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # this is same slight optimization XS does. ~5%  | 
| 
33
 | 
7
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
46
 | 
     if ( !is_ascii( $_[0] ) ) {  | 
| 
34
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         $_[0] = to_utf8( $_[0] );  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
36
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     my $heat_seeker = $_[1];  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # match_num ($_[2]) not supported in PP  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
40
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
     my @heat   = ();  | 
| 
41
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     my @tokens = ();  | 
| 
42
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my $i      = 0;  | 
| 
43
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     my $re     = $self->{re};  | 
| 
44
 | 
7
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
30
 | 
     my $heat_seeker_is_coderef  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         = ( defined $heat_seeker and ref($heat_seeker) eq 'CODE' ) ? 1 : 0;  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # TODO is_sentence_* logic  | 
| 
48
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
242
 | 
     for ( split( m/($re)/, $_[0] ) ) {  | 
| 
49
 | 
343
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
573
 | 
         next unless length($_);  | 
| 
50
 | 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1084
 | 
         my $tok = bless(  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             {   'pos'    => $i++,  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 str      => $_,  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 is_hot   => 0,  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 is_match => 0,  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 len      => byte_length($_),  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 u8len    => length($_),  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             },  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             'Search::Tools::TokenPP'  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
60
 | 
326
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1274
 | 
         if ( $_ =~ m/^$re$/ ) {  | 
| 
61
 | 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
371
 | 
             $tok->{is_match} = 1;  | 
| 
62
 | 
170
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
292
 | 
             if ($heat_seeker_is_coderef) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
63
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
                 $heat_seeker->($tok);  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             elsif ( defined $heat_seeker ) {  | 
| 
66
 | 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
211
 | 
                 $tok->{is_hot} = $_ =~ m/$heat_seeker/;  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
69
 | 
326
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2388
 | 
         push( @heat, $tok->{pos} ) if $tok->{is_hot};  | 
| 
70
 | 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
571
 | 
         push @tokens, $tok;  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
72
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
86
 | 
     return bless(  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {   tokens => \@tokens,  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             num    => $i,  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             'pos'  => 0,  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             heat   => \@heat,  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         },  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'Search::Tools::TokenListPP'  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |