|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Search::Tools::QueryParser;  | 
| 
2
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
90717
 | 
 use Moo;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
129412
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
206
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 extends 'Search::Tools::Object';  | 
| 
4
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
23376
 | 
 use Carp;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1843
 | 
    | 
| 
5
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
2916
 | 
 use Data::Dump qw( dump );  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31222
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1569
 | 
    | 
| 
6
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
16929
 | 
 use Search::Query::Parser;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4155494
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1251
 | 
    | 
| 
7
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
6802
 | 
 use Encode;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
110667
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2683
 | 
    | 
| 
8
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
232
 | 
 use Data::Dump;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1373
 | 
    | 
| 
9
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
15209
 | 
 use Search::Tools::Query;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1123
 | 
    | 
| 
10
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
225
 | 
 use Search::Tools::UTF8;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2849
 | 
    | 
| 
11
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
222
 | 
 use Search::Tools::XML;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
653
 | 
    | 
| 
12
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
161
 | 
 use Search::Tools::RegEx;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
73
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
686
 | 
    | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
156
 | 
 use namespace::autoclean;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
63
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
134
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '1.007';  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $XML = Search::Tools::XML->new();  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $C2E = $XML->char2ent_map;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # we turn locale pragma on in a small block  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # because we don't want it to mess up our regex building  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or taint vars in other areas. We just want to use setlocale()  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # and make sure we get correct ->utf8 encoding  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my ( $locale, $lang, $charset );  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
27
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
17507
 | 
     use POSIX qw(locale_h);  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
164357
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
204
 | 
    | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $locale = setlocale(LC_CTYPE);  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ( $lang, $charset ) = split( m/\./, $locale );  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $charset ||= q/UTF-8/;    # 
 | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $lang = q/en_US/ if $lang =~ m/^(posix|c)$/i;  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my %Defaults = (  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     and_word                => q/and|near\d*/,  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     charset                 => $charset,  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default_field           => "",  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ignore_case             => 1,  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ignore_fields           => {},  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ignore_first_char       => quotemeta(q/'-/),  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ignore_last_char        => quotemeta(q/'-/),  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     lang                    => $lang,  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     locale                  => $locale,  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     not_word                => q/not/,  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or_word                 => q/or/,  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     phrase_delim            => q/"/,  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     query_class             => 'Search::Tools::Query',  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     query_dialect           => "Search::Query::Dialect::Native",  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     stemmer                 => undef,  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     stopwords               => [],  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     tag_re                  => $XML->tag_re,  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     term_re                 => qr/\w+(?:[\'\-]\w+)*/,  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     term_min_length         => 1,  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     treat_uris_like_phrases => 1,  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     whitespace              => $XML->html_whitespace,  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     wildcard                => q/*/,  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     word_characters         => q/\w/ . quotemeta(q/'-/),  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for my $attr ( keys %Defaults ) {  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     has( $attr => ( is => 'rw', default => sub { $Defaults{$attr} } ) );  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'start_bound'        => ( is => 'ro' );  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'end_bound'          => ( is => 'ro' );  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'plain_phrase_bound' => ( is => 'ro' );  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'html_phrase_bound'  => ( is => 'ro' );  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_defaults {  | 
| 
69
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
     return {%Defaults};  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub BUILD {  | 
| 
73
 | 
51
 | 
 
 | 
 
 | 
  
51
  
 | 
  
1
  
 | 
375
 | 
     my $self = shift;  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # TODO handle case where both term_re and word_characters are defined  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # charset/locale/lang are a bit interdependent  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # so make sure charset/lang are set if locale is explicitly passed.  | 
| 
79
 | 
51
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
299
 | 
     if ( $self->{locale} ne $Defaults{locale} ) {  | 
| 
80
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         ( $self->{lang}, $self->{charset} ) = split( m/\./, $self->{locale} );  | 
| 
81
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
12
 | 
         $self->{lang} = 'en_US' if $self->{lang} =~ m/^(posix|c)$/i;  | 
| 
82
 | 
1
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
6
 | 
         $self->{charset} ||= $Defaults{charset};  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # make sure ignore_fields is a hash ref  | 
| 
86
 | 
51
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
242
 | 
     if ( ref( $self->{ignore_fields} ) eq 'ARRAY' ) {  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{ignore_fields}  | 
| 
88
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             = { map { $_ => $_ } @{ $self->{ignore_fields} } };  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
260
 | 
     $self->_setup_regex_builder;  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
93
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
590
 | 
     return $self;  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub parse {  | 
| 
97
 | 
63
 | 
 
 | 
 
 | 
  
63
  
 | 
  
1
  
 | 
11614
 | 
     my $self      = shift;  | 
| 
98
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
154
 | 
     my $query_str = shift;  | 
| 
99
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
263
 | 
     confess "query required" unless defined $query_str;  | 
| 
100
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
234
 | 
     if ( ref $query_str ) {  | 
| 
101
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         croak "query must be a scalar string";  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #$query_str = to_utf8( $query_str, $self->charset );  | 
| 
105
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
259
 | 
     my $extracted = $self->_extract_terms($query_str);  | 
| 
106
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
262
 | 
     my %regex;  | 
| 
107
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
189
 | 
 TERM: for my $term ( @{ $extracted->{terms} } ) {  | 
| 
 
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
310
 | 
    | 
| 
108
 | 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2570
 | 
         my ( $plain, $html, $escaped ) = $self->_build_regex($term);  | 
| 
109
 | 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
690
 | 
         my $is_phrase = $term =~ m/\ /;  | 
| 
110
 | 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
317
 | 
         my @phrase_terms;  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if the term is a phrase,  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # build regex for each term in the phrase  | 
| 
114
 | 
145
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
470
 | 
         if ($is_phrase) {  | 
| 
115
 | 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
215
 | 
             my @pts = split( /\ /, $term );  | 
| 
116
 | 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
108
 | 
             for my $pt (@pts) {  | 
| 
117
 | 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1448
 | 
                 my ( $pt_plain, $pt_html, $pt_esc )  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     = $self->_build_regex($pt);  | 
| 
119
 | 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3997
 | 
                 push @phrase_terms,  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     Search::Tools::RegEx->new(  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     plain     => $pt_plain,  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     html      => $pt_html,  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     term      => $pt,  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     term_re   => qr/$pt_esc/i,  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     is_phrase => 0,  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     );  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
129
 | 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9856
 | 
         $regex{$term} = Search::Tools::RegEx->new(  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             plain        => $plain,  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             html         => $html,  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             term         => $term,  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             term_re      => qr/$escaped/i,  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             is_phrase    => $is_phrase,  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             phrase_terms => \@phrase_terms,  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $self->{query_class}->new(  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         dialect => $extracted->{dialect},  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         terms   => $extracted->{terms},  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         fields  => $extracted->{fields},  | 
| 
143
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2072
 | 
         str     => to_utf8( $query_str, $self->charset ),  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         regex   => \%regex,  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         qp      => $self,  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _extract_terms {  | 
| 
150
 | 
63
 | 
 
 | 
 
 | 
  
63
  
 | 
 
 | 
143
 | 
     my $self  = shift;  | 
| 
151
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137
 | 
     my $query = shift;  | 
| 
152
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
205
 | 
     confess "need query to extract terms" unless defined $query;  | 
| 
153
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
335
 | 
     my $stopwords     = $self->stopwords;  | 
| 
154
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
210
 | 
     my $and_word      = $self->and_word;  | 
| 
155
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
210
 | 
     my $or_word       = $self->or_word;  | 
| 
156
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
269
 | 
     my $not_word      = $self->not_word;  | 
| 
157
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
209
 | 
     my $wildcard      = $self->wildcard;  | 
| 
158
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
222
 | 
     my $phrase        = $self->phrase_delim;  | 
| 
159
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
238
 | 
     my $igf           = $self->ignore_first_char;  | 
| 
160
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
200
 | 
     my $igl           = $self->ignore_last_char;  | 
| 
161
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
226
 | 
     my $wordchar      = $self->word_characters;  | 
| 
162
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
206
 | 
     my $default_field = $self->default_field;  | 
| 
163
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
170
 | 
     my $esc_wildcard  = quotemeta($wildcard);  | 
| 
164
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2817
 | 
     my $word_re       = qr/(($esc_wildcard)?[$wordchar]+($esc_wildcard)?)/;  | 
| 
165
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
363
 | 
     my $min_length    = $self->term_min_length;  | 
| 
166
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
240
 | 
     my $raw_query     = $query;  | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
168
 | 
63
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
302
 | 
     $stopwords = [ split( /\s+/, $stopwords ) ] unless ref $stopwords;  | 
| 
169
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
234
 | 
     my %stophash = map { to_utf8( lc($_), $self->charset ) => 1 } @$stopwords;  | 
| 
 
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
97
 | 
    | 
| 
170
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
191
 | 
     my ( %words, %uniq, $c );  | 
| 
171
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2596
 | 
     my $parser = Search::Query::Parser->new(  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         and_regex     => qr{$and_word}i,  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         or_regex      => qr{$or_word}i,  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         not_regex     => qr{$not_word}i,  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         default_field => $default_field,  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         query_class   => $self->query_dialect,  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
160290
 | 
     my $baked_query = $raw_query;  | 
| 
180
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
496
 | 
     $baked_query = lc($baked_query) if $self->ignore_case;  | 
| 
181
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6289
 | 
     $baked_query = to_utf8( $baked_query, $self->charset );  | 
| 
182
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
333
 | 
     my $dialect = $parser->parse($baked_query) or croak $parser->error;  | 
| 
183
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
197856
 | 
     $self->debug && carp "parsetree: " . Data::Dump::dump( $dialect->tree );  | 
| 
184
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
931
 | 
     my $fields_searched  | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         = $self->_get_value_from_tree( \%uniq, $dialect->tree, $c );  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
187
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1719
 | 
     $self->debug && carp "parsed: " . Data::Dump::dump( \%uniq );  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
189
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
679
 | 
     my $count = scalar( keys %uniq );  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # parse uniq into word tokens  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # including removing stop words  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
194
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1351
 | 
     $self->debug && carp "word_re: $word_re";  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
196
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
740
 | 
 U: for my $u ( sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ) {  | 
| 
 
 | 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
538
 | 
    | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
198
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
412
 | 
         my $n = $uniq{$u};  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # only phrases have space  | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # but due to our word_re, a single non-spaced string  | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # might actually be multiple word tokens  | 
| 
203
 | 
152
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
871
 | 
         my $isphrase = $u =~ m/\s/ || 0;  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
205
 | 
152
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
649
 | 
         if ( $self->treat_uris_like_phrases ) {  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # special case: treat email addresses, uris, as phrase  | 
| 
208
 | 
152
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
3317
 | 
             $isphrase ||= $u =~ m/[$wordchar][\@\.\\\/][$wordchar]/ || 0;  | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
211
 | 
152
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3587
 | 
         $self->debug && carp "$u -> isphrase = $isphrase";  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
213
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1274
 | 
         my @w = ();  | 
| 
214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
215
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
708
 | 
     TOK: for my $w ( split( m/\s+/, to_utf8( $u, $self->charset ) ) ) {  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
217
 | 
199
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
869
 | 
             next TOK unless $w =~ m/\S/;  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
219
 | 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
950
 | 
             $w =~ s/\Q$phrase\E//g;  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
221
 | 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1380
 | 
             while ( $w =~ m/$word_re/g ) {  | 
| 
222
 | 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
598
 | 
                 my $tok = _untaint($1);  | 
| 
223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # strip ignorable chars  | 
| 
225
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1529
 | 
                 $tok =~ s/^[$igf]+// if length($igf);  | 
| 
226
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1280
 | 
                 $tok =~ s/[$igl]+$// if length($igl);  | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
228
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
594
 | 
                 unless ($tok) {  | 
| 
229
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     $self->debug && carp "no token for '$w' $word_re";  | 
| 
230
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     next TOK;  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
233
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4614
 | 
                 $self->debug && carp "found token: $tok";  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
235
 | 
208
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1925
 | 
                 if ( exists $stophash{ lc($tok) } ) {  | 
| 
236
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
243
 | 
                     $self->debug && carp "$tok = stopword";  | 
| 
237
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
114
 | 
                     next TOK unless $isphrase;  | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
240
 | 
202
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
551
 | 
                 unless ($isphrase) {  | 
| 
241
 | 
115
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1798
 | 
                     next TOK if $tok =~ m/^($and_word|$or_word|$not_word)$/i;  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # if tainting was on, odd things can happen.  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # so check one more time  | 
| 
246
 | 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
887
 | 
                 $tok = to_utf8( $tok, $self->charset );  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # final sanity check  | 
| 
249
 | 
202
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
661
 | 
                 if ( !Encode::is_utf8($tok) ) {  | 
| 
250
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     carp "$tok is NOT utf8";  | 
| 
251
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     next TOK;  | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 #$self->debug && carp "pushing $tok into wordlist";  | 
| 
255
 | 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1216
 | 
                 push( @w, $tok );  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
261
 | 
152
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
550
 | 
         next U unless @w;  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #$self->debug && carp "joining \@w: " . Data::Dump::dump(\@w);  | 
| 
264
 | 
146
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
405
 | 
         if ($isphrase) {  | 
| 
265
 | 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
215
 | 
             $words{ join( ' ', @w ) } = $n + $count++;  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
268
 | 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
305
 | 
             for (@w) {  | 
| 
269
 | 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
573
 | 
                 $words{$_} = $n + $count++;  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
275
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1355
 | 
     $self->debug && carp "tokenized: " . Data::Dump::dump( \%words );  | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # make sure we don't have 'foo' and 'foo*'  | 
| 
278
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
672
 | 
     for ( keys %words ) {  | 
| 
279
 | 
147
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
761
 | 
         if ( $_ =~ m/$esc_wildcard/ ) {  | 
| 
280
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
103
 | 
             ( my $copy = $_ ) =~ s,$esc_wildcard,,g;  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # delete the more exact of the two  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # since the * will match both  | 
| 
284
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
             delete( $words{$copy} );  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
287
 | 
147
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
550
 | 
         if ( length $_ < $min_length ) {  | 
| 
288
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
19
 | 
             $self->debug and carp "token too short: '$_'";  | 
| 
289
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
             delete $words{$_};  | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
294
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1372
 | 
     $self->debug && carp "wildcards removed: " . Data::Dump::dump( \%words );  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # if any words need to be stemmed  | 
| 
297
 | 
63
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
763
 | 
     if ( $self->stemmer ) {  | 
| 
298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # split each $word into words  | 
| 
300
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # stem each word  | 
| 
301
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if stem ne word, break into chars and find first N common  | 
| 
302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # rejoin $uniq  | 
| 
303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #carp "stemming ON\n";  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
306
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     K: for ( keys %words ) {  | 
| 
307
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
             my (@w) = split /\s+/;  | 
| 
308
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
         W: for my $w (@w) {  | 
| 
309
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
                 my $func = $self->stemmer;  | 
| 
310
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
                 my $f = &$func( $self, $w );  | 
| 
311
 | 
27
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
211
 | 
                 if ( !defined $f or !length $f ) {  | 
| 
312
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     next W;  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
314
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
113
 | 
                 $f = to_utf8($f);  | 
| 
315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 #warn "w: $w\nf: $f\n";  | 
| 
317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # add wildcard to indicate chars were lost  | 
| 
319
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
88
 | 
                 $w = $f . $wildcard;  | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
322
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
             my $new = join ' ', @w;  | 
| 
323
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
83
 | 
             if ( $new ne $_ ) {  | 
| 
324
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
47
 | 
                 $words{$new} = $words{$_};  | 
| 
325
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
                 delete $words{$_};  | 
| 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
327
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
331
 | 
63
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1249
 | 
     $self->debug && carp "stemming done: " . Data::Dump::dump( \%words );  | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # sort keeps query in same order as we entered  | 
| 
334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return {  | 
| 
335
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1030
 | 
         terms => [ sort { $words{$a} <=> $words{$b} } keys %words ],  | 
| 
 
 | 
156
 | 
 
 | 
 
 | 
 
 | 
 
 | 
862
 | 
    | 
| 
336
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         fields  => [ keys %$fields_searched ],  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         dialect => $dialect,  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         query   => $raw_query,  | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     };  | 
| 
340
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # stolen nearly verbatim from Taint::Runtime  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # apparently regex can be tainted when running under 'use locale'.  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # as of version 0.24 this should not be needed but until I can find a way  | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # to easily test the Taint feature, we just do this. It's low overhead.  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _untaint {  | 
| 
348
 | 
208
 | 
 
 | 
 
 | 
  
208
  
 | 
 
 | 
570
 | 
     my $str = shift;  | 
| 
349
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
663
 | 
     my $ref = ref($str) ? $str : \$str;  | 
| 
350
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
564
 | 
     if ( !defined $$ref ) {  | 
| 
351
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $$ref = undef;  | 
| 
352
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $$ref  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             = ( $$ref =~ /(.*)/ )  | 
| 
356
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             ? $1  | 
| 
357
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
983
 | 
             : do { confess("Couldn't find data to untaint") };  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
359
 | 
208
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
695
 | 
     return ref($str) ? 1 : $str;  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _get_value_from_tree {  | 
| 
363
 | 
63
 | 
 
 | 
 
 | 
  
63
  
 | 
 
 | 
4407
 | 
     my $self      = shift;  | 
| 
364
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
131
 | 
     my $uniq      = shift;  | 
| 
365
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
139
 | 
     my $parseTree = shift;  | 
| 
366
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
142
 | 
     my $c         = shift;  | 
| 
367
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
167
 | 
     my %fields    = ();  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # we only want the values from non minus queries  | 
| 
370
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
170
 | 
     for my $node ( '+', '' ) {  | 
| 
371
 | 
126
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
439
 | 
         next unless exists $parseTree->{$node};  | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
373
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
130
 | 
         my @branches = @{ $parseTree->{$node} };  | 
| 
 
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
228
 | 
    | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #warn dump \@branches;  | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
377
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
175
 | 
         for my $leaf (@branches) {  | 
| 
378
 | 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
346
 | 
             my $v = $leaf->{value};  | 
| 
379
 | 
150
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
398
 | 
             if ( !defined $v ) {  | 
| 
380
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 croak "undefined value in query tree: " . dump($leaf);  | 
| 
381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
382
 | 
150
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
467
 | 
             if ( defined $leaf->{field}  | 
| 
383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 and exists $self->ignore_fields->{ $leaf->{field} } )  | 
| 
384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             {  | 
| 
385
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 next;  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
387
 | 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
291
 | 
             my $field = $leaf->{field};  | 
| 
388
 | 
150
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
356
 | 
             if ( defined $field ) {  | 
| 
389
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
                 $fields{$field}++;  | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
391
 | 
150
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
552
 | 
             if ( ref $v eq 'HASH' ) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
392
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 my $f = $self->_get_value_from_tree( $uniq, $v, $c );  | 
| 
393
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $fields{$_} = $f->{$_} for ( keys %$f );  | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             elsif ( ref $v eq 'ARRAY' ) {  | 
| 
396
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
                 for my $value (@$v) {  | 
| 
397
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
                     $value =~ s/\s+/ /g;  | 
| 
398
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
                     $uniq->{$value} = ++$c;  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             else {  | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # if the $leaf is a proximity query,  | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # ignore the "phrase-ness" of it and split  | 
| 
405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # on whitespace. This is a compromise,  | 
| 
406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # mitigated by the tendency of HeatMap  | 
| 
407
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # to reward proximity anyway.  | 
| 
408
 | 
149
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
507
 | 
                 if ( $leaf->{proximity} and $leaf->{proximity} > 1 ) {  | 
| 
409
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
                     my @tokens = split( m/\ +/, $v );  | 
| 
410
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
                     $uniq->{$_} = ++$c for @tokens;  | 
| 
411
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
                     next;  | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # collapse any whitespace  | 
| 
415
 | 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
566
 | 
                 $v =~ s,\s+,\ ,g;  | 
| 
416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
417
 | 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
689
 | 
                 $uniq->{$v} = ++$c;  | 
| 
418
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
421
 | 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
302
 | 
     return \%fields;  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _setup_regex_builder {  | 
| 
425
 | 
51
 | 
 
 | 
 
 | 
  
51
  
 | 
 
 | 
124
 | 
     my $self = shift;  | 
| 
426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # TODO optional for term_re  | 
| 
428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # a search for a '<' or '>' should still highlight,  | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # since < or > can be indexed as literal < and >  | 
| 
431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # but this causes a great deal of hassle  | 
| 
432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # so we just ignore them.  | 
| 
433
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
222
 | 
     my $wordchars = $self->word_characters;  | 
| 
434
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
176
 | 
     $wordchars =~ s,[<>&],,g;  | 
| 
435
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
162
 | 
     $self->{html_safe_wordchars} = $wordchars;    # remember for build  | 
| 
436
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
192
 | 
     my $ignore_first    = $self->ignore_first_char;  | 
| 
437
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
173
 | 
     my $ignore_last     = $self->ignore_last_char;  | 
| 
438
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
303
 | 
     my $html_whitespace = $self->whitespace;  | 
| 
439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
440
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # what's the boundary between a word and a not-word?  | 
| 
441
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # by default:  | 
| 
442
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	the beginning of a string  | 
| 
443
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	the end of a string  | 
| 
444
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	whatever we've defined as WhiteSpace  | 
| 
445
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	any character that is not a WordChar  | 
| 
446
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #   any character we explicitly ignore at start or end of word  | 
| 
447
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #  | 
| 
448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # the \A and \Z (beginning and end) should help if the word butts up  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # against the beginning or end of a tagset  | 
| 
450
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # like  Word or Word   | 
| 
451
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
452
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
358
 | 
     my @start_bound = (  | 
| 
453
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         '\A',  | 
| 
454
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         '[>]',  | 
| 
455
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         '(?:&[\w\#]+;)',    # because a ; might be a legitimate wordchar  | 
| 
456
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             # and we treat a char entity like a single char.  | 
| 
457
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             # if &char; resolves to a legit wordchar  | 
| 
458
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             # this might give unexpected results.  | 
| 
459
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             # NOTE that   etc is in $WhiteSpace  | 
| 
460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $html_whitespace,  | 
| 
461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         '[^' . $wordchars . ']'  | 
| 
462
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
463
 | 
51
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
979
 | 
     push( @start_bound, qr/[$ignore_first]+/i ) if length $ignore_first;  | 
| 
464
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
465
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @end_bound  | 
| 
466
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
409
 | 
         = ( '\Z', '[<&]', $html_whitespace, '[^' . $wordchars . ']' );  | 
| 
467
 | 
51
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
624
 | 
     push( @end_bound, qr/[$ignore_last]+/i ) if length $ignore_last;  | 
| 
468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
469
 | 
51
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
593
 | 
     $self->{start_bound} ||= join( '|', @start_bound );  | 
| 
470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
471
 | 
51
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
461
 | 
     $self->{end_bound} ||= join( '|', @end_bound );  | 
| 
472
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
473
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # the whitespace in a query phrase might be:  | 
| 
474
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	any ignore_last_char, followed by  | 
| 
475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	one or more nonwordchar or whitespace, followed by  | 
| 
476
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #	any ignore_first_char  | 
| 
477
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # define for both text and html  | 
| 
478
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # NOTE the first/last swap for plain vs html  | 
| 
479
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # is intentional because of how regex are built.  | 
| 
480
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
481
 | 
51
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
5083
 | 
     my @plain_phrase_bound = (  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         ( length($ignore_last) ? qr/[$ignore_last]*/i : '' ),  | 
| 
483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         qr/(?:[\s\x20]|[^$wordchars])+/is,  | 
| 
484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         ( length($ignore_first) ? qr/[$ignore_first]?/i : '' ),  | 
| 
485
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
486
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
372
 | 
     $self->{plain_phrase_bound} = join( '', @plain_phrase_bound );  | 
| 
487
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
488
 | 
51
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2835
 | 
     my @html_phrase_bound = (  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
489
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         ( length($ignore_first) ? qr/[$ignore_first]*/i : '' ),  | 
| 
490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         qr/(?:$html_whitespace|[^$wordchars])+/is,  | 
| 
491
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         ( length($ignore_last) ? qr/[$ignore_last]?/i : '' ),  | 
| 
492
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
493
 | 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
261128
 | 
     $self->{html_phrase_bound} = join( '', @html_phrase_bound );  | 
| 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
495
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
496
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
497
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build_regex {  | 
| 
498
 | 
232
 | 
 
 | 
 
 | 
  
232
  
 | 
 
 | 
582
 | 
     my $self      = shift;  | 
| 
499
 | 
232
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
765
 | 
     my $q         = shift or croak "need query to build()";  | 
| 
500
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
605
 | 
     my $wild      = $self->{html_safe_wordchars};  | 
| 
501
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
541
 | 
     my $st_bound  = $self->{start_bound};  | 
| 
502
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
511
 | 
     my $end_bound = $self->{end_bound};  | 
| 
503
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
546
 | 
     my $wc        = $self->{html_safe_wordchars};  | 
| 
504
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
463
 | 
     my $ppb       = $self->{plain_phrase_bound};  | 
| 
505
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
471
 | 
     my $hpb       = $self->{html_phrase_bound};  | 
| 
506
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
797
 | 
     my $wildcard  = $self->wildcard;  | 
| 
507
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
521
 | 
     my $wild_esc  = quotemeta($wildcard);  | 
| 
508
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
553
 | 
     my $tag_re    = $self->tag_re;  | 
| 
509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
510
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # define simple pattern for plain text  | 
| 
511
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # and complex pattern for HTML markup  | 
| 
512
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
452
 | 
     my ( $plain, $html );  | 
| 
513
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
479
 | 
     my $escaped = quotemeta($q);  | 
| 
514
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1403
 | 
     $escaped =~ s/\\[$wild_esc]/[$wc]*/g;    # wildcard  | 
| 
515
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
933
 | 
     $escaped =~ s/\\[\s]/$ppb/g;             # whitespace  | 
| 
516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
517
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40141
 | 
     $plain = qr/  | 
| 
518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 \A|$ppb  | 
| 
520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
521
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ${escaped}  | 
| 
523
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 \Z|$ppb  | 
| 
526
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
527
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 /xis;  | 
| 
528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
529
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2146
 | 
     my (@char) = split( m//, $q );  | 
| 
530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
531
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
526
 | 
     my $counter = -1;  | 
| 
532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
533
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
633
 | 
 CHAR: foreach my $c (@char) {  | 
| 
534
 | 
1461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2490
 | 
         $counter++;  | 
| 
535
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
536
 | 
1461
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
4672
 | 
         my $ent = $C2E->{$c} || undef;  | 
| 
537
 | 
1461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2552
 | 
         my $num = ord($c);  | 
| 
538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
539
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if this is a special regexp char, protect it  | 
| 
540
 | 
1461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2409
 | 
         $c = quotemeta($c);  | 
| 
541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if it's a *, replace it with the Wild class  | 
| 
543
 | 
1461
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3077
 | 
         $c = "[$wild]*" if $c eq $wild_esc;  | 
| 
544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
545
 | 
1461
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2891
 | 
         if ( $c eq '\ ' ) {  | 
| 
546
 | 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
213
 | 
             $c = $hpb . $tag_re . '*';  | 
| 
547
 | 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
160
 | 
             next CHAR;  | 
| 
548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
549
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
550
 | 
1406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1982
 | 
         my $aka;  | 
| 
551
 | 
1406
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2694
 | 
         if ($ent) {  | 
| 
552
 | 
1404
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3600
 | 
             $aka = $ent eq "$num;" ? $ent : "$ent|$num;";  | 
| 
553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
554
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
555
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
             $aka = "$num;";  | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # make $c into a regexp  | 
| 
559
 | 
1406
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
18415
 | 
         $c = qr/$c|$aka/i unless $c eq "[$wild]*";  | 
| 
560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # any char might be followed by zero or more tags, unless it's the last char  | 
| 
562
 | 
1406
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
7552
 | 
         $c .= $tag_re . '*' unless $counter == $#char;  | 
| 
563
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
564
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
565
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
566
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # re-join the chars into a single string  | 
| 
567
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1079
 | 
     my $safe = join( "\n", @char );   # use \n to make it legible in debugging  | 
| 
568
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
569
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # for debugging legibility we include newlines, so make sure we s//x in matches  | 
| 
570
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
169069
 | 
     $html = qr/  | 
| 
571
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
572
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ${st_bound}  | 
| 
573
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
574
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
575
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ${safe}  | 
| 
576
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
577
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (  | 
| 
578
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ${end_bound}  | 
| 
579
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 )  | 
| 
580
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 /xis;  | 
| 
581
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
582
 | 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3995
 | 
     return ( $plain, $html, $escaped );  | 
| 
583
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
584
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
585
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build_term_re {  | 
| 
586
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
587
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # this based on SWISH::PhraseHighlight::set_match_regexp()  | 
| 
588
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
589
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
     my $self = shift;  | 
| 
590
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
591
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     #dump $self;  | 
| 
592
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
593
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $wc = $self->word_characters;  | 
| 
594
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{_wc_regexp}  | 
| 
595
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         = qr/[^$wc]+/io;    # regexp for splitting into swish-words  | 
| 
596
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
597
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $igf = $self->ignore_first_char;  | 
| 
598
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $igl = $self->ignore_last_char;  | 
| 
599
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     for ( $igf, $igl ) {  | 
| 
600
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($_) {  | 
| 
601
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $_ = "[$_]*";  | 
| 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
603
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
604
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $_ = '';  | 
| 
605
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
606
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
607
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
608
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{_ignoreFirst} = $igf;  | 
| 
609
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{_ignoreLast}  = $igl;  | 
| 
610
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
611
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
612
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
613
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
614
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
615
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |