|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Plucene::Analysis::CharTokenizer;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME   | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Plucene::Analysis::CharTokenizer - base class for character tokenisers  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# isa Plucene::Analysis::Tokenizer  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $next = $chartokenizer->next;  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is an abstract base class for simple, character-oriented tokenizers.  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 METHODS  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
19
 | 
 
 | 
 
 | 
  
19
  
 | 
 
 | 
119
 | 
 use strict;  | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
    | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1130
 | 
    | 
| 
22
 | 
19
 | 
 
 | 
 
 | 
  
19
  
 | 
 
 | 
106
 | 
 use warnings;  | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
42
 | 
    | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
463
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
24
 | 
19
 | 
 
 | 
 
 | 
  
19
  
 | 
 
 | 
104
 | 
 use Carp;  | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
    | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2081
 | 
    | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
19
 | 
 
 | 
 
 | 
  
19
  
 | 
 
 | 
10687
 | 
 use Plucene::Analysis::Token;  | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
    | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
155
 | 
    | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
19
 | 
 
 | 
 
 | 
  
19
  
 | 
 
 | 
754
 | 
 use base 'Plucene::Analysis::Tokenizer';  | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
44
 | 
    | 
| 
 
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11664
 | 
    | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 token_re  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the regular expression that matches a single token. Subclasses must override this; the base-class version always dies.  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # And here we deviate from the script  | 
| 
37
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 # Abstract hook: subclasses must override this to return the pattern that
 # matches one token (next() interpolates the result into its matching regex).
 # The base-class version always throws. croak() is used so the error is
 # reported from the caller's perspective rather than from this file.
 sub token_re { croak "You should define this" }
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Class::Virtually::Abstract doesn't like being called twice.  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 normalize  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This will normalise the text of a token before it is stored in the token. The default implementation returns the text unchanged.  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
382
 | 
 
 | 
 
 | 
  
382
  
 | 
  
1
  
 | 
2220
 | 
 # Identity normaliser: hands back the token text it was given, untouched.
 # Subclasses can override this to transform the text.
 sub normalize {
 	my ($self, $text) = @_;
 	return $text;
 }
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 next  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $next = $chartokenizer->next;  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This will return the next token in the string, or undef at the end   | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 of the string.  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Return the next token read from $self->{reader} as a
 # Plucene::Analysis::Token, or nothing (undef in scalar context) once the
 # reader is exhausted.
 #
 # State kept on the object between calls:
 #   buffer - the not-yet-consumed remainder of the text read so far
 #   start  - offset (as reported by tell) of the first character of buffer
 #
 # The token's start/end offsets always describe the raw matched text in the
 # input, even if normalize() returns a string of a different length.
 sub next {
 	my $self = shift;
 	my $re   = $self->token_re();
 	my $fh   = $self->{reader};

 	while (1) {
 		# Refill the buffer once the previous chunk has been used up.
 		if (!defined $self->{buffer} or !length $self->{buffer}) {
 			return if eof($fh);
 			$self->{start} = tell($fh);
 			my $line = <$fh>;

 			# A failed read yields undef; treat it like end of input.
 			return unless defined $line and length $line;
 			$self->{buffer} = $line;
 		}

 		# Strip everything up to and including the first token. The prefix
 		# is anchored and allowed to span newlines so that the skipped
 		# length is exact even if the buffer holds more than one line.
 		if ($self->{buffer} =~ s/\A((?s:.*?))($re)//) {
 			# Copy the captures before calling any other code.
 			my ($skipped, $raw) = ($1, $2);

 			my $start = $self->{start} + length $skipped;
 			my $end   = $start + length $raw;

 			# The buffer now begins right after the raw token.
 			$self->{start} = $end;

 			# Scalar assignment keeps normalize() in scalar context.
 			my $word = $self->normalize($raw);
 			return Plucene::Analysis::Token->new(
 				text  => $word,
 				start => $start,
 				end   => $end,
 			);
 		}

 		# No match, so the rest of the buffer is useless; discard it and
 		# go round again to try for some more text.
 		$self->{buffer} = "";
 	}
 }
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |