line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Parse::Highlife::Tokenizer; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
434
|
use Parse::Highlife::Utils qw(params offset_to_coordinate get_source_info extend_match); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
78
|
|
4
|
1
|
|
|
1
|
|
864
|
use Parse::Highlife::Token::Regex; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
5
|
1
|
|
|
1
|
|
492
|
use Parse::Highlife::Token::Delimited; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
6
|
1
|
|
|
1
|
|
465
|
use Parse::Highlife::Token::Characters; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
27
|
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
5
|
use Data::Dump qw(dump); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
769
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Constructor.
#
# Blesses a fresh, empty hash into the invoking class and delegates all
# remaining arguments to _init(). Whatever _init() returns is returned to
# the caller.
sub new
{
    my $class = shift;
    my $object = bless( {}, $class );
    return $object->_init( @_ );
}
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Sets up the initial state of a tokenizer object and returns the object.
#
#   tokens      - array ref of registered token objects (starts empty)
#   tokennames  - array ref of token names, kept alongside 'tokens' so the
#                 registration order is preserved (starts empty)
#   debug       - always set to 1 here
#
# Any further arguments handed over by new() are accepted but not used.
sub _init
{
    my $self = shift;
    @{$self}{ 'tokens', 'tokennames', 'debug' } = ( [], [], 1 );
    return $self;
}
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Looks up a registered token by name.
#
# Parameters:
#   $tokenname - name to search for in $self->{'tokennames'}
#
# Returns the entry of $self->{'tokens'} stored at the same index as the
# first name that is string-equal to $tokenname.
#
# Dies with an "ERR: I do not know about a token named ..." message when no
# registered name matches.
sub get_token
{
    my( $self, $tokenname ) = @_;

    my $names = $self->{'tokennames'};

    # 'tokennames' and 'tokens' are parallel arrays, so the index of the
    # first matching name is also the index of the wanted token
    foreach my $index ( 0 .. $#{$names} ) {
        return $self->{'tokens'}->[$index]
            if $names->[$index] eq $tokenname;
    }

    die "ERR: I do not know about a token named '$tokenname'\n";
}
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Registers a token definition and returns the token object for it.
#
# Named arguments (unpacked through params() from Parse::Highlife::Utils;
# presumably the values after each -key below act as defaults -- confirm
# against that helper):
#   -name        token name, must be non-empty
#   -regex       selects a Parse::Highlife::Token::Regex token
#   -start/-end  both together select a Parse::Highlife::Token::Delimited token
#   -escape      unpacked here but not inspected by this method
#   -characters  selects a Parse::Highlife::Token::Characters token
#
# If an already registered token of the matching class has the same regex,
# the same start/end pair or the same characters, that existing object is
# returned unchanged.
# NOTE(review): in that case the new -name is not added to 'tokennames' and
# the existing token keeps its own name -- confirm this is intended.
#
# Otherwise a new token is constructed from the original argument list
# (everything after the invocant), named, appended to 'tokens' and
# 'tokennames', and returned.
#
# Dies when the name is empty or when none of -regex, -start/-end or
# -characters is given.
sub token
{
    my( $self, $name, $regex, $start, $end, $escape, $characters )
        = params( \@_,
            -name       => '',
            -regex      => '',
            -start      => '',
            -end        => '',
            -escape     => "\\",
            -characters => '',
        );

    # everything after the invocant is handed on as-is to the token constructor
    my @constructor_args = splice( @_, 1 );

    length( $name ) or die "ERR: token has no name.\n";

    my $is_regex      = length( $regex );
    my $is_delimited  = length( $start ) && length( $end );
    my $is_characters = length( $characters );

    # reuse an identical, previously registered definition if there is one
    foreach my $known ( @{ $self->{'tokens'} } ) {
        my $same_regex =
               $is_regex
            && ref( $known ) eq 'Parse::Highlife::Token::Regex'
            && $known->{'regex'} eq $regex;

        my $same_delimiters =
               $is_delimited
            && ref( $known ) eq 'Parse::Highlife::Token::Delimited'
            && $known->{'start'} eq $start
            && $known->{'end'} eq $end;

        my $same_characters =
               $is_characters
            && ref( $known ) eq 'Parse::Highlife::Token::Characters'
            && $known->{'characters'} eq $characters;

        return $known
            if $same_regex || $same_delimiters || $same_characters;
    }

    # nothing reusable: pick the token class, regex first, then delimiters,
    # then characters
    my $token_class =
          $is_regex      ? 'Parse::Highlife::Token::Regex'
        : $is_delimited  ? 'Parse::Highlife::Token::Delimited'
        : $is_characters ? 'Parse::Highlife::Token::Characters'
        :                  undef;

    die "ERR: incomplete token definition.\n"
        unless defined $token_class;

    my $created = $token_class->new( @constructor_args );
    $created->{'name'} = $name;

    # keep the two parallel arrays in step
    push @{ $self->{'tokens'} }, $created;
    push @{ $self->{'tokennames'} }, $name;

    return $created;
}
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Splits $string into a list of token matches.
#
# Parameters:
#   $string - the text to tokenize
#
# Returns an array ref of match hashes in input order.
#
# At every offset the registered tokens are tried in registration order and
# the first one whose match() returns a true value wins. The winning match
# is annotated with 'token-name' and 'is-ignored', and scanning continues at
# its 'offset-after-match'.
#
# Characters that no token matches are collected one at a time. Each such
# run is emitted as a single match built via extend_match() with
# 'token-classname' Parse::Highlife::Token::Unknown, an empty 'token-name'
# and 'is-ignored' set to 1.
#
# Changes from the previous implementation:
#   * 'first-offset' of an unknown run is now the offset where the run
#     starts. It used to be computed after the scan position had already
#     moved past the following match, so it was off by that match's length.
#   * an unknown run at the very end of the input is now emitted too,
#     instead of being dropped silently.
sub tokenize
{
    my( $self, $string ) = @_;
    my $tokens = [];

    my $i = 0;
    my $unknown_characters = '';

    # Emits the pending run of unmatched characters (if any) as one Unknown
    # token. Must be called while $i still points just behind that run.
    my $flush_unknown = sub {
        return unless length $unknown_characters;
        my $unknown =
            extend_match(
                $string,
                {
                    'token-classname'   => 'Parse::Highlife::Token::Unknown',
                    'matched-substring' => $unknown_characters,
                    'first-offset'      => $i - length( $unknown_characters ),
                    'token-name'        => '',
                }
            );
        $unknown->{'is-ignored'} = 1; # unknown tokens are ignored (good?)
        push @{$tokens}, $unknown;
        $unknown_characters = '';
        return;
    };

    while( $i < length $string ) {
        # find the first matching token
        my $match;
        for( my $t = 0; $t < @{$self->{'tokens'}}; $t++ ) {
            my $tokenname = $self->{'tokennames'}->[$t];
            my $token = $self->{'tokens'}->[$t];
            $match = $token -> match( $string, $i ); # returns 0 or hash with info
            if( $match ) {
                $match->{'token-name'} = $tokenname; # only the Tokenizer knows this
                $match->{'is-ignored'} = $token -> is_ignored();
                last;
            }
        }

        if( $match ) {
            # save the unknown run first, while $i still marks its end
            $flush_unknown->();
            push @{$tokens}, $match;
            $i = $match->{'offset-after-match'};
        }
        else {
            $unknown_characters .= substr $string, $i, 1;
            $i ++;

            #my( $line, $column ) = offset_to_coordinate( $string, $i );
            #print "ERR: could not find a matching token at line $line, column $column:\n\n";
            #print get_source_info( $string, $i );
            #exit;
        }
    }

    # unmatched characters at the end of the input
    $flush_unknown->();

    return $tokens;
}
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
1; |