| line | stmt | bran | cond | sub | pod | time | code |
|------|------|------|------|-----|-----|------|------|
| 1 |  |  |  |  |  |  | #!/usr/bin/env perl |
| 2 |  |  |  |  |  |  |  |
| 3 | 5 |  |  | 5 |  | 22653 | use warnings; |
|  | 5 |  |  |  |  | 10 |  |
|  | 5 |  |  |  |  | 151 |  |
| 4 | 5 |  |  | 5 |  | 28 | use strict; |
|  | 5 |  |  |  |  | 10 |  |
|  | 5 |  |  |  |  | 2570 |  |
| 5 |  |  |  |  |  |  |  |
| 6 |  |  |  |  |  |  | package Gwybodaeth::Tokenize; |
| 7 |  |  |  |  |  |  |  |
| 8 |  |  |  |  |  |  | =head1 NAME |
| 9 |  |  |  |  |  |  |  |
| 10 |  |  |  |  |  |  | Tokenize - Split up data on whitespace into tokens. |
| 11 |  |  |  |  |  |  |  |
| 12 |  |  |  |  |  |  | =head1 SYNOPSIS |
| 13 |  |  |  |  |  |  |  |
| 14 |  |  |  |  |  |  | use Tokenize; |
| 15 |  |  |  |  |  |  |  |
| 16 |  |  |  |  |  |  | my $t = Tokenize->new(); |
| 17 |  |  |  |  |  |  |  |
| 18 |  |  |  |  |  |  | $t->tokenize($data); |
| 19 |  |  |  |  |  |  |  |
| 20 |  |  |  |  |  |  | =head1 DESCRIPTION |
| 21 |  |  |  |  |  |  |  |
| 22 |  |  |  |  |  |  | This module tokenizes data, where a token is delimited by whitespace. |
| 23 |  |  |  |  |  |  |  |
| 24 |  |  |  |  |  |  | =over |
| 25 |  |  |  |  |  |  |  |
| 26 |  |  |  |  |  |  | =item new() |
| 27 |  |  |  |  |  |  |  |
| 28 |  |  |  |  |  |  | Returns an instance of the class. |
| 29 |  |  |  |  |  |  |  |
| 30 |  |  |  |  |  |  | =cut |
| 31 |  |  |  |  |  |  |  |
| 32 |  |  |  |  |  |  | sub new { |
| 33 | 5 |  |  | 5 | 1 | 78 | my $class = shift; |
| 34 | 5 |  |  |  |  | 12 | my $self = {}; |
| 35 | 5 |  |  |  |  | 14 | bless $self, $class; |
| 36 | 5 |  |  |  |  | 18 | return $self; |
| 37 |  |  |  |  |  |  | } |
| 38 |  |  |  |  |  |  |  |
| 39 |  |  |  |  |  |  | =item tokenize($data) |
| 40 |  |  |  |  |  |  |  |
| 41 |  |  |  |  |  |  | Tokenizes the data supplied in the array reference $data. |
| 42 |  |  |  |  |  |  |  |
| 43 |  |  |  |  |  |  | =cut |
| 44 |  |  |  |  |  |  |  |
| 45 |  |  |  |  |  |  | # Takes a reference to the input data as a parameter. |
| 46 |  |  |  |  |  |  | sub tokenize { |
| 47 | 10 |  |  | 10 | 1 | 732 | my($self, $data) = @_; |
| 48 |  |  |  |  |  |  |  |
| 49 | 10 |  |  |  |  | 13 | my @tokenized; |
| 50 |  |  |  |  |  |  |  |
| 51 | 10 |  |  |  |  | 15 | for (@{ $data }) { |
|  | 10 |  |  |  |  | 24 |  |
| 52 | 34 |  |  |  |  | 85 | for (split /\s+/x) { |
| 53 | 93 | 100 |  |  |  | 258 | next if / |
| 54 |  |  |  |  |  |  | # string is entirely whitespace or empty |
| 55 |  |  |  |  |  |  | ^\s*$/x; |
| 56 | 83 |  |  |  |  | 181 | push @tokenized, $_; |
| 57 |  |  |  |  |  |  | } |
| 58 |  |  |  |  |  |  | } |
| 59 |  |  |  |  |  |  |  |
| 60 | 10 |  |  |  |  | 35 | return $self->_tokenize_clean(\@tokenized); |
| 61 |  |  |  |  |  |  | } |
| 62 |  |  |  |  |  |  |  |
| 63 |  |  |  |  |  |  | # Takes a reference to the data which needs to be cleaned |
| 64 |  |  |  |  |  |  | sub _tokenize_clean { |
| 65 | 10 |  |  | 10 |  | 92 | my($self, $data) = @_; |
| 66 |  |  |  |  |  |  |  |
| 67 | 10 |  |  |  |  | 14 | for my $i (0..$#{ $data }) { |
|  | 10 |  |  |  |  | 31 |  |
| 68 |  |  |  |  |  |  |  |
| 69 | 85 | 100 |  |  |  | 73 | next if (not defined ${ $data }[$i]); |
|  | 85 |  |  |  |  | 180 |  |
| 70 |  |  |  |  |  |  |  |
| 71 |  |  |  |  |  |  | # If a token begins with '<' but doesn't end with '>' |
| 72 |  |  |  |  |  |  | # then the token has been split up. |
| 73 | 83 | 100 | 100 |  |  | 80 | if ((${$data}[$i] =~ /^\< # line begins with an opening angle bracket/x |
|  | 83 |  | 100 |  |  | 261 |  |
|  | 4 |  | 66 |  |  | 98 |  |
| 74 |  |  |  |  |  |  | && |
| 75 | 82 |  |  |  |  | 266 | ${$data}[$i] =~ /[^\>]$ # line doesn't end with a closing angle |
| 76 |  |  |  |  |  |  | # bracket |
| 77 |  |  |  |  |  |  | /x) |
| 78 |  |  |  |  |  |  | &#124;&#124; |
| 79 |  |  |  |  |  |  | # If the token begins but doesn't end with " the token may |
| 80 |  |  |  |  |  |  | # have been split up |
| 81 | 8 |  |  |  |  | 47 | (${$data}[$i] =~ /^\" # line begins with a double quote/x |
| 82 |  |  |  |  |  |  | && |
| 83 |  |  |  |  |  |  | ${$data}[$i] =~ / |
| 84 |  |  |  |  |  |  | [^\"]$ # line doesn't end with a double quote |
| 85 |  |  |  |  |  |  | /x)) |
| 86 |  |  |  |  |  |  | { |
| 87 |  |  |  |  |  |  | # Concatenate the next line to the current |
| 88 |  |  |  |  |  |  | # partial token. We add a space in between to repair from |
| 89 |  |  |  |  |  |  | # the split operation. |
| 90 | 2 |  |  |  |  | 4 | ${ $data }[$i] .= " ${ $data }[$i+1]"; |
|  | 2 |  |  |  |  | 4 |  |
|  | 2 |  |  |  |  | 5 |  |
| 91 |  |  |  |  |  |  |  |
| 92 |  |  |  |  |  |  | # Re-index the token list to take into account the last |
| 93 |  |  |  |  |  |  | # concatenation. |
| 94 | 2 |  |  |  |  | 5 | for my $j (($i+1)..($#{ $data }-1)) { |
|  | 2 |  |  |  |  | 7 |  |
| 95 | 4 |  |  |  |  | 6 | ${ $data }[$j] = ${ $data }[$j + 1]; |
|  | 4 |  |  |  |  | 10 |  |
|  | 4 |  |  |  |  | 6 |  |
| 96 |  |  |  |  |  |  | } |
| 97 |  |  |  |  |  |  |  |
| 98 |  |  |  |  |  |  | # The last data element should now be deleted |
| 99 |  |  |  |  |  |  | # as the data has been shifted up one in the |
| 100 |  |  |  |  |  |  | # list. |
| 101 | 2 |  |  |  |  | 4 | delete ${ $data }[$#{ $data }]; |
|  | 2 |  |  |  |  | 5 |  |
|  | 2 |  |  |  |  | 3 |  |
| 102 |  |  |  |  |  |  |  |
| 103 | 2 |  |  |  |  | 4 | redo; # try again in case the token is split onto more than 2 lines |
| 104 |  |  |  |  |  |  | } |
| 105 |  |  |  |  |  |  | } |
| 106 | 10 |  |  |  |  | 50 | return $data; |
| 107 |  |  |  |  |  |  | } |
| 108 |  |  |  |  |  |  | 1; |
| 109 |  |  |  |  |  |  | __END__ |