line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
22653
|
use warnings; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
151
|
|
4
|
5
|
|
|
5
|
|
28
|
use strict; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
2570
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Gwybodaeth::Tokenize; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Tokenize - Split up data on whitespace into tokens. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Tokenize; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
my $t = Tokenize->new(); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
$t->tokenize($data); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 DESCRIPTION |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
This module tokenizes data, where a token is delimited by whitespace. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=over |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=item new() |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
Returns an instance of the class. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=cut |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Constructor: bless an empty hash into the invoking class and hand it back.
sub new {
    my $class = shift;
    return bless {}, $class;
}
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=item tokenize($data) |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Tokenizes the data supplied in the array reference $data. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# Takes a reference to the input data as a parameter. |
46
|
|
|
|
|
|
|
# Split every string in the referenced array on whitespace and collect the
# non-empty pieces, then hand the flat token list to _tokenize_clean() to
# re-join tokens that were wrongly broken apart.
#
# Takes a reference to the input data as a parameter; returns the cleaned
# token-list reference produced by _tokenize_clean().
sub tokenize {
    my ($self, $data) = @_;

    my @tokens;
    foreach my $chunk (@{ $data }) {
        # split produces an empty leading field when the chunk starts with
        # whitespace; grep discards it along with any all-blank fragments.
        push @tokens, grep { !/^\s*$/x } split /\s+/x, $chunk;
    }

    return $self->_tokenize_clean(\@tokens);
}
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Takes a reference to the data which needs to be cleaned |
64
|
|
|
|
|
|
|
# Repair tokens that were wrongly split on whitespace.
#
# Takes a reference to the token list which needs to be cleaned and returns
# the same reference after merging split tokens in place.  Two kinds of
# token are recognised as split: one beginning with '<' but not ending with
# '>' (an angle-bracketed URI), and one beginning with '"' but not ending
# with '"' (a quoted literal).  Each such token is re-joined with its
# successor, with a single space restored between the pieces.
#
# NOTE(review): re-joining always inserts exactly one space, so a token
# originally split on a tab or a run of spaces is normalised — presumably
# acceptable for this data format; confirm against callers.
sub _tokenize_clean {
    my ($self, $data) = @_;

    for my $i (0 .. $#{ $data }) {
        # Indices past the (now shorter) end of the array yield undef.
        next if not defined $data->[$i];

        # A token that begins with an opening angle bracket but does not
        # end with a closing one, or begins with a double quote but does
        # not end with one, has been split up.
        if (   ($data->[$i] =~ /^\< # begins with an opening angle bracket/x
                && $data->[$i] =~ /[^\>]$ # doesn't end with a closing angle
                                          # bracket
                                 /x)
            || ($data->[$i] =~ /^\" # begins with a double quote/x
                && $data->[$i] =~ /[^\"]$ # doesn't end with a double quote/x))
        {
            # Bug fix: if this is the last element there is nothing to
            # merge with.  The old code appended undef (warning) and then
            # deleted the partial token itself, looping via redo over a
            # corrupted list; instead leave the unterminated token as-is.
            last if $i >= $#{ $data };

            # Concatenate the next element onto the current partial token,
            # restoring the space lost to the whitespace split.
            $data->[$i] .= ' ' . $data->[$i + 1];

            # Remove the absorbed element, shifting the remainder down one
            # slot (replaces the old hand-rolled copy loop + array delete).
            splice @{ $data }, $i + 1, 1;

            redo; # try again in case the token was split into >2 pieces
        }
    }

    return $data;
}
108
|
|
|
|
|
|
|
1; |
109
|
|
|
|
|
|
|
__END__ |