line | stmt | bran | cond | sub | pod |  time | code
-----+------+------+------+-----+-----+-------+------------------------------------------------------------------------
   1 |      |      |      |     |     |       | #!/usr/bin/env perl
   2 |      |      |      |     |     |       |
   3 |    5 |      |      |   5 |     | 22653 | use warnings;
     |    5 |      |      |     |     |    10 |
     |    5 |      |      |     |     |   151 |
   4 |    5 |      |      |   5 |     |    28 | use strict;
     |    5 |      |      |     |     |    10 |
     |    5 |      |      |     |     |  2570 |
   5 |      |      |      |     |     |       |
   6 |      |      |      |     |     |       | package Gwybodaeth::Tokenize;
   7 |      |      |      |     |     |       |
   8 |      |      |      |     |     |       | =head1 NAME
   9 |      |      |      |     |     |       |
  10 |      |      |      |     |     |       | Tokenize - Split up data on whitespace into tokens.
  11 |      |      |      |     |     |       |
  12 |      |      |      |     |     |       | =head1 SYNOPSIS
  13 |      |      |      |     |     |       |
  14 |      |      |      |     |     |       |     use Gwybodaeth::Tokenize;
  15 |      |      |      |     |     |       |
  16 |      |      |      |     |     |       |     my $t = Gwybodaeth::Tokenize->new();
  17 |      |      |      |     |     |       |
  18 |      |      |      |     |     |       |     $t->tokenize($data);
  19 |      |      |      |     |     |       |
  20 |      |      |      |     |     |       | =head1 DESCRIPTION
  21 |      |      |      |     |     |       |
  22 |      |      |      |     |     |       | This module tokenizes data, where a token is delimited by whitespace.
  23 |      |      |      |     |     |       |
  24 |      |      |      |     |     |       | =over
  25 |      |      |      |     |     |       |
  26 |      |      |      |     |     |       | =item new()
  27 |      |      |      |     |     |       |
  28 |      |      |      |     |     |       | Returns an instance of the class.
  29 |      |      |      |     |     |       |
  30 |      |      |      |     |     |       | =cut
  31 |      |      |      |     |     |       |
  32 |      |      |      |     |     |       | sub new {
  33 |    5 |      |      |   5 |   1 |    78 |     my $class = shift;
  34 |    5 |      |      |     |     |    12 |     my $self = {};
  35 |    5 |      |      |     |     |    14 |     bless $self, $class;
  36 |    5 |      |      |     |     |    18 |     return $self;
  37 |      |      |      |     |     |       | }
  38 |      |      |      |     |     |       |
  39 |      |      |      |     |     |       | =item tokenize($data)
  40 |      |      |      |     |     |       |
  41 |      |      |      |     |     |       | Tokenizes the data supplied in the array reference $data.
  42 |      |      |      |     |     |       |
  43 |      |      |      |     |     |       | =cut
  44 |      |      |      |     |     |       |
  45 |      |      |      |     |     |       | # Takes a reference to the input data as a parameter.
  46 |      |      |      |     |     |       | sub tokenize {
  47 |   10 |      |      |  10 |   1 |   732 |     my($self, $data) = @_;
  48 |      |      |      |     |     |       |
  49 |   10 |      |      |     |     |    13 |     my @tokenized;
  50 |      |      |      |     |     |       |
  51 |   10 |      |      |     |     |    15 |     for (@{ $data }) {
     |   10 |      |      |     |     |    24 |
  52 |   34 |      |      |     |     |    85 |         for (split /\s+/x) {
  53 |   93 |  100 |      |     |     |   258 |             next if /
  54 |      |      |      |     |     |       |                     # string is entirely whitespace or empty
  55 |      |      |      |     |     |       |                     ^\s*$/x;
  56 |   83 |      |      |     |     |   181 |             push @tokenized, $_;
  57 |      |      |      |     |     |       |         }
  58 |      |      |      |     |     |       |     }
  59 |      |      |      |     |     |       |
  60 |   10 |      |      |     |     |    35 |     return $self->_tokenize_clean(\@tokenized);
  61 |      |      |      |     |     |       | }
  62 |      |      |      |     |     |       |
  63 |      |      |      |     |     |       | # Takes a reference to the data which needs to be cleaned
  64 |      |      |      |     |     |       | sub _tokenize_clean {
  65 |   10 |      |      |  10 |     |    92 |     my($self, $data) = @_;
  66 |      |      |      |     |     |       |
  67 |   10 |      |      |     |     |    14 |     for my $i (0..$#{ $data }) {
     |   10 |      |      |     |     |    31 |
  68 |      |      |      |     |     |       |
  69 |   85 |  100 |      |     |     |    73 |         next if (not defined ${ $data }[$i]);
     |   85 |      |      |     |     |   180 |
  70 |      |      |      |     |     |       |
  71 |      |      |      |     |     |       |         # If a token begins with '<' but doesn't end with '>'
  72 |      |      |      |     |     |       |         # then the token has been split up.
  73 |   83 |  100 |  100 |     |     |    80 |         if ((${$data}[$i] =~ /^\< # line begins with an opening angle bracket/x
     |   83 |      |  100 |     |     |   261 |
     |    4 |      |   66 |     |     |    98 |
  74 |      |      |      |     |     |       |             &&
  75 |   82 |      |      |     |     |   266 |             ${$data}[$i] =~ /[^\>]$ # line doesn't end with a closing angle
  76 |      |      |      |     |     |       |                                     # bracket
  77 |      |      |      |     |     |       |                             /x)
  78 |      |      |      |     |     |       |             ||
  79 |      |      |      |     |     |       |             # If the token begins but doesn't end with " the token may
  80 |      |      |      |     |     |       |             # have been split up
  81 |    8 |      |      |     |     |    47 |             (${$data}[$i] =~ /^\" # line begins with a double quote/x
  82 |      |      |      |     |     |       |                 &&
  83 |      |      |      |     |     |       |              ${$data}[$i] =~ /
  84 |      |      |      |     |     |       |                             [^\"]$ # line doesn't end with a double quote
  85 |      |      |      |     |     |       |                              /x))
  86 |      |      |      |     |     |       |             {
  87 |      |      |      |     |     |       |             # Concatenate the next line to the current
  88 |      |      |      |     |     |       |             # partial token. We add a space in between to repair from
  89 |      |      |      |     |     |       |             # the split operation.
  90 |    2 |      |      |     |     |     4 |             ${ $data }[$i] .= " ${ $data }[$i+1]";
     |    2 |      |      |     |     |     4 |
     |    2 |      |      |     |     |     5 |
  91 |      |      |      |     |     |       |
  92 |      |      |      |     |     |       |             # Re-index the token list to take into account the last
  93 |      |      |      |     |     |       |             # concatenation.
  94 |    2 |      |      |     |     |     5 |             for my $j (($i+1)..($#{ $data }-1)) {
     |    2 |      |      |     |     |     7 |
  95 |    4 |      |      |     |     |     6 |                 ${ $data }[$j] = ${ $data }[$j + 1];
     |    4 |      |      |     |     |    10 |
     |    4 |      |      |     |     |     6 |
  96 |      |      |      |     |     |       |             }
  97 |      |      |      |     |     |       |
  98 |      |      |      |     |     |       |             # The last data element should now be deleted
  99 |      |      |      |     |     |       |             # as the data has been shifted up one in the
 100 |      |      |      |     |     |       |             # list.
 101 |    2 |      |      |     |     |     4 |             delete ${ $data }[$#{ $data }];
     |    2 |      |      |     |     |     5 |
     |    2 |      |      |     |     |     3 |
 102 |      |      |      |     |     |       |
 103 |    2 |      |      |     |     |     4 |             redo; # try again in case the token is split onto more than 2 lines
 104 |      |      |      |     |     |       |         }
 105 |      |      |      |     |     |       |     }
 106 |   10 |      |      |     |     |    50 |     return $data;
 107 |      |      |      |     |     |       | }
 108 |      |      |      |     |     |       | 1;
 109 |      |      |      |     |     |       | __END__
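
A minimal usage sketch follows (not part of the coverage data above). It assumes Gwybodaeth::Tokenize is installed under the package name shown in the report and that tokenize() takes an array reference of input lines, as the POD states; the sample data and the expected output are illustrative, inferred from the splitting and re-joining logic covered in source lines 47-106.

    #!/usr/bin/env perl
    use strict;
    use warnings;

    use Gwybodaeth::Tokenize;

    my $t = Gwybodaeth::Tokenize->new();

    # The quoted string is split across a line boundary: this is the case
    # _tokenize_clean() repairs by re-joining the partial tokens.
    my @lines = (
        '<http://example.org/thing> "a multi-word',
        'label" .',
    );

    my $tokens = $t->tokenize(\@lines);

    # The grep guards against any undef slots left behind by delete().
    print "$_\n" for grep { defined } @{ $tokens };

    # Expected, per the logic shown above:
    #   <http://example.org/thing>
    #   "a multi-word label"
    #   .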