File Coverage

blib/lib/Gwybodaeth/Tokenize.pm

Criterion	Covered	Total	%
statement	41	41	100.0
branch	6	6	100.0
condition	8	9	88.8
subroutine	5	5	100.0
pod	2	2	100.0
total	62	63	98.4

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3	5			5		22653	use warnings;
	5					10
	5					151
4	5			5		28	use strict;
	5					10
	5					2570
5
6							package Gwybodaeth::Tokenize;
7
8							=head1 NAME
9
10							Tokenize - Split up data on whitespace into tokens.
11
12							=head1 SYNOPSIS
13
14							use Tokenize;
15
16							my $t = Tokenize->new();
17
18							$t->tokenize($data);
19
20							=head1 DESCRIPTION
21
22							This module tokenizes data, where a token is delimited by whitespace.
23
24							=over
25
26							=item new()
27
28							Reterns an instance of the class.
29
30							=cut
31
32							sub new {
33	5			5	1	78	my $class = shift;
34	5					12	my $self = {};
35	5					14	bless $self, $class;
36	5					18	return $self;
37							}
38
39							=item tokenize($data)
40
41							Tokenizes the data supplied in the array reference $data.
42
43							=cut
44
45							# Takes a reference to the input data as a parameter.
46							sub tokenize {
47	10			10	1	732	my($self, $data) = @_;
48
49	10					13	my @tokenized;
50
51	10					15	for (@{ $data }) {
	10					24
52	34					85	for (split /\s+/x) {
53	93	100				258	next if /
54							# string is entirly whitespace or empty
55							^\s*$/x;
56	83					181	push @tokenized, $_;
57							}
58							}
59
60	10					35	return $self->_tokenize_clean(\@tokenized);
61							}
62
63							# Takes a reference to the data which needs to be cleaned
64							sub _tokenize_clean {
65	10			10		92	my($self, $data) = @_;
66
67	10					14	for my $i (0..$#{ $data }) {
	10					31
68
69	85	100				73	next if (not defined ${ $data }[$i]);
	85					180
70
71							# If a token begins with '<' but doesn't end with '>'
72							# then the token has been split up.
73	83	100	100			80	if ((${$data}[$i] =~ /^\< # line begins with a opening square bracket/x
	83		100			261
	4		66			98
74							&&
75	82					266	${$data}[$i] =~ /[^\>]$ # line doesn't end with a closing square
76							# bracket
77							/x)
78							\|\|
79							# If the token begins but doesn't end with " the token may
80							# have been split up
81	8					47	(${$data}[$i] =~ /^\" # line begins with a double quote/x
82							&&
83							${$data}[$i] =~ /
84							[^\"]$ # line doesn't end with a double quote
85							/x))
86							{
87							# Concatinate the next line to the current
88							# partial token. We add a space inbetween to repair from
89							# the split operation.
90	2					4	${ $data }[$i] .= " ${ $data }[$i+1]";
	2					4
	2					5
91
92							# Re-index the token list to take into account the last
93							# concatination.
94	2					5	for my $j (($i+1)..($#{ $data }-1)) {
	2					7
95	4					6	${ $data }[$j] = ${ $data }[$j + 1];
	4					10
	4					6
96							}
97
98							# The last data element should now be deleted
99							# as the data has been shifted up one in the
100							# list.
101	2					4	delete ${ $data }[$#{ $data }];
	2					5
	2					3
102
103	2					4	redo; # try again in case the token is split onto more than 2 lines
104							}
105							}
106	10					50	return $data;
107							}
108							1;
109							__END__