File Coverage

blib/lib/Plucene/Analysis/CharTokenizer.pm

Criterion	Covered	Total	%
statement	32	33	96.9
branch	7	8	87.5
condition	2	3	66.6
subroutine	7	8	87.5
pod	3	3	100.0
total	51	55	92.7

line	stmt	bran	cond	sub	pod	time	code
1							package Plucene::Analysis::CharTokenizer;
2
3							=head1 NAME
4
5							Plucene::Analysis::CharTokenizer - base class for character tokenisers
6
7							=head1 SYNOPSIS
8
9							# isa Plucene::Analysis::Tokenizer
10
11							my $next = $chartokenizer->next;
12
13							=head1 DESCRIPTION
14
15							This is an abstract base class for simple, character-oriented tokenizers.
16
17							=head1 METHODS
18
19							=cut
20
21	19			19		119	use strict;
	19					46
	19					1130
22	19			19		106	use warnings;
	19					42
	19					463
23
24	19			19		104	use Carp;
	19					45
	19					2081
25
26	19			19		10687	use Plucene::Analysis::Token;
	19					57
	19					155
27
28	19			19		754	use base 'Plucene::Analysis::Tokenizer';
	19					44
	19					11664
29
30							=head2 token_re
31
32							Subclasses must override this to return the pattern that matches a single token; the implementation in this base class just dies.
33
34							=cut
35
36							# And here we deviate from the script
37	0			0	1	0	sub token_re { die "You should define this" }    # base version always dies; next() interpolates the returned value into its match
38
39							# Class::Virtually::Abstract doesn't like being called twice.
40
41							=head2 normalize
42
43							This will normalise the text of a matched token before it is stored in the token. The default implementation returns the text unchanged.
44
45							=cut
46
47	382			382	1	2220	sub normalize { return $_[1] }    # identity by default: hands back its first argument unchanged
48
49							=head2 next
50
51							my $next = $chartokenizer->next;
52
53							This will return the next token read from the underlying reader, or
54							undef (an empty list in list context) once the input is exhausted.
55
56							=cut
57
58							sub next {    # return the next token read from $self->{reader}; bare return once the reader is at EOF
59	144566			144566	1	210839	my $self = shift;
60	144566					385815	my $re = $self->token_re();    # token pattern, supplied by the subclass
61	144566					267537	my $fh = $self->{reader};
62							retry:    # re-entered via goto when the current buffer yields no token
63	159707	100	66			628270	if (!defined $self->{buffer} or !length $self->{buffer}) {    # buffer missing or used up: refill it
64	16240	100				57176	return if eof($fh);    # no more input, so no more tokens
65	15664					196373	$self->{start} = tell($fh);    # file offset of the data about to be read
66	15664					129748	$self->{buffer} .= <$fh>;    # scalar-context read: one record (one line under the default $/)
67							}
68	159131	50				1639977	return unless length $self->{buffer};    # guard: still nothing buffered after the refill attempt
69
70	159131	100				1075456	if ($self->{buffer} =~ s/(.*?)($re)//) {    # remove the first match: $1 = text skipped before the token, $2 = the token
71	143990					305185	$self->{start} += length $1;    # move the offset past the skipped text
72	143990					421935	my $word = $self->normalize($2);
73	143990					580586	my $rv = Plucene::Analysis::Token->new(
74							text => $word,
75							start => $self->{start},
76							end => ($self->{start} + length($word)));    # NOTE(review): end is derived from the normalized text's length
77	143990					1964224	$self->{start} += length($word);    # offset now sits just after this token
78	143990					701270	return $rv;
79							}
80
81							# No token matched in what is left of the buffer, so discard it.
82	15141					29849	$self->{buffer} = "";
83
84							# Jump back to refill the buffer from the reader and try again.
85	15141					29562	goto retry;
86							}
87
88							1;