File Coverage

blib/lib/Plucene/Analysis/Standard/StandardTokenizer.pm

Criterion	Covered	Total	%
statement	17	20	85.0
branch	2	4	50.0
condition			n/a
subroutine	5	6	83.3
pod	2	2	100.0
total	26	32	81.2

line	stmt	bran	sub	pod	time	code
1						package Plucene::Analysis::Standard::StandardTokenizer;
2
3						=head1 NAME
4
5						Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer
6
7						=head1 SYNOPSIS
8
9						# isa Plucene::Analysis::CharTokenizer
10
11						=head1 DESCRIPTION
12
13						This is the standard tokenizer.
14
15						This should be a good tokenizer for most European-language documents.
16
17						=head1 METHODS
18
19						=cut
20
21	2		2		816	use strict;
	2				5
	2				69
22	2		2		15	use warnings;
	2				5
	2				61
23
24	2		2		12	use base 'Plucene::Analysis::CharTokenizer';
	2				4
	2				672
25
26						# Don't blame me, blame the Plucene people!
27	2		2		2128	my $alpha = qr/\p{IsAlpha}+/;
	2				22
	2				30
28						my $apostrophe = qr/$alpha('$alpha)+/;
29						my $acronym = qr/$alpha\.($alpha\.)+/;
30						my $company = qr/$alpha(&\|\@)$alpha/;
31						my $hostname = qr/\w+(\.\w+)+/;
32						my $email = qr/\w+\@$hostname/;
33						my $p = qr/[_\/.,-]/;
34						my $hasdigit = qr/\w\d\w/;
35						my $num = qr/\w+$p$hasdigit\|$hasdigit$p\w+
36						\|\w+($p$hasdigit$p\w+)+
37						\|$hasdigit($p\w+$p$hasdigit)+
38						\|\w+$p$hasdigit($p\w+$p$hasdigit)+
39						\|$hasdigit$p\w+($p$hasdigit$p\w+)+/x;
40
41						=head2 token_re
42
43						The regular expression for tokenising.
44
45						=cut
46
47						sub token_re {
48	0		0	1	0	qr/
49						$apostrophe \| $acronym \| $company \| $hostname \| $email \| $num
50						\| \w+
51						/x;
52						}
53
54						=head2 normalize
55
56						Remove 's and .
57
58						=cut
59
60						sub normalize {
61	1		1	1	710	my $class = shift;
62
63						# These are in the StandardFilter in Java, but Perl is not Java.
64						# Thankfully.
65	1				3	local $_ = shift;
66	1	50			7	if (/$apostrophe/) { s/'s//; }
	0				0
67	1	50			8	if (/$company/) { s/\.//g; }
	0				0
68	1				4	return $_;
69						}
70
71						1;