File Coverage

blib/lib/Search/Tokenizer.pm

Criterion	Covered	Total	%
statement	61	67	91.0
branch	20	28	71.4
condition	5	8	62.5
subroutine	8	11	72.7
pod	4	5	80.0
total	98	119	82.3

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tokenizer;
2	1			1		62126	use warnings;
	1					3
	1					34
3	1			1		16	use strict;
	1					2
	1					35
4	1			1		1021	use Unicode::CaseFold ();
	1					1205
	1					559
5
6							our $VERSION = '1.01';
7
8							sub new {
9	2			2	0	398	my $class = shift;
10
11							# defaults
12	2					11	my $regex = qr/\w+/;
13	2					6	my $lower = 1;
14	2					7	my $filter = undef;
15	2					5	my $filter_in_place = undef;
16	2					6	my $stopwords = undef;
17
18							# parse arguments
19	2	50				11	unshift @_, "regex" if @_ == 1; # positional API
20	2					11	while (my $arg = shift) {
21	7					12	my $val = shift;
22	7		100			33	$arg .= "=>" . (ref($val) \|\| "NOREF");
23	7					18	for ($arg) {
24	7	100				30	/^regex=>Regexp$/ and do { $regex = $val; last};
	2					5
	2					15
25	5	100				19	/^lower=>NOREF$/ and do { $lower = !!$val; last};
	1					4
	1					5
26	4	50				15	/^filter=>CODE$/ and do { $filter = $val; last};
	0					0
	0					0
27	4	100				16	/^filter_in_place=>CODE$/ and do { $filter_in_place = $val; last};
	2					5
	2					7
28	2	50				14	/^stopwords=>HASH$/ and do { $stopwords = $val; last};
	2					5
	2					8
29	0					0	die "Invalid option or invalid operand: $arg";
30							}
31							}
32
33							# check that regex doest not match the empty string
34	2	50				14	not "" =~ $regex
35							or die "regex $regex matches the empty string: cannot tokenize";
36
37							# return tokenizer factory: closure
38							return sub {
39	3			3		1290	my $string = shift;
40	3					7	my $term_index = -1;
41
42							# return tokenizer : additional closure on $string and $term_index
43							return sub {
44
45							# get next occurrence of $regex in $string (thanks to the /g flag)
46	10					160	while ($string =~ /$regex/g) {
47
48							# index of this term within the input string
49	12					19	$term_index += 1;
50
51							# boundaries for the match
52	12					46	my ($start, $end) = ($-[0], $+[0]);
53
54							# extract matched substring (more efficient than $&)
55	12					36	my $term = substr($string, $start, $end-$start);
56
57							# apply filtering and stopwords, if any
58	12	100				53	$term = Unicode::CaseFold::fc($term) if $lower;
59	12	50				80	$term = $filter->($term) if $filter;
60	12	50				283	$filter_in_place->($term) if $filter_in_place;
61	12	100	66			308	undef $term if $stopwords and $stopwords->{$term};
62
63							# if $term was not cancelled by filters above, return it
64	12	100				60	if ($term) {
65	7	50				57	return wantarray ? ($term, length($term), $start, $end, $term_index)
66							: $term;
67							}
68							} # otherwise, loop again to extract next term
69
70							# otherwise, no more term in input string, return undef or empty list
71	3					9	return;
72	3					25	};
73	2					23	};
74							}
75
76							sub word {
77	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
78							}
79
80							sub word_locale {
81	1			1		851	use locale;
	1					223
	1					6
82	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
83							}
84
85							sub word_unicode {
86	1			1	1	1104	__PACKAGE__->new(regex => qr/\p{Word}+/, @_);
	1			0		12
	1					14
	0					0
87							}
88
89							sub unaccent {
90	1			1	1	2809	require Text::Transliterator::Unaccent;
91	1					98402	my %args = @_;
92	1		33			1031	my $want_lower = !exists $args{lower} \|\| $args{lower};
93	1	50				7	my %unaccenter_args = $want_lower ? () : (upper => 0);
94	1					13	my $unaccenter = Text::Transliterator::Unaccent->new(%unaccenter_args);
95	1					40745	__PACKAGE__->new(regex => qr/\p{Word}+/,
96							filter_in_place => $unaccenter,
97							%args);
98							}
99
100
101							1; # End of Search::Tokenizer
102
103
104							__END__