File Coverage

blib/lib/Search/Tokenizer.pm

Criterion	Covered	Total	%
statement	71	77	92.2
branch	23	32	71.8
condition	5	8	62.5
subroutine	10	13	76.9
pod	5	6	83.3
total	114	136	83.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tokenizer;
2	1			1		70076	use warnings;
	1					3
	1					33
3	1			1		6	use strict;
	1					2
	1					22
4	1			1		5	use Carp qw(croak);
	1					2
	1					45
5	1			1		549	use Unicode::CaseFold qw(fc); # because CORE::fc only came with Perl 5.16
	1					1001
	1					116
6
7							our $VERSION = '1.03';
8
9							#======================================================================
10							# MAIN FUNCTIONALITY
11							#======================================================================
12
13							sub new {
14	2			2	0	641	my $class = shift;
15
16							# defaults
17	1			1		669	my $regex = qr/\p{Word}+/;
	1					15
	1					15
	2					9
18	2					4	my $lower = 1;
19	2					6	my $filter = undef;
20	2					4	my $filter_in_place = undef;
21	2					3	my $stopwords = undef;
22
23							# parse arguments
24	2	50				11	unshift @_, "regex" if @_ == 1; # positional API
25	2					13	while (my ($arg, $val) = splice(@_, 0, 2)) {
26	7		100			26	$arg .= "=>" . (ref($val) \|\| "SCALAR");
27							CHECK:
28	7					13	for ($arg) {
29	7	100				26	/^regex=>Regexp$/ and do { $regex = $val; last CHECK};
	2					7
	2					11
30	5	100				13	/^lower=>SCALAR$/ and do { $lower = !!$val; last CHECK};
	1					3
	1					4
31	4	50				11	/^filter=>CODE$/ and do { $filter = $val; last CHECK};
	0					0
	0					0
32	4	100				12	/^filter_in_place=>CODE$/ and do { $filter_in_place = $val; last CHECK};
	2					5
	2					7
33	2	50				10	/^stopwords=>HASH$/ and do { $stopwords = $val; last CHECK};
	2					4
	2					7
34	0					0	croak "Invalid option or invalid operand: $arg";
35							}
36							}
37
38							# check that regex doest not match the empty string
39	2	50				14	not "" =~ $regex
40							or croak "regex $regex matches the empty string: cannot tokenize";
41
42							# return tokenizer factory: closure
43							return sub {
44	3			3		16	my ($string, @other_args) = @_;
45							not @other_args
46	3	50				21	or croak "too many args -- just a single string is expected";
47
48	3					5	my $term_index = -1;
49
50							# return tokenizer : additional closure on $string and $term_index
51							return sub {
52
53							# get next occurrence of $regex in $string (thanks to the /g flag)
54	10					51	while ($string =~ /$regex/g) {
55
56							# index of this term within the input string
57	12					19	$term_index += 1;
58
59							# boundaries for the match
60	12					18	my $end = pos($string);
61	12					27	my $term = $&; # used to be slow in older perls, but now OK
62	12					16	my $start = $end - length($term);
63
64							# the old way used to be as follows, but it is ridiculously slow on utf8 strings
65							# .. see https://github.com/Perl/perl5/issues/18786
66							#
67							# my ($start, $end) = ($-[0], $+[0]);
68							# my $term = substr($string, $start, $end-$start);
69
70							# apply filtering and stopwords, if any
71	12	100				38	$term = Unicode::CaseFold::fc($term) if $lower;
72	12	50				72	$term = $filter->($term) if $filter;
73	12	50				169	$filter_in_place->($term) if $filter_in_place;
74	12	100	66			182	undef $term if $stopwords and $stopwords->{$term};
75
76							# if $term was not cancelled by filters above, return it
77	12	100				42	if ($term) {
78	7	50				41	return wantarray ? ($term, length($term), $start, $end, $term_index)
79							: $term;
80							}
81							} # otherwise, loop again to extract next term
82
83							# otherwise, that's the end of the input string, return undef or empty list
84	3					7	return;
85	3					40	};
86	2					19	};
87							}
88
89							#======================================================================
90							# BUILTIN TOKENIZERS
91							#======================================================================
92
93
94							sub word {
95	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
96							}
97
98							sub word_locale {
99	1			1		23237	use locale;
	1					525
	1					6
100	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
101							}
102
103							sub word_unicode {
104	0			0	1	0	__PACKAGE__->new(regex => qr/\p{Word}+/, @_);
105							}
106
107							sub unaccent {
108	1			1	1	565	require Text::Transliterator::Unaccent;
109	1					39908	my %args = @_;
110	1		33			7	my $want_lower = !exists $args{lower} \|\| $args{lower};
111	1	50				5	my %unaccenter_args = $want_lower ? () : (upper => 0);
112	1					9	my $unaccenter = Text::Transliterator::Unaccent->new(%unaccenter_args);
113	1					36028	__PACKAGE__->new(regex => qr/\p{Word}+/,
114							filter_in_place => $unaccenter,
115							%args);
116							}
117
118
119							#======================================================================
120							# UTILITY FUNCTION
121							#======================================================================
122
123							sub unroll {
124	3			3	1	15	my $iterator = shift;
125	3					6	my $no_details = shift;
126	3					4	my @results;
127
128	3					7	while (my @r = $iterator->() ) {
129	7	100				25	push @results, $no_details ? $r[0] : \@r;
130							}
131	3					27	return @results;
132							}
133
134
135							1; # End of Search::Tokenizer
136
137
138							__END__