File Coverage

blib/lib/Search/Tokenizer.pm

Criterion	Covered	Total	%
statement	62	68	91.1
branch	20	28	71.4
condition	5	8	62.5
subroutine	8	11	72.7
pod	4	5	80.0
total	99	120	82.5

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tokenizer;
2	1			1		78408	use warnings;
	1					3
	1					39
3	1			1		6	use strict;
	1					2
	1					19
4	1			1		520	use Unicode::CaseFold ();
	1					1080
	1					477
5
6							our $VERSION = '1.02';
7
8							sub new {
9	2			2	0	682	my $class = shift;
10
11							# defaults
12	2					10	my $regex = qr/\w+/;
13	2					6	my $lower = 1;
14	2					6	my $filter = undef;
15	2					4	my $filter_in_place = undef;
16	2					5	my $stopwords = undef;
17
18							# parse arguments
19	2	50				13	unshift @_, "regex" if @_ == 1; # positional API
20	2					10	while (my $arg = shift) {
21	7					11	my $val = shift;
22	7		100			30	$arg .= "=>" . (ref($val) \|\| "NOREF");
23	7					15	for ($arg) {
24	7	100				23	/^regex=>Regexp$/ and do { $regex = $val; last};
	2					10
	2					10
25	5	100				14	/^lower=>NOREF$/ and do { $lower = !!$val; last};
	1					3
	1					3
26	4	50				13	/^filter=>CODE$/ and do { $filter = $val; last};
	0					0
	0					0
27	4	100				14	/^filter_in_place=>CODE$/ and do { $filter_in_place = $val; last};
	2					6
	2					6
28	2	50				10	/^stopwords=>HASH$/ and do { $stopwords = $val; last};
	2					3
	2					6
29	0					0	die "Invalid option or invalid operand: $arg";
30							}
31							}
32
33							# check that regex doest not match the empty string
34	2	50				14	not "" =~ $regex
35							or die "regex $regex matches the empty string: cannot tokenize";
36
37							# return tokenizer factory: closure
38							return sub {
39	5			5		2383	my $string = shift;
40	5					10	my $term_index = -1;
41
42							# return tokenizer : additional closure on $string and $term_index
43							return sub {
44
45							# get next occurrence of $regex in $string (thanks to the /g flag)
46	3026					21341	while ($string =~ /$regex/g) {
47
48							# index of this term within the input string
49	3040					4629	$term_index += 1;
50
51							# boundaries for the match
52	3040					4460	my $end = pos($string);
53	3040					5081	my $term = $&; # used to be slow in older perls, but now OK
54	3040					4680	my $start = $end - length($term);
55
56							# the old way used to be as follows, but it is ridiculously slow on utf8 strings
57							# .. see https://github.com/Perl/perl5/issues/18786
58							#
59							# my ($start, $end) = ($-[0], $+[0]);
60							# my $term = substr($string, $start, $end-$start);
61
62							# apply filtering and stopwords, if any
63	3040	100				7293	$term = Unicode::CaseFold::fc($term) if $lower;
64	3040	50				19762	$term = $filter->($term) if $filter;
65	3040	50				54267	$filter_in_place->($term) if $filter_in_place;
66	3040	100	66			20991	undef $term if $stopwords and $stopwords->{$term};
67
68							# if $term was not cancelled by filters above, return it
69	3040	100				5191	if ($term) {
70	3021	50				11756	return wantarray ? ($term, length($term), $start, $end, $term_index)
71							: $term;
72							}
73							} # otherwise, loop again to extract next term
74
75							# otherwise, no more term in input string, return undef or empty list
76	5					12	return;
77	5					52	};
78	2					20	};
79							}
80
81							sub word {
82	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
83							}
84
85							sub word_locale {
86	1			1		562	use locale;
	1					622
	1					5
87	0			0	1	0	__PACKAGE__->new(regex => qr/\w+/, @_);
88							}
89
90							sub word_unicode {
91	1			1	1	783	__PACKAGE__->new(regex => qr/\p{Word}+/, @_);
	1			0		15
	1					16
	0					0
92							}
93
94							sub unaccent {
95	1			1	1	1831	require Text::Transliterator::Unaccent;
96	1					41170	my %args = @_;
97	1		33			8	my $want_lower = !exists $args{lower} \|\| $args{lower};
98	1	50				5	my %unaccenter_args = $want_lower ? () : (upper => 0);
99	1					10	my $unaccenter = Text::Transliterator::Unaccent->new(%unaccenter_args);
100	1					40022	__PACKAGE__->new(regex => qr/\p{Word}+/,
101							filter_in_place => $unaccenter,
102							%args);
103							}
104
105
106							1; # End of Search::Tokenizer
107
108
109							__END__