File Coverage

blib/lib/KinoSearch1/Analysis/Tokenizer.pm

Criterion	Covered	Total	%
statement	37	37	100.0
branch	4	4	100.0
condition	5	9	55.5
subroutine	9	9	100.0
pod	2	2	100.0
total	57	61	93.4

line	stmt	bran	cond	sub	pod	time	code
1							package KinoSearch1::Analysis::Tokenizer;
2	34			34		26094	use strict;
	34					331
	34					1180
3	34			34		182	use warnings;
	34					77
	34					842
4	34			34		691	use KinoSearch1::Util::ToolSet;
	34					72
	34					5023
5	34			34		212	use base qw( KinoSearch1::Analysis::Analyzer );
	34					75
	34					11715
6	34			34		19581	use locale;
	34					5216
	34					216
7
8							BEGIN {
9	34			34		1647	__PACKAGE__->init_instance_vars(
10
11							# constructor params / members
12							token_re => undef, # regex for a single token
13
14							# members
15							separator_re => undef, # regex for separations between tokens
16							);
17							}
18
19	34			34		2419	use KinoSearch1::Analysis::TokenBatch;
	34					72
	34					11187
20
21							sub init_instance {
22	43			43	1	97	my $self = shift;
23
24							# supply defaults if token_re wasn't specified
25	43	100				449	if ( !defined $self->{token_re} ) {
26	39					271	$self->{token_re} = qr/\b\w+(?:'\w+)?\b/;
27	39					194	$self->{separator_re} = qr/\W*/;
28							}
29
30							# if user-defined token_re...
31	43	100				230	if ( !defined $self->{separator_re} ) {
32
33							# define separator using lookahead
34	4					185	$self->{separator_re} = qr/
35							.*? # match up to...
36							(?= # but not including...
37							$self->{token_re} # a token,
38							|\z # or the end of the string
39							)/xsm;
40							}
41							}
42
43							sub analyze {
44	15854			15854	1	20134	my ( $self, $batch ) = @_;
45
46	15854					62238	my $new_batch = KinoSearch1::Analysis::TokenBatch->new;
47	15854					23894	my $token_re = $self->{token_re};
48	15854					19777	my $separator_re = $self->{separator_re};
49
50							# alias input to $_
51	15854					48797	while ( $batch->next ) {
52	15795					46384	local $_ = $batch->get_text;
53
54							# ensure that pos is set to 0 for this scalar
55	15795					38552	pos = 0;
56
57							# accumulate token start_offsets and end_offsets
58	15795					27063	my ( @starts, @ends );
59	15795		33			676171	1 while ( m/$separator_re/g and push @starts,
			66
			66
60							pos and m/$token_re/g and push @ends, pos );
61
62							# correct for overshoot
63	15795					37609	$#starts = $#ends;
64
65							# add the new tokens to the batch
66	15795					123964	$new_batch->add_many_tokens( $_, \@starts, \@ends );
67							}
68
69	15854					83162	return $new_batch;
70							}
71
72							1;
73
74							__END__