File Coverage

blib/lib/Hailo/Tokenizer/Words.pm

Criterion	Covered	Total	%
statement	74	74	100.0
branch	22	24	91.6
condition	33	36	91.6
subroutine	10	10	100.0
pod	0	2	0.0
total	139	146	95.2

line	stmt	bran	cond	sub	pod	time	code
1							package Hailo::Tokenizer::Words;
2							our $AUTHORITY = 'cpan:AVAR';
3							$Hailo::Tokenizer::Words::VERSION = '0.75';
4	29			29		3266303	use v5.10.0;
	29					143
5	29			29		5533	use utf8;
	29					184
	29					220
6	29			29		2421	use Moose;
	29					1364052
	29					252
7	29			29		204193	use MooseX::StrictConstructor;
	29					93207
	29					282
8	29			29		130130	use Regexp::Common qw/ URI /;
	29					72181
	29					130
9	29			29		672507	use namespace::clean -except => 'meta';
	29					81
	29					300
10
11							with qw(Hailo::Role::Arguments
12							Hailo::Role::Tokenizer);
13
14							# [[:alpha:]] doesn't match combining characters on Perl >=5.12
15							my $ALPHABET = qr/(?![_\d])\w/;
16
17							# tokenization
18							my $SPACE = qr/\s/;
19							my $NONSPACE = qr/\S/;
20							my $DASH = qr/[â€“-]/;
21							my $POINT = qr/[.,]/;
22							my $APOSTROPHE = qr/['â€™Â´]/;
23							my $ELLIPSIS = qr/\.{2,}\|â€¦/;
24							my $NON_WORD = qr/[^\w\s]+/;
25							my $BARE_WORD = qr/\w+/;
26							my $CURRENCY = qr/[Â¤Â¥Â¢Â£\$]/;
27							my $NUMBER = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)(?:$CURRENCY\|$ALPHABET+)?\|$CURRENCY?\d+(?:$POINT\d+)(?:$CURRENCY\|$ALPHABET+)?(?!\d\|$ALPHABET)/;
28							my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
29							my $ABBREV = qr/$ALPHABET(?:\.$ALPHABET)+\./;
30							my $DOTTED = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
31							my $WORD_TYPES = qr/$NUMBER\|$ABBREV\|$DOTTED\|$APOST_WORD\|$BARE_WORD/;
32							my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET\|$NUMBER)/;
33							my $WORD = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+\|$DASH(?!$DASH))?/;
34	14			14		11217	my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} \| \p{Upper}{2,} \p{Lower} /x;
	14					43
	14					194
35							my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?<!I') (?: \p{Upper}* \p{Lower} ) /x;
36
37							# special tokens
38							my $TWAT_NAME = qr/ \@ [A-Za-z0-9_]+ /x;
39							my $EMAIL = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ (?: \. [A-Z]{2,4} )* /xi;
40							my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* \| \w+ (?: :: \w+ )+ ) (?: :: )? \| \w+ :: /x;
41							my $EXTRA_URI = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
42							my $ESC_SPACE = qr/(?:\\ )+/;
43							my $NAME = qr/(?:$BARE_WORD\|$ESC_SPACE)+/;
44							my $FILENAME = qr/ $NAME? \. $NAME (?: \. $NAME )* \| $NAME/x;
45							my $UNIX_PATH = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
46							my $WIN_PATH = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
47							my $PATH = qr/$UNIX_PATH\|$WIN_PATH/;
48							my $DATE = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
49							my $TIME = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z\| ?[AP]M\|[-+Â±][0-9]{2}(?::?[0-9]{2})?)?/i;
50							my $DATETIME = qr/${DATE}T$TIME/;
51							my $IRC_NICK = qr/<(?: \|[&~]?[@%+~&])?[A-Za-z_`\-^\\|\\\{}\[\]][A-Za-z_0-9`\-^\\|\\\{}\[\]]+>/;
52							my $IRC_CHAN = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
53							my $NUMERO = qr/#[0-9]+/;
54							my $CLOSE_TAG = qr{</[-\w]+>};
55
56							my $CASED_WORD = qr/$CLOSE_TAG\|$IRC_NICK\|$IRC_CHAN\|$DATETIME\|$DATE\|$TIME\|$PERL_CLASS\|$EXTRA_URI\|$EMAIL\|$TWAT_NAME\|$PATH\|$NUMERO/;
57
58							# capitalization
59							# The rest of the regexes are pretty hairy. The goal here is to catch the
60							# most common cases where a word should be capitalized. We try hard to
61							# guard against capitalizing things which don't look like proper words.
62							# Examples include URLs and code snippets.
63							my $OPEN_QUOTE = qr/['"â€˜â€œâ€žÂ«Â»ã€Œã€Žâ€¹â€š]/;
64							my $CLOSE_QUOTE = qr/['"â€™â€œâ€Â«Â»ã€ã€â€ºâ€˜]/;
65							my $TERMINATOR = qr/(?:[?!â€½]+\|(?<!\.)\.)/;
66							my $ADDRESS = qr/:/;
67							my $PUNCTUATION = qr/[?!â€½,;.:]/;
68							my $BOUNDARY = qr/$CLOSE_QUOTE?(?:\s$TERMINATOR\|$ADDRESS)\s+$OPEN_QUOTE?\s/;
69							my $LOOSE_WORD = qr/$IRC_CHAN\|$DATETIME\|$DATE\|$TIME\|$PATH\|$NUMBER\|$ABBREV\|$APOST_WORD\|$NUMERO\|$BARE_WORD(?:$DASH(?:$WORD_TYPES\|$BARE_WORD)\|$APOSTROPHE(?!$ALPHABET\|$NUMBER\|$APOSTROPHE)\|$DASH(?!$DASH{2}))*/;
70							my $SPLIT_WORD = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+\|$)\|$CLOSE_QUOTE\|$TERMINATOR\|\s+\|$)};
71
72							# we want to capitalize words that come after "On example.com?"
73							# or "You mean 3.2?", but not "Yes, e.g."
74							my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+\|\w{2,}))?/;
75							my $WORD_STRICT = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;
76
77							# This string is added to (and later removed from) the output string when
78							# capitalizing it in multiple passes. We use backspace, because that is
79							# unlikely to be in the input. This dirty approach can probably be replaced
80							# with regex grammars, but I haven't bothered to learn to use those.
81							my $SEPARATOR = "\x08";
82
83							# input -> tokens
84							sub make_tokens {
85	5117			5117	0	219802	my ($self, $input) = @_;
86
87	5117					8484	my @tokens;
88	5117					24104	$input =~ s/$DASH\K\s\n+\s//;
89	5117					13120	$input =~ s/\s\n+\s/ /gm;
90
91	5117					13547	while (length $input) {
92							# remove the next chunk of whitespace
93	28247					118542	$input =~ s/^$SPACE+//;
94	28247					46496	my $got_word;
95
96	28247		100			124555	while (length $input && $input =~ /^$NONSPACE/) {
97							# We convert it to ASCII and then look for a URI because $RE{URI}
98							# from Regexp::Common doesn't support non-ASCII domain names
99	34731					122250	my ($ascii) = $input =~ /^($NONSPACE+)/;
100	34731					71473	$ascii =~ s/[^[:ascii:]]/a/g;
101
102							# URIs
103	34731	100	100			360343	if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
		100	100
		100
		50
104	8					28	my $uri_end = $+[0];
105	8					30	my $uri = substr $input, 0, $uri_end;
106	8					143	$input =~ s/^\Q$uri//;
107
108	8					30	push @tokens, [$self->{_spacing_normal}, $uri];
109	8					46	$got_word = 1;
110							}
111							# special words for which we preserve case
112							elsif (!$got_word && $input =~ s/ ^ (?<word> $CASED_WORD )//xo) {
113	313			3		7590	push @tokens, [$self->{_spacing_normal}, $+{word}];
	3					67976
	3					1231
	3					2038
114	313					2023	$got_word = 1;
115							}
116							# normal words
117							elsif ($input =~ / ^ $WORD /xo) {
118	26814					104493	my $word;
119
120							# special case to allow matching q{ridin'} as one word, even when
121							# it appears as q{"ridin'"}, but not as q{'ridin'}
122	26814	100				70847	my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
123	26814	100	100			148053	if (!@tokens && $input =~ s/ ^ (?<word>$WORD_APOST) //xo
			66
			66
124							\|\| $last_char =~ / ^ $APOSTROPHE $ /xo
125							&& $input =~ s/ ^ (?<word>$WORD_APOST) (?<! $last_char ) //xo) {
126	21					161	$word = $+{word};
127							}
128							else {
129	26793	50				203141	$input =~ s/^($WORD)//o and $word = $1;
130							}
131
132							# Maybe preserve the casing of this word
133	26814	100	100			247301	$word = lc $word
			100
134							if $word ne uc $word
135							# Mixed-case words like "WoW" or "ATMs"
136							and $word !~ $MIXED_CASE
137							# Words that are upper case followed by a non-word character.
138							and $word !~ $UPPER_NONW;
139
140	26814					88195	push @tokens, [$self->{_spacing_normal}, $word];
141	26814					155208	$got_word = 1;
142							}
143							# everything else
144							elsif ($input =~ s/ ^ (?<non_word> $NON_WORD ) //xo) {
145	7596					47311	my $non_word = $+{non_word};
146	7596					18786	my $spacing = $self->{_spacing_normal};
147
148							# was the previous token a word?
149	7596	100				17723	if ($got_word) {
		100
150							$spacing = $input =~ /^$NONSPACE/
151							? $self->{_spacing_infix}
152	5646	100				19011	: $self->{_spacing_postfix};
153							}
154							# do we still have more tokens?
155							elsif ($input =~ /^$NONSPACE/) {
156	651					1361	$spacing = $self->{_spacing_prefix};
157							}
158
159	7596					45422	push @tokens, [$spacing, $non_word];
160							}
161							}
162							}
163
164	5117					17566	return \@tokens;
165							}
166
167							# tokens -> output
168							sub make_output {
169	1299			1299	0	115823	my ($self, $tokens) = @_;
170	1299					2145	my $reply = '';
171
172	1299					1886	for my $pos (0 .. $#{ $tokens }) {
	1299					3595
173	13926					17305	my ($spacing, $text) = @{ $tokens->[$pos] };
	13926					24467
174	13926					20344	$reply .= $text;
175
176							# append whitespace if this is not a prefix token or infix token,
177							# and this is not the last token, and the next token is not
178							# a postfix/infix token
179	13926	100	100			16585	if ($pos != $#{ $tokens }
	13926		100			52535
			66
			100
180							&& $spacing != $self->{_spacing_prefix}
181							&& $spacing != $self->{_spacing_infix}
182							&& !($pos < $#{ $tokens }
183							&& ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
184							\|\| $tokens->[$pos+1][0] == $self->{_spacing_infix})
185							)
186							) {
187	9935					20679	$reply .= ' ';
188							}
189							}
190
191							# capitalize the first word
192	1299					36744	$reply =~ s/^\s$OPEN_QUOTE?\s\K($SPLIT_WORD)(?=$ELLIPSIS\|(?:(?:$CLOSE_QUOTE\|$TERMINATOR\|$ADDRESS\|$PUNCTUATION+)?(?:\s\|$)))/\u$1/o;
193
194							# capitalize the second word
195	1299					41542	$reply =~ s/^\s$OPEN_QUOTE?\s$SPLIT_WORD(?:(?:\s*$TERMINATOR\|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;
196
197							# capitalize all other words after word boundaries
198							# we do it in two passes because we need to match two words at a time
199	1299					95891	$reply =~ s/(?:$ELLIPSIS\|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/$SEPARATOR\u$1$SEPARATOR/go;
200	1299					35986	$reply =~ s/$SEPARATOR$WORD_STRICT$SEPARATOR$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
201	1299					3690	$reply =~ s/$SEPARATOR//go;
202
203							# end paragraphs with a period when it makes sense
204	1299					111908	$reply =~ s/(?:$ELLIPSIS\|\s+\|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;
205
206							# capitalize I'm, I've...
207	1299					18252	$reply =~ s{(?:(?:$ELLIPSIS\|\s+)\|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;
208
209	1299					9573	return $reply;
210							}
211
212							__PACKAGE__->meta->make_immutable;
213
214							=encoding utf8
215
216							=head1 NAME
217
218							Hailo::Tokenizer::Words - A tokenizer for L<Hailo\|Hailo> which splits
219							on whitespace and word boundaries, mostly.
220
221							=head1 DESCRIPTION
222
223							This tokenizer does its best to handle various languages. It knows about most
224							apostrophes, quotes, and sentence terminators.
225
226							=head1 AUTHOR
227
228							Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com
229
230							=head1 LICENSE AND COPYRIGHT
231
232							Copyright 2010 Hinrik E<Ouml>rn SigurE<eth>sson
233
234							This program is free software, you can redistribute it and/or modify
235							it under the same terms as Perl itself.
236
237							=cut