File Coverage

blib/lib/Hailo/Tokenizer/Words.pm

Criterion	Covered	Total	%
statement	13	13	100.0
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	18	100.0

line	stmt	sub	time	code
1				package Hailo::Tokenizer::Words;
2				BEGIN {
3	1	1	21560	$Hailo::Tokenizer::Words::AUTHORITY = 'cpan:AVAR';
4				}
5				{
6				$Hailo::Tokenizer::Words::VERSION = '0.72';
7				}
8
9	1	1	24	use 5.010;
	1		4
	1		33
10	1	1	5	use utf8;
	1		2
	1		6
11	1	1	801	use Any::Moose;
	1		34877
	1		8
12	1	1	608	use Any::Moose 'X::StrictConstructor';
	1		1
	1		6
13				use Regexp::Common qw/ URI /;
14				use namespace::clean -except => 'meta';
15
16				with qw(Hailo::Role::Arguments
17				Hailo::Role::Tokenizer);
18
19				# [[:alpha:]] doesn't match combining characters on Perl >=5.12
20				my $ALPHABET = qr/(?![_\d])\w/;
21
22				# tokenization
23				my $SPACE = qr/\s/;
24				my $NONSPACE = qr/\S/;
25				my $DASH = qr/[–-]/;
26				my $POINT = qr/[.,]/;
27				my $APOSTROPHE = qr/['’´]/;
28				my $ELLIPSIS = qr/\.{2,}\|…/;
29				my $NON_WORD = qr/[^\w\s]+/;
30				my $BARE_WORD = qr/\w+/;
31				my $CURRENCY = qr/[¤¥¢£\$]/;
32				my $NUMBER = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)(?:$CURRENCY\|$ALPHABET+)?\|$CURRENCY?\d+(?:$POINT\d+)(?:$CURRENCY\|$ALPHABET+)?(?!\d\|$ALPHABET)/;
33				my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
34				my $ABBREV = qr/$ALPHABET(?:\.$ALPHABET)+\./;
35				my $DOTTED = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
36				my $WORD_TYPES = qr/$NUMBER\|$ABBREV\|$DOTTED\|$APOST_WORD\|$BARE_WORD/;
37				my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET\|$NUMBER)/;
38				my $WORD = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+\|$DASH(?!$DASH))?/;
39				my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} \| \p{Upper}{2,} \p{Lower} /x;
40				my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?
41
42				# special tokens
43				my $TWAT_NAME = qr/ \@ [A-Za-z0-9_]+ /x;
44				my $EMAIL = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ (?: \. [A-Z]{2,4} )* /xi;
45				my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* \| \w+ (?: :: \w+ )+ ) (?: :: )? \| \w+ :: /x;
46				my $EXTRA_URI = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
47				my $ESC_SPACE = qr/(?:\\ )+/;
48				my $NAME = qr/(?:$BARE_WORD\|$ESC_SPACE)+/;
49				my $FILENAME = qr/ $NAME? \. $NAME (?: \. $NAME )* \| $NAME/x;
50				my $UNIX_PATH = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
51				my $WIN_PATH = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
52				my $PATH = qr/$UNIX_PATH\|$WIN_PATH/;
53				my $DATE = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
54				my $TIME = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z\| ?[AP]M\|[-+±][0-9]{2}(?::?[0-9]{2})?)?/i;
55				my $DATETIME = qr/${DATE}T$TIME/;
56				my $IRC_NICK = qr/<(?: \|[&~]?[@%+~&])?[A-Za-z_`\-^\\|\\\{}\[\]][A-Za-z_0-9`\-^\\|\\\{}\[\]]+>/;
57				my $IRC_CHAN = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
58				my $NUMERO = qr/#[0-9]+/;
59				my $CLOSE_TAG = qr{};
60
61				my $CASED_WORD = qr/$CLOSE_TAG\|$IRC_NICK\|$IRC_CHAN\|$DATETIME\|$DATE\|$TIME\|$PERL_CLASS\|$EXTRA_URI\|$EMAIL\|$TWAT_NAME\|$PATH\|$NUMERO/;
62
63				# capitalization
64				# The rest of the regexes are pretty hairy. The goal here is to catch the
65				# most common cases where a word should be capitalized. We try hard to
66				# guard against capitalizing things which don't look like proper words.
67				# Examples include URLs and code snippets.
68				my $OPEN_QUOTE = qr/['"‘“„«»「『‹‚]/;
69				my $CLOSE_QUOTE = qr/['"’“”«»」』›‘]/;
70				my $TERMINATOR = qr/(?:[?!‽]+\|(?
71				my $ADDRESS = qr/:/;
72				my $PUNCTUATION = qr/[?!‽,;.:]/;
73				my $BOUNDARY = qr/$CLOSE_QUOTE?(?:\s$TERMINATOR\|$ADDRESS)\s+$OPEN_QUOTE?\s/;
74				my $LOOSE_WORD = qr/$IRC_CHAN\|$DATETIME\|$DATE\|$TIME\|$PATH\|$NUMBER\|$ABBREV\|$APOST_WORD\|$NUMERO\|$BARE_WORD(?:$DASH(?:$WORD_TYPES\|$BARE_WORD)\|$APOSTROPHE(?!$ALPHABET\|$NUMBER\|$APOSTROPHE)\|$DASH(?!$DASH{2}))*/;
75				my $SPLIT_WORD = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+\|$)\|$CLOSE_QUOTE\|$TERMINATOR\|\s+\|$)};
76
77				# we want to capitalize words that come after "On example.com?"
78				# or "You mean 3.2?", but not "Yes, e.g."
79				my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+\|\w{2,}))?/;
80				my $WORD_STRICT = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;
81
82				# This string is added to (and later removed from) the output string when
83				# capitalizing it in multiple passes. We use backspace, because that is
84				# unlikely to be in the input. This dirty approach can probably be replaced
85				# with regex grammars, but I haven't bothered to learn to use those.
86				my $SEPARATOR = "\x08";
87
88				# input -> tokens
89				sub make_tokens {
90				my ($self, $input) = @_;
91
92				my @tokens;
93				$input =~ s/$DASH\K\s\n+\s//;
94				$input =~ s/\s\n+\s/ /gm;
95
96				while (length $input) {
97				# remove the next chunk of whitespace
98				$input =~ s/^$SPACE+//;
99				my $got_word;
100
101				while (length $input && $input =~ /^$NONSPACE/) {
102				# We convert it to ASCII and then look for a URI because $RE{URI}
103				# from Regexp::Common doesn't support non-ASCII domain names
104				my ($ascii) = $input =~ /^($NONSPACE+)/;
105				$ascii =~ s/[^[:ascii:]]/a/g;
106
107				# URIs
108				if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
109				my $uri_end = $+[0];
110				my $uri = substr $input, 0, $uri_end;
111				$input =~ s/^\Q$uri//;
112
113				push @tokens, [$self->{_spacing_normal}, $uri];
114				$got_word = 1;
115				}
116				# special words for which we preserve case
117				elsif (!$got_word && $input =~ s/ ^ (? $CASED_WORD )//xo) {
118				push @tokens, [$self->{_spacing_normal}, $+{word}];
119				$got_word = 1;
120				}
121				# normal words
122				elsif ($input =~ / ^ $WORD /xo) {
123				my $word;
124
125				# special case to allow matching q{ridin'} as one word, even when
126				# it appears as q{"ridin'"}, but not as q{'ridin'}
127				my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
128				if (!@tokens && $input =~ s/ ^ (?$WORD_APOST) //xo
129				\|\| $last_char =~ / ^ $APOSTROPHE $ /xo
130				&& $input =~ s/ ^ (?$WORD_APOST) (?
131				$word = $+{word};
132				}
133				else {
134				$input =~ s/^($WORD)//o and $word = $1;
135				}
136
137				# Maybe preserve the casing of this word
138				$word = lc $word
139				if $word ne uc $word
140				# Mixed-case words like "WoW" or "ATMs"
141				and $word !~ $MIXED_CASE
142				# Words that are upper case followed by a non-word character.
143				and $word !~ $UPPER_NONW;
144
145				push @tokens, [$self->{_spacing_normal}, $word];
146				$got_word = 1;
147				}
148				# everything else
149				elsif ($input =~ s/ ^ (? $NON_WORD ) //xo) {
150				my $non_word = $+{non_word};
151				my $spacing = $self->{_spacing_normal};
152
153				# was the previous token a word?
154				if ($got_word) {
155				$spacing = $input =~ /^$NONSPACE/
156				? $self->{_spacing_infix}
157				: $self->{_spacing_postfix};
158				}
159				# do we still have more tokens?
160				elsif ($input =~ /^$NONSPACE/) {
161				$spacing = $self->{_spacing_prefix};
162				}
163
164				push @tokens, [$spacing, $non_word];
165				}
166				}
167				}
168
169				return \@tokens;
170				}
171
172				# tokens -> output
173				sub make_output {
174				my ($self, $tokens) = @_;
175				my $reply = '';
176
177				for my $pos (0 .. $#{ $tokens }) {
178				my ($spacing, $text) = @{ $tokens->[$pos] };
179				$reply .= $text;
180
181				# append whitespace if this is not a prefix token or infix token,
182				# and this is not the last token, and the next token is not
183				# a postfix/infix token
184				if ($pos != $#{ $tokens }
185				&& $spacing != $self->{_spacing_prefix}
186				&& $spacing != $self->{_spacing_infix}
187				&& !($pos < $#{ $tokens }
188				&& ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
189				\|\| $tokens->[$pos+1][0] == $self->{_spacing_infix})
190				)
191				) {
192				$reply .= ' ';
193				}
194				}
195
196				# capitalize the first word
197				$reply =~ s/^\s$OPEN_QUOTE?\s\K($SPLIT_WORD)(?=$ELLIPSIS\|(?:(?:$CLOSE_QUOTE\|$TERMINATOR\|$ADDRESS\|$PUNCTUATION+)?(?:\s\|$)))/\u$1/o;
198
199				# capitalize the second word
200				$reply =~ s/^\s$OPEN_QUOTE?\s$SPLIT_WORD(?:(?:\s*$TERMINATOR\|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;
201
202				# capitalize all other words after word boundaries
203				# we do it in two passes because we need to match two words at a time
204				$reply =~ s/(?:$ELLIPSIS\|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/$SEPARATOR\u$1$SEPARATOR/go;
205				$reply =~ s/$SEPARATOR$WORD_STRICT$SEPARATOR$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
206				$reply =~ s/$SEPARATOR//go;
207
208				# end paragraphs with a period when it makes sense
209				$reply =~ s/(?:$ELLIPSIS\|\s+\|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;
210
211				# capitalize I'm, I've...
212				$reply =~ s{(?:(?:$ELLIPSIS\|\s+)\|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;
213
214				return $reply;
215				}
216
217				__PACKAGE__->meta->make_immutable;
218
219				=encoding utf8
220
221				=head1 NAME
222
223				Hailo::Tokenizer::Words - A tokenizer for L which splits
224				on whitespace and word boundaries, mostly.
225
226				=head1 DESCRIPTION
227
228				This tokenizer does its best to handle various languages. It knows about most
229				apostrophes, quotes, and sentence terminators.
230
231				=head1 AUTHOR
232
233				Hinrik Ern SigurEsson, hinrik.sig@gmail.com
234
235				=head1 LICENSE AND COPYRIGHT
236
237				Copyright 2010 Hinrik Ern SigurEsson
238
239				This program is free software, you can redistribute it and/or modify
240				it under the same terms as Perl itself.
241
242				=cut