| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Hailo::Role::Tokenizer; |
|
2
|
|
|
|
|
|
|
our $AUTHORITY = 'cpan:AVAR'; |
|
3
|
|
|
|
|
|
|
$Hailo::Role::Tokenizer::VERSION = '0.75'; |
|
4
|
30
|
|
|
30
|
|
20877
|
use v5.10.0; |
|
|
30
|
|
|
|
|
120
|
|
|
5
|
30
|
|
|
30
|
|
221
|
use Moose::Role; |
|
|
30
|
|
|
|
|
69
|
|
|
|
30
|
|
|
|
|
263
|
|
|
6
|
30
|
|
|
30
|
|
109015
|
use MooseX::Types::Moose ':all'; |
|
|
30
|
|
|
|
|
68
|
|
|
|
30
|
|
|
|
|
288
|
|
|
7
|
30
|
|
|
30
|
|
245541
|
use namespace::clean -except => 'meta'; |
|
|
30
|
|
|
|
|
76
|
|
|
|
30
|
|
|
|
|
287
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
has spacing => ( |
|
10
|
|
|
|
|
|
|
isa => HashRef[Int], |
|
11
|
|
|
|
|
|
|
is => 'rw', |
|
12
|
|
|
|
|
|
|
default => sub { { |
|
13
|
|
|
|
|
|
|
normal => 0, |
|
14
|
|
|
|
|
|
|
prefix => 1, |
|
15
|
|
|
|
|
|
|
postfix => 2, |
|
16
|
|
|
|
|
|
|
infix => 3, |
|
17
|
|
|
|
|
|
|
} }, |
|
18
|
|
|
|
|
|
|
); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub BUILD { |
|
21
|
262
|
|
|
262
|
0
|
570
|
my ($self) = @_; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# This performance hack is here because calling |
|
24
|
|
|
|
|
|
|
# $self->spacing->{...} was a significant part of Tokenizer execution |
|
25
|
|
|
|
|
|
|
# time (~20s / ~1200s) since we're doing one method call and a |
|
26
|
|
|
|
|
|
|
# hash dereference |
|
27
|
|
|
|
|
|
|
|
|
28
|
262
|
|
|
|
|
7401
|
my $spacing = $self->spacing; |
|
29
|
262
|
|
|
|
|
1115
|
while (my ($k, $v) = each %$spacing) { |
|
30
|
1048
|
|
|
|
|
3339
|
$self->{"_spacing_$k"} = $v; |
|
31
|
|
|
|
|
|
|
} |
|
32
|
|
|
|
|
|
|
|
|
33
|
262
|
|
|
|
|
6619
|
return; |
|
34
|
|
|
|
|
|
|
} |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
requires 'make_tokens'; |
|
37
|
|
|
|
|
|
|
requires 'make_output'; |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
1; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=encoding utf8 |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 NAME |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Hailo::Role::Tokenizer - A role representing a L<Hailo|Hailo> tokenizer |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 METHODS |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 C<new> |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
This is the constructor. It takes no arguments. |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head2 C<make_tokens> |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Takes a line of input and returns an array reference of tokens. A token is |
|
56
|
|
|
|
|
|
|
an array reference containing two elements: a I<spacing attribute> and the |
|
57
|
|
|
|
|
|
|
I<token text>. The spacing attribute is an integer which will be stored along |
|
58
|
|
|
|
|
|
|
with the token text in the database. The following values are currently being |
|
59
|
|
|
|
|
|
|
used: |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=over |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item C<0> - normal token |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=item C<1> - prefix token (no whitespace follows it) |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item C<2> - postfix token (no whitespace precedes it) |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=item C<3> - infix token (no whitespace follows or precedes it) |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=back |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head2 C<make_output> |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Takes an array reference of tokens and returns a line of output. A token is |
|
76
|
|
|
|
|
|
|
an array reference as described in L<C<make_tokens>|/make_tokens>. The tokens |
|
77
|
|
|
|
|
|
|
will be joined together into a sentence according to the spacing |
|
78
|
|
|
|
|
|
|
attributes associated with the tokens, as well as any formatting provided by |
|
79
|
|
|
|
|
|
|
the tokenizer implementation. |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head1 AUTHORS |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason <avar@cpan.org> |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Copyright 2010 Hinrik E<Ouml>rn SigurE<eth>sson and |
|
90
|
|
|
|
|
|
|
E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason <avar@cpan.org> |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
This program is free software, you can redistribute it and/or modify |
|
93
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=cut |