line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WordLists::Tag::Tagger;
|
2
|
2
|
|
|
2
|
|
127055
|
use strict;
|
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
69
|
|
3
|
2
|
|
|
2
|
|
11
|
use warnings;
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
55
|
|
4
|
2
|
|
|
2
|
|
1018
|
use utf8;
|
|
2
|
|
|
|
|
13
|
|
|
2
|
|
|
|
|
10
|
|
5
|
2
|
|
|
2
|
|
562
|
use WordLists::Common qw(/generic/);
|
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
494
|
|
6
|
2
|
|
|
2
|
|
2667
|
use Lingua::EN::Tagger;
|
|
2
|
|
|
|
|
222434
|
|
|
2
|
|
|
|
|
96
|
|
7
|
2
|
|
|
2
|
|
676
|
use WordLists::WordList;
|
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
60
|
|
8
|
2
|
|
|
2
|
|
11
|
use WordLists::Sense;
|
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
42
|
|
9
|
2
|
|
|
2
|
|
10
|
use WordLists::Base;
|
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
414
|
|
10
|
|
|
|
|
|
|
our $VERSION = $WordLists::Base::VERSION;
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $AUTOLOAD;
|
13
|
|
|
|
|
|
|
our @ignore_pos_codes = qw(cd to prp prps sym pp pps ppr lrb rrb ppc ppl );
|
14
|
|
|
|
|
|
|
sub _norm_word($)
|
15
|
|
|
|
|
|
|
{
|
16
|
11
|
|
|
11
|
|
21
|
my $s = shift;
|
17
|
11
|
|
|
|
|
21
|
$s=~s/\.//g;
|
18
|
11
|
|
|
|
|
17
|
$s=~s/\/.*//;
|
19
|
11
|
|
|
|
|
18
|
$s=~s/['’]s$//;
|
20
|
11
|
|
|
|
|
22
|
$s=~s/['‘’`"“”]$//;
|
21
|
11
|
|
|
|
|
19
|
$s=~s/^['‘’`"“”]//;
|
22
|
11
|
|
|
|
|
35
|
$s=~tr/…–\xA0\*//d;
|
23
|
11
|
|
|
|
|
30
|
return lc $s;
|
24
|
|
|
|
|
|
|
}
|
25
|
|
|
|
|
|
|
sub human_pos($)
|
26
|
|
|
|
|
|
|
{
|
27
|
29
|
|
|
29
|
0
|
47
|
my $sPos = shift;
|
28
|
29
|
|
|
|
|
58
|
$sPos =~ s<^nn.*$>
|
29
|
|
|
|
|
|
|
;
|
30
|
29
|
|
|
|
|
44
|
$sPos =~ s<^in$>
|
31
|
|
|
|
|
|
|
;
|
32
|
29
|
|
|
|
|
41
|
$sPos =~ s<^to$>
|
33
|
|
|
|
|
|
|
;
|
34
|
29
|
|
|
|
|
59
|
$sPos =~ s<^jj.*$>
|
35
|
|
|
|
|
|
|
;
|
36
|
29
|
|
|
|
|
48
|
$sPos =~ s<^md.*$>
|
37
|
|
|
|
|
|
|
;
|
38
|
29
|
|
|
|
|
35
|
$sPos =~ s<^vb.*$>
|
39
|
|
|
|
|
|
|
;
|
40
|
29
|
|
|
|
|
30
|
$sPos =~ s<^rb.*$>
|
41
|
|
|
|
|
|
|
;
|
42
|
29
|
|
|
|
|
56
|
$sPos =~ s<^det$>
|
43
|
|
|
|
|
|
|
;
|
44
|
29
|
|
|
|
|
36
|
$sPos =~ s<^cc$>
|
45
|
|
|
|
|
|
|
;
|
46
|
29
|
|
|
|
|
34
|
$sPos =~ s<^wrb$>
|
47
|
|
|
|
|
|
|
;
|
48
|
29
|
|
|
|
|
34
|
$sPos =~ s<^wdt$>
|
49
|
|
|
|
|
|
|
;
|
50
|
29
|
|
|
|
|
36
|
$sPos =~ s<^wp$>
|
51
|
|
|
|
|
|
|
;
|
52
|
29
|
|
|
|
|
36
|
$sPos =~ s<^prp.*$>
|
53
|
|
|
|
|
|
|
;
|
54
|
29
|
|
|
|
|
32
|
$sPos =~ s<^cd.*$>
|
55
|
|
|
|
|
|
|
;
|
56
|
29
|
|
|
|
|
37
|
$sPos =~ s<^uh$>
|
57
|
|
|
|
|
|
|
;
|
58
|
29
|
|
|
|
|
142
|
return $sPos;
|
59
|
|
|
|
|
|
|
}
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub new
|
63
|
|
|
|
|
|
|
{
|
64
|
1
|
|
|
1
|
0
|
14
|
my ($class, $args) = @_;
|
65
|
1
|
|
50
|
|
|
11
|
$args ||={};
|
66
|
1
|
|
|
|
|
12
|
my $self = {
|
67
|
|
|
|
|
|
|
tagger=>Lingua::EN::Tagger->new(%$args),
|
68
|
|
|
|
|
|
|
};
|
69
|
1
|
|
|
|
|
237810
|
bless ($self, $class);
|
70
|
|
|
|
|
|
|
}
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
1;
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub add_tags
|
75
|
|
|
|
|
|
|
{
|
76
|
0
|
|
|
0
|
0
|
0
|
my ($self, $sMS) = @_;
|
77
|
0
|
|
|
|
|
0
|
my $taggedMS = $self->{tagger}->add_tags( $sMS );
|
78
|
0
|
|
|
|
|
0
|
return $taggedMS;
|
79
|
|
|
|
|
|
|
}
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub add_human_tags
|
82
|
|
|
|
|
|
|
{
|
83
|
3
|
|
|
3
|
0
|
7
|
my ($self, $sMS) = @_;
|
84
|
3
|
|
|
|
|
10
|
my $sMSOUT = '';
|
85
|
3
|
|
|
|
|
5
|
foreach my $sSentence (@{$self->{tagger}->get_sentences($sMS)})
|
|
3
|
|
|
|
|
19
|
|
86
|
|
|
|
|
|
|
{
|
87
|
3
|
|
|
|
|
15857
|
$sSentence =~ tr/<>&//d;
|
88
|
3
|
|
|
|
|
16
|
my $taggedSentence = $self->{tagger}->add_tags( $sSentence );
|
89
|
3
|
100
|
|
|
|
3577
|
$taggedSentence =~ s`<([a-z]+)>([^<]+)\1>`human_pos($1) ne $1 ? qq($2) : $2;`ge;
|
|
11
|
|
|
|
|
26
|
|
90
|
3
|
|
|
|
|
140
|
$sMSOUT .= " $taggedSentence ";
|
91
|
|
|
|
|
|
|
}
|
92
|
3
|
|
|
|
|
31
|
return $sMSOUT;
|
93
|
|
|
|
|
|
|
}
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub get_wordlist
|
96
|
|
|
|
|
|
|
{
|
97
|
1
|
|
|
1
|
1
|
4
|
my ($self, $sUntagged, $args) = @_;
|
98
|
|
|
|
|
|
|
|
99
|
1
|
|
|
|
|
2
|
my $wl;
|
100
|
1
|
50
|
33
|
|
|
14
|
if (defined $args->{'wl'} and ref $args->{'wl'} eq 'WordLists::WordList')
|
101
|
|
|
|
|
|
|
{
|
102
|
0
|
|
|
|
|
0
|
$wl=$args->{'wl'};
|
103
|
|
|
|
|
|
|
}
|
104
|
|
|
|
|
|
|
else
|
105
|
|
|
|
|
|
|
{
|
106
|
1
|
|
|
|
|
13
|
$wl = WordLists::WordList->new;
|
107
|
|
|
|
|
|
|
}
|
108
|
1
|
|
|
|
|
2
|
foreach my $sSentence (@{$self->{tagger}->get_sentences($sUntagged)})
|
|
1
|
|
|
|
|
6
|
|
109
|
|
|
|
|
|
|
{
|
110
|
1
|
|
|
|
|
3039
|
my $taggedMS;
|
111
|
1
|
|
|
|
|
5
|
$taggedMS = $self->{tagger}->add_tags( $sSentence );
|
112
|
1
|
|
|
|
|
2614
|
while ($taggedMS =~ m`<([a-z]+)>([^<]+)\1>`g)
|
113
|
|
|
|
|
|
|
{
|
114
|
|
|
|
|
|
|
#print "\n$1\t$2";
|
115
|
11
|
|
|
|
|
30
|
my $sHW = _norm_word($2);
|
116
|
11
|
|
|
|
|
18
|
my $sPosCode = $1;
|
117
|
11
|
|
|
|
|
21
|
my $bNext;
|
118
|
11
|
|
|
|
|
22
|
foreach (@ignore_pos_codes) #
|
119
|
|
|
|
|
|
|
{
|
120
|
125
|
100
|
|
|
|
239
|
if ($sPosCode eq $_)
|
121
|
|
|
|
|
|
|
{
|
122
|
2
|
|
|
|
|
3
|
$bNext++;
|
123
|
2
|
|
|
|
|
5
|
last;
|
124
|
|
|
|
|
|
|
}
|
125
|
|
|
|
|
|
|
}
|
126
|
11
|
100
|
|
|
|
32
|
next if $bNext;
|
127
|
9
|
|
|
|
|
21
|
my $sPos = human_pos($sPosCode);
|
128
|
9
|
|
|
|
|
43
|
my $sense = WordLists::Sense->new();
|
129
|
9
|
|
|
|
|
60
|
$sense->set_hw($sHW);
|
130
|
9
|
|
|
|
|
52
|
$sense->set_pos($sPos);
|
131
|
9
|
|
|
|
|
49
|
$sense->set_eg($sSentence);
|
132
|
9
|
|
|
|
|
46
|
$sense->set_poscode($sPosCode);
|
133
|
9
|
50
|
33
|
0
|
|
34
|
if (defined $args->{'callback_on_make_sense'} and ref $args->{'callback_on_make_sense'} eq ref sub{})
|
|
0
|
|
|
|
|
0
|
|
134
|
|
|
|
|
|
|
{
|
135
|
0
|
|
|
|
|
0
|
&{$args->{'callback_on_make_sense'}}($sense);
|
|
0
|
|
|
|
|
0
|
|
136
|
|
|
|
|
|
|
}
|
137
|
9
|
|
|
|
|
57
|
my @senses = $wl->get_senses_for($sense->get_hw, $sense->get_pos);
|
138
|
9
|
100
|
66
|
|
|
53
|
if (@senses and !$args->{'keep_repeats'})
|
139
|
|
|
|
|
|
|
{
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
}
|
142
|
|
|
|
|
|
|
else
|
143
|
|
|
|
|
|
|
{
|
144
|
8
|
50
|
33
|
0
|
|
38
|
if (defined $args->{'test_sense_before_add'} and ref $args->{'test_sense_before_add'} eq ref sub {} and !&{$args->{'test_sense_before_add'}}($sense))
|
|
0
|
|
33
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
145
|
|
|
|
|
|
|
{
|
146
|
|
|
|
|
|
|
}
|
147
|
|
|
|
|
|
|
else
|
148
|
|
|
|
|
|
|
{
|
149
|
8
|
50
|
33
|
0
|
|
29
|
if (defined $args->{'callback_on_add_sense'} and ref $args->{'callback_on_add_sense'} eq ref sub{})
|
|
0
|
|
|
|
|
0
|
|
150
|
|
|
|
|
|
|
{
|
151
|
0
|
|
|
|
|
0
|
&{$args->{'callback_on_add_sense'}}($sense);
|
|
0
|
|
|
|
|
0
|
|
152
|
|
|
|
|
|
|
}
|
153
|
8
|
|
|
|
|
26
|
$wl->add_sense($sense);
|
154
|
|
|
|
|
|
|
}
|
155
|
|
|
|
|
|
|
}
|
156
|
|
|
|
|
|
|
};
|
157
|
|
|
|
|
|
|
}
|
158
|
1
|
|
|
|
|
9
|
return $wl;
|
159
|
|
|
|
|
|
|
}
|
160
|
|
|
|
|
|
|
=pod
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head1 NAME
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
WordLists::Tag::Tagger
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
my $tagger = WordLists::Tag::Tagger->new();
|
169
|
|
|
|
|
|
|
my $wl = $tagger->get_wordlist('The quick brown fox jumped over the lazy dog');
|
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
Uses L to do various things with strings, chielfly to create a L out of a document, e.g. to use as a basis for a glossary.
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 METHODS
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head3 get_wordlist
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Uses L to create a L out of a string (e.g. a manuscript).
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
L allows splitting into sentences, and these sentences become C fields in the Ls generated.
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
Only the first instance of each headword / part of speech combination is entered into the list, unless the third argument has a key C with a true value.
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
The fields populated are: C, C, C, and C, which is the original part of speech code outputted by the tagger.
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
The third argument is a hashref which allows you to configure several options.
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
C should be a coderef. It is passed the sense immediately before it is added to the wordlist, e.g. if you want to add a unit.
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
C should be a coderef. It is passed the sense before the wordlist is tested for inclusion. This is an opportunity to further normalise parts of speech.
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
C is a flag, which, if set, prevents the code from removing repetitions.
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head1 TODO
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
C is a wordlist or arrayref whose elements are L objects or plain hashrefs of the form C<< {hw=>'head', pos =>'n'} >>, or headwords as strings. If these words are found, they are not added to the list. (not yet implemented!)
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head1 BUGS
|
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Please use the Github issues tracker.
|
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 LICENSE
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |