line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::StemTagPOS; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require 5.006002; |
4
|
3
|
|
|
3
|
|
155992
|
use strict; |
|
3
|
|
|
|
|
11
|
|
|
3
|
|
|
|
|
149
|
|
5
|
3
|
|
|
3
|
|
17
|
use warnings; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
180
|
|
6
|
3
|
|
|
3
|
|
18
|
use Carp; |
|
3
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
553
|
|
7
|
3
|
|
|
3
|
|
5432
|
use Encode; |
|
3
|
|
|
|
|
93626
|
|
|
3
|
|
|
|
|
354
|
|
8
|
3
|
|
|
3
|
|
3191
|
use Lingua::Stem::Snowball; |
|
3
|
|
|
|
|
8033
|
|
|
3
|
|
|
|
|
206
|
|
9
|
3
|
|
|
3
|
|
4021
|
use Lingua::EN::Tagger; |
|
3
|
|
|
|
|
529363
|
|
|
3
|
|
|
|
|
180
|
|
10
|
3
|
|
|
3
|
|
5353
|
use Data::Dump qw(dump); |
|
3
|
|
|
|
|
59664
|
|
|
3
|
|
|
|
|
407
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
BEGIN { |
13
|
3
|
|
|
3
|
|
37
|
use Exporter (); |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
77
|
|
14
|
3
|
|
|
3
|
|
17
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
368
|
|
15
|
3
|
|
|
3
|
|
31
|
$VERSION = '0.61'; |
16
|
3
|
|
|
|
|
66
|
@ISA = qw(Exporter); |
17
|
3
|
|
|
|
|
7
|
@EXPORT = qw(); |
18
|
3
|
|
|
|
|
7
|
@EXPORT_OK = qw(); |
19
|
3
|
|
|
|
|
503
|
%EXPORT_TAGS = (); |
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
|
22
|
3
|
|
|
3
|
|
21
|
use constant WORD_STEMMED => 0; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
226
|
|
23
|
3
|
|
|
3
|
|
16
|
use constant WORD_ORIGINAL => 1; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
130
|
|
24
|
3
|
|
|
3
|
|
15
|
use constant WORD_POSTAG => 2; |
|
3
|
|
|
|
|
14
|
|
|
3
|
|
|
|
|
610
|
|
25
|
3
|
|
|
3
|
|
18
|
use constant WORD_INDEX => 3; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
141
|
|
26
|
3
|
|
|
3
|
|
17
|
use constant WORD_CHAR_POSITION => 4; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
123
|
|
27
|
3
|
|
|
3
|
|
16
|
use constant WORD_CHAR_LENGTH => 5; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
119
|
|
28
|
3
|
|
|
3
|
|
15
|
use constant WORD_SENTENCE_ID => 6; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
148
|
|
29
|
3
|
|
|
3
|
|
549
|
use constant WORD_USER_DEFINED => 7; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
148
|
|
30
|
|
|
|
|
|
|
|
31
|
3
|
|
|
3
|
|
15
|
use constant POSTAGS_PERIOD => 'PP'; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
144
|
|
32
|
3
|
|
|
3
|
|
17
|
use constant POSTAGS_PUNCTUATION => qw(PP PGP PPC PPD PPL PPR PPS LRB RRB SYM); |
|
3
|
|
|
|
|
463
|
|
|
3
|
|
|
|
|
220
|
|
33
|
3
|
|
|
3
|
|
19
|
use constant POSTAGS_NOUN => qw(NN NNP NNPS NNS); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
160
|
|
34
|
3
|
|
|
3
|
|
16
|
use constant POSTAGS_ADJECTIVE => qw(CD JJ JJR JJS); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
298
|
|
35
|
3
|
|
|
3
|
|
16
|
use constant POSTAGS_VERB => qw(VB VBD VBG VBN VBP VBZ); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
177
|
|
36
|
3
|
|
|
3
|
|
24
|
use constant POSTAGS_ADVERB => qw(RB RBR RBS RP WRB); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
153
|
|
37
|
3
|
|
|
3
|
|
17
|
use constant POSTAGS_CONTENT_ADVERB => qw(RBR RBS RP); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
730
|
|
38
|
3
|
|
|
3
|
|
19
|
use constant POSTAGS_ALL => qw(CC CD DET EX FW IN JJ JJR JJS LS MD NN NNP NNPS NNS PDT POS PRP PRPS RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WPS WRB PP PGP PPC PPD PPL PPR PPS LRB RRB); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
323
|
|
39
|
3
|
|
|
3
|
|
18
|
use constant POSTAGS_CONTENT => (POSTAGS_CONTENT_ADVERB, POSTAGS_VERB, POSTAGS_ADJECTIVE, POSTAGS_NOUN); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
643
|
|
40
|
3
|
|
|
3
|
|
18
|
use constant POSTAGS_TEXTRANK => (POSTAGS_NOUN, POSTAGS_ADJECTIVE); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
39782
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 NAME |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
C - Computes stemmed/POS tagged lists of text. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 SYNOPSIS |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
use Text::StemTagPOS; |
49
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
50
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
51
|
|
|
|
|
|
|
my $text = 'The first sentence. Sentence number two.'; |
52
|
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text); |
53
|
|
|
|
|
|
|
dump $listOfStemmedTaggedSentences; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head1 DESCRIPTION |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
C uses the modules L and L |
58
|
|
|
|
|
|
|
to do part-of-speech tagging and stemming of English text. It was developed |
59
|
|
|
|
|
|
|
to pre-process text for other modules. Encoding of all text |
60
|
|
|
|
|
|
|
should be in Perl's internal format; see L for converting text from |
61
|
|
|
|
|
|
|
various encodes to a Perl string. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 CONSTRUCTOR |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head2 C |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
The method C creates an instance of the C class with the following |
68
|
|
|
|
|
|
|
parameters: |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=over |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item C |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
isoLangCode => 'en' |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
C is the ISO language code of the language that will be tagged and |
77
|
|
|
|
|
|
|
stemmed by the object. It must be 'en', which is the default; other languages |
78
|
|
|
|
|
|
|
may be added when POS taggers for them are added to CPAN. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item C |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
endingSentenceTag => 'PP' |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
C is the part-of-speech tag from L |
85
|
|
|
|
|
|
|
that will be used to indicate |
86
|
|
|
|
|
|
|
the end of a sentence. The default is 'PP'. The value of C must be |
87
|
|
|
|
|
|
|
a tag generated by the module L; see method |
88
|
|
|
|
|
|
|
C for all the possible tags; which are based on the |
89
|
|
|
|
|
|
|
Penn Treebank tagset. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=item C and/or C |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...] |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
The method C uses C and C |
97
|
|
|
|
|
|
|
to build the default list of the |
98
|
|
|
|
|
|
|
parts-of-speech to be retained when filtering previously tagged text. |
99
|
|
|
|
|
|
|
The default list is C<[qw(TEXTRANK_WORDS)]>, |
100
|
|
|
|
|
|
|
which is all the nouns and adjectives in the text, as used in the textrank algorithm. Permitted |
101
|
|
|
|
|
|
|
types for C are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION', |
102
|
|
|
|
|
|
|
'TEXTRANK_WORDS', and 'VERBS'. C provides finer control over the |
103
|
|
|
|
|
|
|
parts-of-speech to be retained. For a list |
104
|
|
|
|
|
|
|
of all the possible tags see method C. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=back |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=cut |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub new |
111
|
|
|
|
|
|
|
{ |
112
|
|
|
|
|
|
|
# create the class object. |
113
|
4
|
|
|
4
|
1
|
648
|
my ($Class, %Parameters) = @_; |
114
|
4
|
|
33
|
|
|
41
|
my $Self = bless {}, ref($Class) || $Class; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# get the stemmer to normalize the words, default is english. |
117
|
4
|
|
|
|
|
13
|
my $isoLangCode = 'en'; |
118
|
4
|
50
|
|
|
|
18
|
$isoLangCode = lc $Parameters{isoLangCode} if (exists ($Parameters{isoLangCode})); |
119
|
4
|
|
|
|
|
53
|
$Self->{stemmer} = Lingua::Stem::Snowball->new (lang => $isoLangCode, encode => 'UTF-8'); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# get the POS tagger. |
122
|
4
|
50
|
|
|
|
495
|
if ($isoLangCode eq 'en') |
123
|
|
|
|
|
|
|
{ |
124
|
4
|
|
|
|
|
48
|
$Self->{tagger} = new Lingua::EN::Tagger; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
else |
127
|
|
|
|
|
|
|
{ |
128
|
0
|
|
|
|
|
0
|
croak "parameter isoLangCode must be 'en'.\n"; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# read in the part of speech tags. |
132
|
4
|
|
|
|
|
1236548
|
$Self->_getPartOfSpeechTags (); |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# get the ending sentence part of speech tag which will be |
135
|
|
|
|
|
|
|
# used to break text into array of sentences. |
136
|
4
|
50
|
|
|
|
39
|
unless (exists ($Parameters{endingSentenceTag})) |
137
|
|
|
|
|
|
|
{ |
138
|
4
|
|
|
|
|
17
|
$Parameters{endingSentenceTag} = 'PP'; |
139
|
|
|
|
|
|
|
} |
140
|
4
|
|
|
|
|
18
|
$Self->{endingSentenceTag} = uc $Parameters{endingSentenceTag}; |
141
|
4
|
|
|
|
|
19
|
$Self->{endingSentenceTag} =~ tr/A-Z//cd; |
142
|
4
|
50
|
|
|
|
39
|
$Self->{endingSentenceTag} = '/' . $Self->{endingSentenceTag} if (substr ($Self->{endingSentenceTag}, 0, 1) ne '/'); |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# get the list of parts of speech to keep when filtering. |
145
|
|
|
|
|
|
|
# the default is to keep only nouns and adjetives. |
146
|
4
|
|
|
|
|
30
|
$Self->{hashOfPOSTagsToKeep} = $Self->_getHashOfPOSTagsToKeep (%Parameters, instantiation => 1); |
147
|
4
|
|
|
|
|
30
|
return $Self; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 METHODS |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=head2 C |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
getStemmedAndTaggedText (@Text, $Text, \@Text) |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The method C returns a hierarchy of array references containing the stemmed words, |
157
|
|
|
|
|
|
|
the original words, their part-of-speech tag, and their word position index within the original text. |
158
|
|
|
|
|
|
|
The hierarchy is of the form |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
[ |
161
|
|
|
|
|
|
|
[ # sentence level: first sentence. |
162
|
|
|
|
|
|
|
[ # word level: first word. |
163
|
|
|
|
|
|
|
stemmed word, original word, part-of-speech tag, word index, word position, word length |
164
|
|
|
|
|
|
|
] |
165
|
|
|
|
|
|
|
[ # word level: second word. |
166
|
|
|
|
|
|
|
stemmed word, original word, part-of-speech tag, word index, word position, word length |
167
|
|
|
|
|
|
|
] |
168
|
|
|
|
|
|
|
... |
169
|
|
|
|
|
|
|
] |
170
|
|
|
|
|
|
|
[ # sentence level: second sentence. |
171
|
|
|
|
|
|
|
[ # word level: first word. |
172
|
|
|
|
|
|
|
stemmed word, original word, part-of-speech tag, word index, word position, word length |
173
|
|
|
|
|
|
|
] |
174
|
|
|
|
|
|
|
[ # word level: second word. |
175
|
|
|
|
|
|
|
stemmed word, original word, part-of-speech tag, word index, word position, word length |
176
|
|
|
|
|
|
|
] |
177
|
|
|
|
|
|
|
... |
178
|
|
|
|
|
|
|
] |
179
|
|
|
|
|
|
|
] |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Its only parameters are any combination of strings of text as scalars, references to |
182
|
|
|
|
|
|
|
scalars, arrays of strings of text, or references to arrays of strings of text, etc... |
183
|
|
|
|
|
|
|
The following examples below show the various ways to call the method; note that the constants |
184
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_STEMMED, |
185
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_ORIGINAL, |
186
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_POSTAG, |
187
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_INDEX, |
188
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_CHAR_POSITION, |
189
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_CHAR_LENGTH, |
190
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_SENTENCE_ID, and |
191
|
|
|
|
|
|
|
Text::StemTagPOS::WORD_USER_DEFINED, |
192
|
|
|
|
|
|
|
are used to access the information about each word. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
use Text::StemTagPOS; |
195
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
196
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
197
|
|
|
|
|
|
|
my $text = 'The first sentence. Sentence number two.'; |
198
|
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text); |
199
|
|
|
|
|
|
|
dump $listOfStemmedTaggedSentences; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# dumps: |
202
|
|
|
|
|
|
|
# [ |
203
|
|
|
|
|
|
|
# [ |
204
|
|
|
|
|
|
|
# ["the", "The", "/DET", 0, 0, 3, 0], |
205
|
|
|
|
|
|
|
# [" ", " ", "/PGP", 1, 3, 1, 0], |
206
|
|
|
|
|
|
|
# ["first", "first", "/JJ", 2, 4, 5, 0], |
207
|
|
|
|
|
|
|
# [" ", " ", "/PGP", 3, 9, 1, 0], |
208
|
|
|
|
|
|
|
# ["sentenc", "sentence", "/NN", 4, 10, 8, 0], |
209
|
|
|
|
|
|
|
# [".", ".", "/PP", 5, 18, 1, 0], |
210
|
|
|
|
|
|
|
# [" ", " ", "/PGP", 6, 19, 1, 0], |
211
|
|
|
|
|
|
|
# ], |
212
|
|
|
|
|
|
|
# [ |
213
|
|
|
|
|
|
|
# ["sentenc", "Sentence", "/NN", 7, 20, 8, 1], |
214
|
|
|
|
|
|
|
# [" ", " ", "/PGP", 8, 28, 1, 1], |
215
|
|
|
|
|
|
|
# ["number", "number", "/NN", 9, 29, 6, 1], |
216
|
|
|
|
|
|
|
# [" ", " ", "/PGP", 10, 35, 1, 1], |
217
|
|
|
|
|
|
|
# ["two", "two", "/CD", 11, 36, 3, 1], |
218
|
|
|
|
|
|
|
# [".", ".", "/PP", 12, 39, 1, 1], |
219
|
|
|
|
|
|
|
# ], |
220
|
|
|
|
|
|
|
# ] |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
my $word = $listOfStemmedTaggedSentences->[0][0]; |
223
|
|
|
|
|
|
|
print |
224
|
|
|
|
|
|
|
'WORD_STEMMED: ' . |
225
|
|
|
|
|
|
|
"'" . $word->[Text::StemTagPOS::WORD_STEMMED] . "'\n" . |
226
|
|
|
|
|
|
|
'WORD_ORIGINAL: ' . |
227
|
|
|
|
|
|
|
"'" . $word->[Text::StemTagPOS::WORD_ORIGINAL] . "'\n" . |
228
|
|
|
|
|
|
|
'WORD_POSTAG: ' . |
229
|
|
|
|
|
|
|
"'" . $word->[Text::StemTagPOS::WORD_POSTAG] . "'\n" . |
230
|
|
|
|
|
|
|
'WORD_INDEX: ' . |
231
|
|
|
|
|
|
|
$word->[Text::StemTagPOS::WORD_INDEX] . "\n" . |
232
|
|
|
|
|
|
|
'WORD_CHAR_POSITION: ' . |
233
|
|
|
|
|
|
|
$word->[Text::StemTagPOS::WORD_CHAR_POSITION] . "\n" . |
234
|
|
|
|
|
|
|
'WORD_CHAR_LENGTH: ' . |
235
|
|
|
|
|
|
|
$word->[Text::StemTagPOS::WORD_CHAR_LENGTH] . "\n"; |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# prints: |
238
|
|
|
|
|
|
|
# WORD_STEMMED: 'the' |
239
|
|
|
|
|
|
|
# WORD_ORIGINAL: 'The' |
240
|
|
|
|
|
|
|
# WORD_POSTAG: '/DET' |
241
|
|
|
|
|
|
|
# WORD_INDEX: 0 |
242
|
|
|
|
|
|
|
# WORD_CHAR_POSITION: 0 |
243
|
|
|
|
|
|
|
# WORD_CHAR_LENGTH: 3 |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
The following example shows the various ways the text can be passed to the method: |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
use Text::StemTagPOS; |
248
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
249
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
250
|
|
|
|
|
|
|
my $text = 'This is a sentence with seven words.'; |
251
|
|
|
|
|
|
|
dump $stemTagger->getStemmedAndTaggedText ($text, |
252
|
|
|
|
|
|
|
[$text, \$text], ($text, \$text)); |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sub getStemmedAndTaggedText |
257
|
|
|
|
|
|
|
{ |
258
|
4322
|
|
|
4322
|
1
|
11935
|
my ($Self, @Text) = @_; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# convert the data to process to a list of strings. |
261
|
4322
|
|
|
|
|
17282
|
my $ListOfStrings = $Self->_flattenList (@Text); |
262
|
|
|
|
|
|
|
|
263
|
4322
|
|
|
|
|
12952
|
my @sentences; |
264
|
4322
|
|
|
|
|
19241
|
my $initialIndex = 0; |
265
|
4322
|
|
|
|
|
13781
|
foreach my $text (@$ListOfStrings) |
266
|
|
|
|
|
|
|
{ |
267
|
|
|
|
|
|
|
# skip undefined text. |
268
|
4322
|
50
|
|
|
|
13633
|
next unless defined $text; |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# replace all backslashes with a space, since the tagger adds these. |
271
|
4322
|
|
|
|
|
8346
|
my $cleanedText = $text; |
272
|
4322
|
|
|
|
|
21505
|
$cleanedText =~ tr/\/\-/ /; |
273
|
4322
|
50
|
|
|
|
30279
|
if ($cleanedText !~ m/^\s*$/) |
274
|
|
|
|
|
|
|
{ |
275
|
|
|
|
|
|
|
# tag the text. |
276
|
4322
|
|
|
|
|
68847
|
my $taggedText = $Self->{tagger}->get_readable ($cleanedText); |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# convert to a list of sentence word,tag pairs. |
279
|
4322
|
|
|
|
|
7836324
|
my $listOfSentences = $Self->_convertTextToListOfSentenceWordTags ($taggedText); |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# add the position in the original text and the length of the words. |
282
|
4322
|
|
|
|
|
26863
|
$listOfSentences = $Self->_addPositionLengthWordInfo ($initialIndex, $text, $listOfSentences); |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# save the list of sentences. |
285
|
4322
|
|
|
|
|
18877
|
push @sentences, @$listOfSentences; |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# update the initial index for the next string. |
289
|
4322
|
|
|
|
|
12898
|
$initialIndex += length $text; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
4322
|
|
|
|
|
9325
|
my $wordIndex = 0; |
293
|
4322
|
|
|
|
|
8430
|
foreach my $sentenceList (@sentences) |
294
|
|
|
|
|
|
|
{ |
295
|
|
|
|
|
|
|
# pull off the words in the sentence and put them in a list. |
296
|
|
|
|
|
|
|
# words must be encoded to utf8 for stemmer. |
297
|
4345
|
|
|
|
|
10326
|
my @wordList = map {encode("utf8", $_->[WORD_ORIGINAL])} @$sentenceList; |
|
28300
|
|
|
|
|
592412
|
|
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# stem the list of words. |
300
|
4345
|
|
|
|
|
194460
|
$Self->{stemmer}->stem_in_place (\@wordList); |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# insert the stemmed words to the list as: |
303
|
|
|
|
|
|
|
# [stemmed word, original word, part of speech tag, word index, word position, word length] |
304
|
4345
|
|
|
|
|
29741
|
for (my $i = 0; $i < @wordList; $i++) |
305
|
|
|
|
|
|
|
{ |
306
|
28300
|
|
|
|
|
62672
|
my $wordTags = $sentenceList->[$i]; |
307
|
|
|
|
|
|
|
# insert the stemmed word and convert it from utf8 to Perl's internal format. |
308
|
28300
|
|
|
|
|
75286
|
$wordTags->[WORD_STEMMED] = lc decode ('utf8', $wordList[$i]); |
309
|
28300
|
|
|
|
|
925358
|
$wordTags->[WORD_INDEX] = $wordIndex++; |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
4322
|
|
|
|
|
21171
|
return \@sentences; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# add the position of the words in the original text to the info about each |
317
|
|
|
|
|
|
|
# of the words. really should rewrite Lingua::EN::Tagger to provide this |
318
|
|
|
|
|
|
|
# information; sloppy to recompute it and method used is fragile. |
319
|
|
|
|
|
|
|
sub _addPositionLengthWordInfo |
320
|
|
|
|
|
|
|
{ |
321
|
4322
|
|
|
4322
|
|
11244
|
my ($Self, $PositionOffset, $Text, $ListOfSentences) = @_; |
322
|
|
|
|
|
|
|
|
323
|
4322
|
|
|
|
|
9102
|
my $startingIndex = 0; |
324
|
4322
|
|
|
|
|
7572
|
my @listOfWords; |
325
|
4322
|
|
|
|
|
14707
|
foreach my $sentence (@$ListOfSentences) |
326
|
|
|
|
|
|
|
{ |
327
|
4345
|
|
|
|
|
14781
|
foreach my $word (@$sentence) |
328
|
|
|
|
|
|
|
{ |
329
|
|
|
|
|
|
|
# store all the words for subsequent analysis for missing words. |
330
|
16356
|
|
|
|
|
29948
|
push @listOfWords, $word; |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
# get and store the length of the word. |
333
|
16356
|
|
|
|
|
32459
|
my $length = length ($word->[WORD_ORIGINAL]); |
334
|
16356
|
|
|
|
|
26712
|
$word->[WORD_CHAR_LENGTH] = $length; |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# default position is -1, like index function. |
337
|
16356
|
|
|
|
|
24380
|
$word->[WORD_CHAR_POSITION] = -1; |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# find the starting position of the word. |
340
|
16356
|
|
|
|
|
37780
|
my $position = index ($Text, $word->[WORD_ORIGINAL], $startingIndex); |
341
|
|
|
|
|
|
|
|
342
|
16356
|
100
|
|
|
|
44072
|
if ($position >= $startingIndex) |
343
|
|
|
|
|
|
|
{ |
344
|
|
|
|
|
|
|
# add position and update starting position for next search. |
345
|
16342
|
|
|
|
|
32406
|
$word->[WORD_CHAR_POSITION] = $PositionOffset + $position; |
346
|
16342
|
|
|
|
|
44747
|
$startingIndex = $position + $length; |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# estimate the positions of the missing words. |
352
|
4322
|
|
|
|
|
9092
|
my $totalWords = @listOfWords; |
353
|
4322
|
|
|
|
|
12308
|
my $i = -1; |
354
|
4322
|
|
|
|
|
14800
|
while (++$i < $totalWords) |
355
|
|
|
|
|
|
|
{ |
356
|
16342
|
100
|
|
|
|
75231
|
if ($listOfWords[$i]->[WORD_CHAR_POSITION] == -1) |
357
|
|
|
|
|
|
|
{ |
358
|
|
|
|
|
|
|
# get the starting and ending index of the sequence of missing words. |
359
|
12
|
|
|
|
|
14
|
my $noPositionStartIndex = $i; |
360
|
12
|
|
|
|
|
27
|
while (++$i < $totalWords) |
361
|
|
|
|
|
|
|
{ |
362
|
14
|
100
|
|
|
|
97
|
last if ($listOfWords[$i]->[WORD_CHAR_POSITION] != -1); |
363
|
|
|
|
|
|
|
} |
364
|
12
|
|
|
|
|
17
|
my $noPositionEndIndex = $i; |
365
|
12
|
|
|
|
|
15
|
my $totalMissingWords = $noPositionEndIndex - $noPositionStartIndex; |
366
|
12
|
50
|
|
|
|
27
|
$totalMissingWords = 0 if ($totalMissingWords < 0); |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# get the substring corresponding to the missing words. |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# compute the starting index of the substring. |
371
|
12
|
|
|
|
|
14
|
my $startIndexPosition = $PositionOffset; |
372
|
12
|
50
|
|
|
|
27
|
if ($noPositionStartIndex > 0) |
373
|
|
|
|
|
|
|
{ |
374
|
12
|
|
|
|
|
25
|
$startIndexPosition = $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_POSITION] + $listOfWords[$noPositionStartIndex - 1]->[WORD_CHAR_LENGTH]; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# get the ending string index. |
378
|
12
|
|
|
|
|
13
|
my $endIndexPosition = length $Text; |
379
|
12
|
50
|
|
|
|
27
|
if ($noPositionEndIndex < $totalWords) |
380
|
|
|
|
|
|
|
{ |
381
|
12
|
|
|
|
|
17
|
$endIndexPosition = $listOfWords[$noPositionEndIndex]->[WORD_CHAR_POSITION]; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# get the substring. |
385
|
12
|
|
|
|
|
28
|
my $substring = substr ($Text, $startIndexPosition - $PositionOffset, $endIndexPosition - $startIndexPosition); |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# extract the non-white space items. |
388
|
12
|
|
|
|
|
14
|
my @listOfTokens; |
389
|
12
|
|
|
|
|
65
|
while ($substring =~ m/([^\s]+)/g) |
390
|
|
|
|
|
|
|
{ |
391
|
14
|
|
|
|
|
106
|
push @listOfTokens, [$-[0], $+[0] - $-[1], $1]; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
12
|
|
|
|
|
19
|
my $substringsToMatch = @listOfTokens; |
395
|
12
|
50
|
|
|
|
25
|
$substringsToMatch = $totalMissingWords if ($totalMissingWords < $substringsToMatch); |
396
|
12
|
|
|
|
|
31
|
for (my $i = 0; $ i < $substringsToMatch; $i++) |
397
|
|
|
|
|
|
|
{ |
398
|
14
|
|
|
|
|
26
|
$listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_POSITION] = $startIndexPosition + $listOfTokens[$i]->[0]; |
399
|
14
|
|
|
|
|
70
|
$listOfWords[$noPositionStartIndex + $i]->[WORD_CHAR_LENGTH] = $listOfTokens[$i]->[1]; |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# add the gaps. |
405
|
|
|
|
|
|
|
# still need some work here to get the words into the list of sentences that they belong in. |
406
|
4322
|
|
|
|
|
30277
|
my $listOfWordsAndGaps = _addGapsToListOfWords (listOfWords => \@listOfWords, stringLength => length ($Text), positionOffset => $PositionOffset); |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# add the sentence id to each gap. |
409
|
4322
|
|
|
|
|
8821
|
my $currentSentenceId = 0; |
410
|
4322
|
|
|
|
|
7982
|
my @newListOfSentences; |
411
|
4322
|
|
|
|
|
10075
|
foreach my $word (@$listOfWordsAndGaps) |
412
|
|
|
|
|
|
|
{ |
413
|
28300
|
100
|
|
|
|
79489
|
$newListOfSentences[$currentSentenceId] = [] unless defined $newListOfSentences[$currentSentenceId]; |
414
|
|
|
|
|
|
|
|
415
|
28300
|
100
|
|
|
|
69255
|
if ($word->[WORD_POSTAG] eq '/PGP') |
416
|
|
|
|
|
|
|
{ |
417
|
11944
|
|
|
|
|
39849
|
$word->[WORD_SENTENCE_ID] = $currentSentenceId; |
418
|
11944
|
|
|
|
|
30678
|
$word->[WORD_ORIGINAL] = substr ($Text, $word->[WORD_CHAR_POSITION] - $PositionOffset, $word->[WORD_CHAR_LENGTH]); |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
else |
421
|
|
|
|
|
|
|
{ |
422
|
16356
|
|
|
|
|
29965
|
$currentSentenceId = $word->[WORD_SENTENCE_ID]; |
423
|
|
|
|
|
|
|
} |
424
|
|
|
|
|
|
|
|
425
|
28300
|
|
|
|
|
34868
|
push @{$newListOfSentences[$currentSentenceId]}, $word; |
|
28300
|
|
|
|
|
81398
|
|
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
4322
|
|
|
|
|
28336
|
return \@newListOfSentences; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
sub _addGapsToListOfWords |
433
|
|
|
|
|
|
|
{ |
434
|
|
|
|
|
|
|
# get the parameters. |
435
|
4322
|
|
|
4322
|
|
24572
|
my %Parameters = @_; |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# get the length of the original string. |
438
|
4322
|
|
|
|
|
10344
|
my $stringLength = $Parameters{stringLength}; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# get the list of words. |
441
|
4322
|
|
|
|
|
7379
|
my $listOfWords = $Parameters{listOfWords}; |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
# get the position offset if it exists and is defined. |
444
|
4322
|
|
|
|
|
9853
|
my $positionOffset = 0; |
445
|
4322
|
50
|
33
|
|
|
34937
|
$positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset})); |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
# build the list of complete tokens. |
448
|
4322
|
|
|
|
|
18258
|
my @listOfWordsAndGaps = @$listOfWords; |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# get the list of missing substring positions. |
451
|
4322
|
|
|
|
|
14904
|
my $listOfMissingSubstrings = _getListOfMissingSubstringPositions(listOfSubstringPositions => $listOfWords, stringLength => $stringLength, positionOffset => $positionOffset); |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# add the gaps to the list. |
454
|
4322
|
|
|
|
|
22439
|
foreach my $gapInfo (@$listOfMissingSubstrings) |
455
|
|
|
|
|
|
|
{ |
456
|
11944
|
|
|
|
|
19574
|
$gapInfo->[WORD_STEMMED] = ' '; |
457
|
11944
|
|
|
|
|
16293
|
$gapInfo->[WORD_ORIGINAL] = undef; |
458
|
11944
|
|
|
|
|
24643
|
$gapInfo->[WORD_POSTAG] = '/PGP'; |
459
|
11944
|
|
|
|
|
24274
|
$gapInfo->[WORD_INDEX] = undef; |
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# sort the substrings by position. |
463
|
4322
|
|
|
|
|
22350
|
@listOfWordsAndGaps = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfWordsAndGaps, @$listOfMissingSubstrings); |
|
60156
|
|
|
|
|
97182
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# if test is true, make sure things were computed correctly. |
466
|
4322
|
50
|
33
|
|
|
31613
|
if (exists($Parameters{test}) && $Parameters{test}) |
467
|
|
|
|
|
|
|
{ |
468
|
0
|
|
|
|
|
0
|
my $totalSubstrings = @listOfWordsAndGaps; |
469
|
|
|
|
|
|
|
|
470
|
0
|
|
|
|
|
0
|
for (my $i = 1 ; $i < $totalSubstrings ; $i++) |
471
|
|
|
|
|
|
|
{ |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
# make sure the strings are sorted. |
474
|
0
|
0
|
|
|
|
0
|
if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_POSITION] > $listOfWordsAndGaps[$i]->[WORD_CHAR_POSITION]) |
475
|
|
|
|
|
|
|
{ |
476
|
0
|
|
|
|
|
0
|
croak 'error: substrings in $listOfSubstringInfo are not sorted and should be.'; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
# make sure the strings have at least one character. |
480
|
0
|
0
|
|
|
|
0
|
if ($listOfWordsAndGaps[ $i - 1 ]->[WORD_CHAR_LENGTH] < 1) |
481
|
|
|
|
|
|
|
{ |
482
|
0
|
|
|
|
|
0
|
croak 'error: substrings in $listOfSubstringInfo has length less than one.'; |
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# returns the complete list of tokens sorted by their starting index in ascending order. |
488
|
4322
|
|
|
|
|
19660
|
return \@listOfWordsAndGaps; |
489
|
|
|
|
|
|
|
} |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# routine returns an array reference of the gaps or missing substrings given |
494
|
|
|
|
|
|
|
# a list of substrings. for example, if listOfSubstringPositions is |
495
|
|
|
|
|
|
|
# [[2,4], [9,2], [11,5], [20,1]] and stringLength is 25, then the list returned |
496
|
|
|
|
|
|
|
# is [[0, 2], [6, 3], [16, 4], [21, 4]]. |
497
|
|
|
|
|
|
|
sub _getListOfMissingSubstringPositions # (listOfSubstringPositions => \@, stringLength => n, positionOffset => n) |
498
|
|
|
|
|
|
|
{ |
499
|
|
|
|
|
|
|
# get the parameters. |
500
|
4322
|
|
|
4322
|
|
20758
|
my %Parameters = @_; |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# get the position offset if it exists and is defined. |
503
|
4322
|
|
|
|
|
8091
|
my $positionOffset = 0; |
504
|
4322
|
50
|
33
|
|
|
41703
|
$positionOffset = $Parameters{positionOffset} if (exists ($Parameters{positionOffset}) && defined ($Parameters{positionOffset})); |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# if listOfSubstringPositions is not defined, we have one special case. |
507
|
4322
|
50
|
|
|
|
19717
|
if (!defined($Parameters{listOfSubstringPositions})) |
508
|
|
|
|
|
|
|
{ |
509
|
0
|
0
|
|
|
|
0
|
if (!defined($Parameters{stringLength})) |
510
|
|
|
|
|
|
|
{ |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# no parameters defined, so return the empty list. |
513
|
0
|
|
|
|
|
0
|
return []; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
else |
516
|
|
|
|
|
|
|
{ |
517
|
0
|
0
|
|
|
|
0
|
if (int($Parameters{stringLength}) > 0) |
518
|
|
|
|
|
|
|
{ |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
# positive string length, but no substrings, so gap is entire string. |
521
|
0
|
|
|
|
|
0
|
return [ 0, int($Parameters{stringLength}) - 1 ]; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
else |
524
|
|
|
|
|
|
|
{ |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# non-positive string length, so return empty list. |
527
|
0
|
|
|
|
|
0
|
return []; |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
# get the list of [WORD_CHAR_POSITION, WORD_CHAR_LENGTH]. |
533
|
4322
|
|
|
|
|
9541
|
my $listOfSubstringPositions = $Parameters{listOfSubstringPositions}; |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
# get the number of subtrings. |
536
|
4322
|
|
|
|
|
9364
|
my $totalSubstrings = $#$listOfSubstringPositions + 1; |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
# skip substrings having length less than one or a negative position. |
539
|
4322
|
|
|
|
|
7827
|
my @filteredListOfSubstringPositions; |
540
|
4322
|
|
|
|
|
26441
|
for (my $i = 0 ; $i < $totalSubstrings ; $i++) |
541
|
|
|
|
|
|
|
{ |
542
|
16356
|
50
|
|
|
|
45419
|
next if ($listOfSubstringPositions->[$i][WORD_CHAR_LENGTH] < 1); |
543
|
16356
|
50
|
|
|
|
38942
|
next if ($listOfSubstringPositions->[$i][WORD_CHAR_POSITION] < 0); |
544
|
16356
|
|
|
|
|
65483
|
push @filteredListOfSubstringPositions, $listOfSubstringPositions->[$i]; |
545
|
|
|
|
|
|
|
} |
546
|
4322
|
|
|
|
|
19170
|
$listOfSubstringPositions = \@filteredListOfSubstringPositions; |
547
|
4322
|
|
|
|
|
9429
|
$totalSubstrings = $#$listOfSubstringPositions + 1; |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# get the entire strings length if defined. |
550
|
4322
|
|
|
|
|
6370
|
my $stringLength; |
551
|
4322
|
50
|
|
|
|
17977
|
$stringLength = int abs $Parameters{stringLength} if exists $Parameters{stringLength}; |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# if $stringLength is undefined use the last substring to compute the length |
554
|
|
|
|
|
|
|
# of the entire string; the string will not end with a gap in this case. |
555
|
4322
|
50
|
33
|
|
|
16624
|
if (!defined($stringLength) && $totalSubstrings) |
556
|
|
|
|
|
|
|
{ |
557
|
0
|
|
|
|
|
0
|
$stringLength = 0; |
558
|
0
|
|
|
|
|
0
|
foreach my $currentSubstringInfo (@$listOfSubstringPositions) |
559
|
|
|
|
|
|
|
{ |
560
|
0
|
|
|
|
|
0
|
my $last = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH]; |
561
|
0
|
0
|
|
|
|
0
|
$stringLength = $last if $last > $stringLength; |
562
|
|
|
|
|
|
|
} |
563
|
0
|
|
|
|
|
0
|
$stringLength -= $positionOffset; |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# if $stringLength is not defined at this point there are no gaps. |
567
|
4322
|
50
|
33
|
|
|
29021
|
return [] unless ((defined $stringLength) && ($stringLength > 0)); |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
# if $totalSubstrings is zero, then the entire string is a gap. |
570
|
4322
|
50
|
|
|
|
11770
|
unless ($totalSubstrings) |
571
|
|
|
|
|
|
|
{ |
572
|
0
|
|
|
|
|
0
|
my @substringGapInfo; |
573
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_POSITION] = 0; |
574
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_LENGTH] = $stringLength; |
575
|
0
|
|
|
|
|
0
|
return [ \@substringGapInfo ]; |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# sort the pairs by their position. |
579
|
4322
|
|
|
|
|
20966
|
my @listOfSubstringPositions = sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } @$listOfSubstringPositions; |
|
20269
|
|
|
|
|
46141
|
|
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
# @listOfMissingSubstringPositions holds all the gaps. |
582
|
4322
|
|
|
|
|
7288
|
my @listOfMissingSubstringPositions; |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# get the first substring position. |
585
|
4322
|
|
|
|
|
8557
|
my $currentSubstringInfo = $listOfSubstringPositions[0]; |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# if the first substring does not start with position 0, add the beginning gap. |
588
|
4322
|
50
|
|
|
|
14378
|
if ($currentSubstringInfo->[WORD_CHAR_POSITION] > $positionOffset) |
589
|
|
|
|
|
|
|
{ |
590
|
0
|
|
|
|
|
0
|
my @substringGapInfo; |
591
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_POSITION] = $positionOffset; |
592
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_LENGTH] = $currentSubstringInfo->[WORD_CHAR_POSITION] - $positionOffset; |
593
|
0
|
|
|
|
|
0
|
push @listOfMissingSubstringPositions, \@substringGapInfo; |
594
|
|
|
|
|
|
|
} |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
# compute the gaps. |
597
|
4322
|
|
|
|
|
16662
|
for (my $i = 1 ; $i < $totalSubstrings ; $i++) |
598
|
|
|
|
|
|
|
{ |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
# get the information about the previous and current substrings. |
601
|
12034
|
|
|
|
|
41451
|
my $previousSubstringInfo = $listOfSubstringPositions[ $i - 1 ]; |
602
|
12034
|
|
|
|
|
17506
|
my $currentSubstringInfo = $listOfSubstringPositions[$i]; |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
# compute the starting index and length of the gap. |
605
|
12034
|
|
|
|
|
22728
|
my $gapStartPosition = $previousSubstringInfo->[WORD_CHAR_POSITION] + $previousSubstringInfo->[WORD_CHAR_LENGTH]; |
606
|
12034
|
|
|
|
|
19475
|
my $gapEndPosition = $currentSubstringInfo->[WORD_CHAR_POSITION] - 1; |
607
|
12034
|
|
|
|
|
19744
|
my $gapLength = $gapEndPosition - $gapStartPosition + 1; |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# if the gap is not a positive size, skip it. |
610
|
|
|
|
|
|
|
# maybe a warning should be logged since it really should not happen. |
611
|
12034
|
100
|
|
|
|
49055
|
if ($gapLength > 0) |
612
|
|
|
|
|
|
|
{ |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
# store the information about the gap. |
615
|
11944
|
|
|
|
|
15583
|
my @substringGapInfo; |
616
|
11944
|
|
|
|
|
25344
|
$substringGapInfo[WORD_CHAR_POSITION] = $gapStartPosition; |
617
|
11944
|
|
|
|
|
21944
|
$substringGapInfo[WORD_CHAR_LENGTH] = $gapLength; |
618
|
11944
|
|
|
|
|
65016
|
push @listOfMissingSubstringPositions, \@substringGapInfo; |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
} |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# add any trailing gap to the list. |
623
|
4322
|
|
|
|
|
7861
|
$currentSubstringInfo = $listOfSubstringPositions[-1]; |
624
|
4322
|
50
|
|
|
|
28224
|
if ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH] < $stringLength) |
625
|
|
|
|
|
|
|
{ |
626
|
0
|
|
|
|
|
0
|
my @substringGapInfo; |
627
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_POSITION] = $currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH]; |
628
|
0
|
|
|
|
|
0
|
$substringGapInfo[WORD_CHAR_LENGTH] = $stringLength - ($currentSubstringInfo->[WORD_CHAR_POSITION] + $currentSubstringInfo->[WORD_CHAR_LENGTH]); |
629
|
0
|
|
|
|
|
0
|
push @listOfMissingSubstringPositions, \@substringGapInfo; |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
# if test is true, check if gaps were computed correctly. |
633
|
4322
|
50
|
33
|
|
|
19722
|
if (exists($Parameters{test}) && $Parameters{test}) |
634
|
|
|
|
|
|
|
{ |
635
|
0
|
|
|
|
|
0
|
my @allSubstrings = |
636
|
0
|
|
|
|
|
0
|
sort { $a->[WORD_CHAR_POSITION] <=> $b->[WORD_CHAR_POSITION] } (@listOfSubstringPositions, @listOfMissingSubstringPositions); |
637
|
0
|
|
|
|
|
0
|
my $totalSubstrings = @allSubstrings; |
638
|
|
|
|
|
|
|
|
639
|
0
|
|
|
|
|
0
|
for (my $i = 1 ; $i < $totalSubstrings ; $i++) |
640
|
|
|
|
|
|
|
{ |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# make sure the strings are sorted. |
643
|
0
|
0
|
|
|
|
0
|
if ($allSubstrings[ $i - 1 ]->[WORD_CHAR_POSITION] + $allSubstrings[ $i - 1 ]->[WORD_CHAR_LENGTH] < $allSubstrings[$i]->[WORD_CHAR_POSITION]) |
644
|
|
|
|
|
|
|
{ |
645
|
0
|
|
|
|
|
0
|
my $logger = Log::Log4perl->get_logger(); |
646
|
0
|
|
|
|
|
0
|
$logger->logdie("error: missed computing a gap."); |
647
|
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
} |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# returns the list of missing substrings. |
652
|
4322
|
|
|
|
|
27902
|
return \@listOfMissingSubstringPositions; |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=head2 C |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
getTaggedTextToKeep (listOfStemmedTaggedSentences => [...], |
659
|
|
|
|
|
|
|
listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...]); |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
The method C returns all the array references of the words |
662
|
|
|
|
|
|
|
that have a part-of-speech tag that is of a type specified by |
663
|
|
|
|
|
|
|
C or C. The word lists |
664
|
|
|
|
|
|
|
returned have the same hierarchical sentence structure used by C. |
665
|
|
|
|
|
|
|
Note C and C |
666
|
|
|
|
|
|
|
are optional parameters, if neither is defined, then the values used when the |
667
|
|
|
|
|
|
|
object was instantiated are used. If one of them is defined, its values override the default |
668
|
|
|
|
|
|
|
values. |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=over |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=item C |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
listOfStemmedTaggedSentences => [...] |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
C is the array reference returned by |
677
|
|
|
|
|
|
|
C or |
678
|
|
|
|
|
|
|
a previous call to C. |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
=item C and/or C |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...] |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
C and C define the list of |
685
|
|
|
|
|
|
|
parts-of-speech types to be retained when filtering previously tagged text. |
686
|
|
|
|
|
|
|
Permitted values for C are |
687
|
|
|
|
|
|
|
are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION', |
688
|
|
|
|
|
|
|
'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C |
689
|
|
|
|
|
|
|
see the method C. |
690
|
|
|
|
|
|
|
Note C and C |
691
|
|
|
|
|
|
|
are optional parameters, if neither is defined, then the values used when the |
692
|
|
|
|
|
|
|
object was instantiated are used. If one of them is defined, its values override the default |
693
|
|
|
|
|
|
|
values. |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=back |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
use Text::StemTagPOS; |
698
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
699
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
700
|
|
|
|
|
|
|
my $text = 'This is the first sentence. This is the last sentence.'; |
701
|
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text); |
702
|
|
|
|
|
|
|
dump $stemTagger->getTaggedTextToKeep ( |
703
|
|
|
|
|
|
|
listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences); |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
# dumps: |
706
|
|
|
|
|
|
|
# [ |
707
|
|
|
|
|
|
|
# [ |
708
|
|
|
|
|
|
|
# ["first", "first", "/JJ", 6, 12, 5, 0], |
709
|
|
|
|
|
|
|
# ["sentenc", "sentence", "/NN", 8, 18, 8, 0], |
710
|
|
|
|
|
|
|
# ], |
711
|
|
|
|
|
|
|
# [ |
712
|
|
|
|
|
|
|
# ["last", "last", "/JJ", 17, 40, 4, 1], |
713
|
|
|
|
|
|
|
# ["sentenc", "sentence", "/NN", 19, 45, 8, 1], |
714
|
|
|
|
|
|
|
# ], |
715
|
|
|
|
|
|
|
# ] |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=cut |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
sub getTaggedTextToKeep |
720
|
|
|
|
|
|
|
{ |
721
|
0
|
|
|
0
|
1
|
0
|
my ($Self, %Parameters) = @_; |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# get the list of sentences. |
724
|
0
|
|
|
|
|
0
|
my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences}; |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# get the hash of tags to keep. |
727
|
0
|
|
|
|
|
0
|
my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters); |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# copy off the tokens to keep in each sentence. |
730
|
0
|
|
|
|
|
0
|
my @listOfFilteredSentences; |
731
|
0
|
|
|
|
|
0
|
foreach my $sentence (@$listOfStemmedTaggedSentences) |
732
|
|
|
|
|
|
|
{ |
733
|
0
|
|
|
|
|
0
|
my @newSentence; |
734
|
0
|
|
|
|
|
0
|
foreach my $token (@$sentence) |
735
|
|
|
|
|
|
|
{ |
736
|
0
|
0
|
|
|
|
0
|
if (exists ($hashOfPOSToKeep->{$token->[WORD_POSTAG]})) |
737
|
|
|
|
|
|
|
{ |
738
|
0
|
|
|
|
|
0
|
push @newSentence, $token; |
739
|
|
|
|
|
|
|
} |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
#push @listOfFilteredSentences, \@newSentence if (@newSentence > 0); |
742
|
0
|
|
|
|
|
0
|
push @listOfFilteredSentences, \@newSentence; |
743
|
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
# return the list of filtered sentences. |
746
|
0
|
|
|
|
|
0
|
return \@listOfFilteredSentences; |
747
|
|
|
|
|
|
|
} |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=head2 C |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
getWordsPhrasesInTaggedText (listOfStemmedTaggedSentences => ..., |
752
|
|
|
|
|
|
|
listOfPhrasesToFind => [...], listOfPOSTypesToKeep => [...], |
753
|
|
|
|
|
|
|
listOfPOSTagsToKeep => [...]); |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
The method C returns a reference to an array where |
756
|
|
|
|
|
|
|
each entry in the array corresponds |
757
|
|
|
|
|
|
|
to the word or phrase in C. The value of each entry is a list |
758
|
|
|
|
|
|
|
of word indices |
759
|
|
|
|
|
|
|
where the words or phrases were found. Each list contains integer |
760
|
|
|
|
|
|
|
pairs of the form [first-word-index, last-word-index] where first-word-index is the index to the first |
761
|
|
|
|
|
|
|
word of the phrase and last-word-index the index of the last word. The values of the index are those |
762
|
|
|
|
|
|
|
assigned to the stemmed and tagged word in C. |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
[ |
765
|
|
|
|
|
|
|
[ # first phrase locations |
766
|
|
|
|
|
|
|
[first word index, last word index], |
767
|
|
|
|
|
|
|
[first word index, last word index], ...] |
768
|
|
|
|
|
|
|
] |
769
|
|
|
|
|
|
|
[ # second phrase locations |
770
|
|
|
|
|
|
|
[first word index, last word index], |
771
|
|
|
|
|
|
|
[first word index, last word index], ...] |
772
|
|
|
|
|
|
|
] |
773
|
|
|
|
|
|
|
... |
774
|
|
|
|
|
|
|
] |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
=over |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
=item C |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
listOfStemmedTaggedSentences => [...] |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
C is the array reference returned by C or |
783
|
|
|
|
|
|
|
C. |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
=item C |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
listOfPhrasesToFind => [...] |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
C is an array reference containing a list of strings of |
790
|
|
|
|
|
|
|
text that are either single words or phrases that are to be located in the text |
791
|
|
|
|
|
|
|
provided by C. Before the words or phrases are located they are filtered |
792
|
|
|
|
|
|
|
using C or C. |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=item C and/or C |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
listOfPOSTypesToKeep => [...], listOfPOSTagsToKeep => [...] |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
C and C defines the list of |
800
|
|
|
|
|
|
|
parts-of-speech types to be retained when filtering previously tagged text. |
801
|
|
|
|
|
|
|
Permitted values for C are |
802
|
|
|
|
|
|
|
are 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS', 'PUNCTUATION', |
803
|
|
|
|
|
|
|
'TEXTRANK_WORDS', and 'VERBS'. For the possible value of C |
804
|
|
|
|
|
|
|
see the method C. |
805
|
|
|
|
|
|
|
Note C and C |
806
|
|
|
|
|
|
|
are optional parameters, if neither is defined, then the values used when the |
807
|
|
|
|
|
|
|
object was instantiated are used. If one of them is defined, its values override the default |
808
|
|
|
|
|
|
|
values. |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
=back |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
The code below illustrates the output format: |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
use Text::StemTagPOS; |
815
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
816
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
817
|
|
|
|
|
|
|
my $text = 'This is the first sentence. This is the last sentence.'; |
818
|
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text); |
819
|
|
|
|
|
|
|
dump $listOfStemmedTaggedSentences; |
820
|
|
|
|
|
|
|
my $listOfWordsOrPhrasesToFind = ['first sentence','this is', |
821
|
|
|
|
|
|
|
'third sentence', 'sentence']; |
822
|
|
|
|
|
|
|
my $phraseLocations = $stemTagger->getWordsPhrasesInTaggedText ( |
823
|
|
|
|
|
|
|
listOfPOSTypesToKeep => [qw(ALL)], |
824
|
|
|
|
|
|
|
listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences, |
825
|
|
|
|
|
|
|
listOfWordsOrPhrasesToFind => $listOfWordsOrPhrasesToFind); |
826
|
|
|
|
|
|
|
dump $phraseLocations; |
827
|
|
|
|
|
|
|
# [ |
828
|
|
|
|
|
|
|
# [[6, 8]], # 'first sentence' |
829
|
|
|
|
|
|
|
# [[0, 2], [11, 13]], # 'this is': note period in text has index 5. |
830
|
|
|
|
|
|
|
# [], # 'third sentence' |
831
|
|
|
|
|
|
|
# [[8, 8], [19, 19]] # 'sentence' |
832
|
|
|
|
|
|
|
# ] |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=cut |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
sub getWordsPhrasesInTaggedText |
837
|
|
|
|
|
|
|
{ |
838
|
300
|
|
|
300
|
1
|
5355783
|
my ($Self, %Parameters) = @_; |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
# stem and tag all of the word and phrase text. |
841
|
|
|
|
|
|
|
# put all the listOfStemmedTaggedSentences into an array, |
842
|
|
|
|
|
|
|
# create the suffix array of the listOfStemmedTaggedSentences array. |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
# get the hash of tags to keep. |
845
|
300
|
|
|
|
|
2939
|
my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters); |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
# get the tagged text. |
848
|
300
|
|
|
|
|
1191
|
my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences}; |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
# create a list of the filtered stemmed words of the text. |
851
|
300
|
|
|
|
|
844
|
my @wordsOfText; |
852
|
300
|
|
|
|
|
841
|
foreach my $sentence (@$listOfStemmedTaggedSentences) |
853
|
|
|
|
|
|
|
{ |
854
|
29474
|
|
|
|
|
75427
|
foreach my $word (@$sentence) |
855
|
|
|
|
|
|
|
{ |
856
|
|
|
|
|
|
|
# skip over the words types to ignore. |
857
|
442744
|
50
|
|
|
|
1350551
|
next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]})); |
858
|
442744
|
|
|
|
|
885142
|
push @wordsOfText, $word; |
859
|
|
|
|
|
|
|
} |
860
|
|
|
|
|
|
|
} |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
# create the suffix array of the tagged words, sorting by the stemmed word. |
863
|
300
|
|
|
|
|
1248
|
my $totalWords = @wordsOfText; |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
# hash all the indices of the stemmed words in the suffix array of the text. |
866
|
300
|
|
|
|
|
979
|
my %wordIndices; |
867
|
300
|
|
|
|
|
2964
|
for(my $index = 0; $index < $totalWords; $index++) |
868
|
|
|
|
|
|
|
{ |
869
|
442744
|
|
|
|
|
840436
|
my $word = $wordsOfText[$index]; |
870
|
442744
|
50
|
|
|
|
2394697
|
$wordIndices{$word->[WORD_STEMMED]} = [] unless exists $wordIndices{$word->[WORD_STEMMED]}; |
871
|
442744
|
|
|
|
|
615658
|
push @{$wordIndices{$word->[WORD_STEMMED]}}, $index; |
|
442744
|
|
|
|
|
1830082
|
|
872
|
|
|
|
|
|
|
} |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
# stem and part-of-speech tag the word or phrases to find. |
875
|
300
|
|
|
|
|
822
|
my @listOfStemmedWordsInPhrasesToFind; |
876
|
300
|
|
|
|
|
1176
|
my $listOfWordsOrPhrasesToFind = $Parameters{listOfWordsOrPhrasesToFind}; |
877
|
300
|
|
|
|
|
1453
|
foreach my $wordOrPhrase (@$listOfWordsOrPhrasesToFind) |
878
|
|
|
|
|
|
|
{ |
879
|
4321
|
|
|
|
|
24086
|
my $stemmed = $Self->getStemmedAndTaggedText ($wordOrPhrase); |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
# flatten the words of a phrase into on list. |
882
|
4321
|
|
|
|
|
9594
|
my @allWords; |
883
|
4321
|
|
|
|
|
11384
|
foreach my $sentence (@$stemmed) |
884
|
|
|
|
|
|
|
{ |
885
|
4321
|
|
|
|
|
14860
|
foreach my $word (@$sentence) |
886
|
|
|
|
|
|
|
{ |
887
|
|
|
|
|
|
|
# remove the words to ignore. |
888
|
27041
|
100
|
|
|
|
84286
|
next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]})); |
889
|
15681
|
|
|
|
|
45001
|
push @allWords, $word; |
890
|
|
|
|
|
|
|
} |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
|
893
|
4321
|
|
|
|
|
32282
|
push @listOfStemmedWordsInPhrasesToFind, \@allWords ; |
894
|
|
|
|
|
|
|
} |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
# given the starting index into the suffix array and a word list, return |
897
|
|
|
|
|
|
|
# the number of words matching the word list at that index. |
898
|
|
|
|
|
|
|
my $getMatchingWords = sub |
899
|
|
|
|
|
|
|
{ |
900
|
4321
|
|
|
4321
|
|
6689
|
my ($StartingIndex, $StemmedWordList) = @_; |
901
|
4321
|
|
|
|
|
7881
|
my $stemmedIndex = 0; |
902
|
4321
|
|
|
|
|
5796
|
my $textIndex = $StartingIndex; |
903
|
4321
|
|
100
|
|
|
21753
|
while (($textIndex < $totalWords) && ($stemmedIndex < @$StemmedWordList)) |
904
|
|
|
|
|
|
|
{ |
905
|
15681
|
50
|
|
|
|
75726
|
last if ($wordsOfText[$textIndex]->[WORD_STEMMED] cmp $StemmedWordList->[$stemmedIndex][WORD_STEMMED]); |
906
|
15681
|
|
|
|
|
24881
|
$textIndex++; |
907
|
15681
|
|
|
|
|
68213
|
$stemmedIndex++; |
908
|
|
|
|
|
|
|
} |
909
|
4321
|
|
|
|
|
19467
|
return $textIndex - $StartingIndex; |
910
|
300
|
|
|
|
|
4330
|
}; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
# find all the phrases in the filtered text. |
913
|
300
|
|
|
|
|
711
|
my @phrasesFound; |
914
|
300
|
|
|
|
|
841
|
my $phraseIndex = 0; |
915
|
300
|
|
|
|
|
859
|
foreach my $phraseStemmedWordList (@listOfStemmedWordsInPhrasesToFind) |
916
|
|
|
|
|
|
|
{ |
917
|
4321
|
|
|
|
|
5740
|
my @fullPhrasesFound; |
918
|
4321
|
|
|
|
|
14044
|
my @stemmedPhraseWords = @$phraseStemmedWordList; |
919
|
4321
|
|
|
|
|
6607
|
my $wordsInPhrase = $#stemmedPhraseWords + 1; |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
# get the first word in the phrase. |
922
|
4321
|
|
|
|
|
7564
|
my $firstWord = $stemmedPhraseWords[0]; |
923
|
4321
|
50
|
|
|
|
19786
|
if (exists ($wordIndices{$firstWord->[WORD_STEMMED]})) |
924
|
|
|
|
|
|
|
{ |
925
|
|
|
|
|
|
|
# get all the indices in the suffix array that the first word occurs at. |
926
|
4321
|
|
|
|
|
29879
|
my $listOfStartingIndices = $wordIndices{$firstWord->[WORD_STEMMED]}; |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
# compute the number of matching phrase words at each position. |
929
|
4321
|
|
|
|
|
10688
|
foreach my $startingIndex (@$listOfStartingIndices) |
930
|
|
|
|
|
|
|
{ |
931
|
4321
|
|
|
|
|
9559
|
my $matchingLength = &$getMatchingWords ($startingIndex, \@stemmedPhraseWords); |
932
|
4321
|
|
|
|
|
11569
|
my $firstWordIndex = $wordsOfText[$startingIndex]->[WORD_INDEX]; |
933
|
4321
|
|
|
|
|
10116
|
my $lastWordIndex = $wordsOfText[$startingIndex + $matchingLength - 1]->[WORD_INDEX]; |
934
|
|
|
|
|
|
|
|
935
|
4321
|
50
|
|
|
|
17419
|
if ($matchingLength == $wordsInPhrase) |
936
|
|
|
|
|
|
|
{ |
937
|
|
|
|
|
|
|
# full match of the phrase. |
938
|
4321
|
|
|
|
|
21275
|
push @fullPhrasesFound, [$firstWordIndex, $lastWordIndex]; |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
} |
941
|
|
|
|
|
|
|
} |
942
|
4321
|
|
|
|
|
16702
|
$phrasesFound[$phraseIndex++] = \@fullPhrasesFound; |
943
|
|
|
|
|
|
|
} |
944
|
300
|
|
|
|
|
908785
|
return \@phrasesFound; |
945
|
|
|
|
|
|
|
} |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head2 C |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
The method C takes no parameters. It returns an array |
950
|
|
|
|
|
|
|
reference where each item in the list is of the form C<[part of speech tag, description, examples]>. |
951
|
|
|
|
|
|
|
It is meant for getting the part-of-speech tags that can be used to populate C. |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
use Text::StemTagPOS; |
954
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
955
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
956
|
|
|
|
|
|
|
dump $stemTagger->getListOfPartOfSpeechTags; |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
=cut |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# this method takes no parameters. it reads in the information about the |
961
|
|
|
|
|
|
|
# part of speech tags stored in the DATA section of this file. the information |
962
|
|
|
|
|
|
|
# is stored in the object at posTags as an array reference containing the |
963
|
|
|
|
|
|
|
# array [POS tag, description, examples] for each possible tag. |
964
|
|
|
|
|
|
|
sub getListOfPartOfSpeechTags |
965
|
|
|
|
|
|
|
{ |
966
|
0
|
|
|
0
|
1
|
0
|
my $Self = shift; |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
# return a copy of the posTags array. |
969
|
0
|
|
|
|
|
0
|
my @partOfSpeechTags = map {[@$_]} @{$Self->{posTags}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
970
|
0
|
|
|
|
|
0
|
return \@partOfSpeechTags; |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# _convertTextToListOfSentenceWordTags->('string of tagged words returned from xxxx'); |
975
|
|
|
|
|
|
|
sub _convertTextToListOfSentenceWordTags |
976
|
|
|
|
|
|
|
{ |
977
|
|
|
|
|
|
|
# get the method class object. |
978
|
4322
|
|
|
4322
|
|
11660
|
my $Self = $_[0]; |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
# split the string of tagged words into a list of the words and tags. note, |
981
|
|
|
|
|
|
|
# all POS tags are a slash followed by 2, 3 or 4 uppercase letters. |
982
|
4322
|
|
|
|
|
73375
|
my @list = split (/(\/[A-Z]{2,4})/, $_[1]); |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
# trim off any leading or trailing spaces from each word or tag. |
985
|
4322
|
|
|
|
|
13494
|
foreach my $item (@list) |
986
|
|
|
|
|
|
|
{ |
987
|
32712
|
|
|
|
|
116979
|
$item =~ s/^\s*//; |
988
|
32712
|
|
|
|
|
70468
|
$item =~ s/\s+$//; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# restructure the list so each item is of the form [word, tag]. |
992
|
4322
|
|
|
|
|
9200
|
my $wordOrPhrase; |
993
|
|
|
|
|
|
|
my @taggerWordList; |
994
|
4322
|
|
|
|
|
11075
|
foreach my $item (@list) |
995
|
|
|
|
|
|
|
{ |
996
|
32712
|
100
|
|
|
|
92258
|
if (substr ($item, 0, 1) eq '/') |
997
|
|
|
|
|
|
|
{ |
998
|
|
|
|
|
|
|
# at this point the item in the list is a tag, so form the new [word,tag] pair |
999
|
|
|
|
|
|
|
# and save it. |
1000
|
16356
|
|
|
|
|
32512
|
my $wordInfo = []; |
1001
|
16356
|
|
|
|
|
39199
|
$wordInfo->[WORD_ORIGINAL] = $wordOrPhrase; |
1002
|
16356
|
|
|
|
|
35433
|
$wordInfo->[WORD_POSTAG] = $item; |
1003
|
16356
|
|
|
|
|
40379
|
push @taggerWordList, $wordInfo; |
1004
|
16356
|
|
|
|
|
34927
|
$wordOrPhrase = undef; |
1005
|
|
|
|
|
|
|
} |
1006
|
|
|
|
|
|
|
else |
1007
|
|
|
|
|
|
|
{ |
1008
|
|
|
|
|
|
|
# at this point we have a new word, so append it to $wordOrPhrase. |
1009
|
16356
|
50
|
|
|
|
38626
|
if (!defined ($wordOrPhrase)) |
1010
|
|
|
|
|
|
|
{ |
1011
|
16356
|
|
|
|
|
42972
|
$wordOrPhrase = $item; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
else |
1014
|
|
|
|
|
|
|
{ |
1015
|
0
|
|
|
|
|
0
|
$wordOrPhrase .= ' ' . $item; |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
} |
1018
|
|
|
|
|
|
|
} |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
# now partition the list into sentences. |
1021
|
4322
|
|
|
|
|
10570
|
my @sentences; |
1022
|
4322
|
|
|
|
|
10521
|
my $currentSentence = []; |
1023
|
4322
|
|
|
|
|
14299
|
foreach my $wordTag (@taggerWordList) |
1024
|
|
|
|
|
|
|
{ |
1025
|
16356
|
100
|
|
|
|
50494
|
if ($wordTag->[WORD_POSTAG] eq $Self->{endingSentenceTag}) |
1026
|
|
|
|
|
|
|
{ |
1027
|
|
|
|
|
|
|
# if the sentence has no words, skip it. |
1028
|
24
|
50
|
|
|
|
63
|
next unless ($#$currentSentence > -1); |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
# add the POS sentence ender tag to the end of the sentence list. |
1031
|
24
|
|
|
|
|
43
|
push @$currentSentence, $wordTag; |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
# add the sentence to the list of sentences. |
1034
|
24
|
|
|
|
|
37
|
push @sentences, $currentSentence; |
1035
|
24
|
|
|
|
|
58
|
$currentSentence = []; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
else |
1038
|
|
|
|
|
|
|
{ |
1039
|
16332
|
|
|
|
|
38092
|
push @$currentSentence, $wordTag; |
1040
|
|
|
|
|
|
|
} |
1041
|
|
|
|
|
|
|
} |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
# if there is a sentence left, save it. |
1044
|
4322
|
100
|
|
|
|
30933
|
if ($#$currentSentence > -1) |
1045
|
|
|
|
|
|
|
{ |
1046
|
4321
|
|
|
|
|
10486
|
push @sentences, $currentSentence; |
1047
|
|
|
|
|
|
|
} |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
# add the sentence id to each word. |
1050
|
4322
|
|
|
|
|
16517
|
for (my $i = 0; $i < @sentences; $i++) |
1051
|
|
|
|
|
|
|
{ |
1052
|
4345
|
|
|
|
|
19630
|
foreach my $word (@{$sentences[$i]}) |
|
4345
|
|
|
|
|
35625
|
|
1053
|
|
|
|
|
|
|
{ |
1054
|
16356
|
|
|
|
|
46344
|
$word->[WORD_SENTENCE_ID] = $i; |
1055
|
|
|
|
|
|
|
} |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# return the list of sentences of word,tag pairs. |
1059
|
4322
|
|
|
|
|
24404
|
return \@sentences; |
1060
|
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
# converts a list of strings, string refs, arrays of strings, etc... to a |
1064
|
|
|
|
|
|
|
# simpler list of strings; essentially flattening a list. |
1065
|
|
|
|
|
|
|
sub _flattenList |
1066
|
|
|
|
|
|
|
{ |
1067
|
4322
|
|
|
4322
|
|
10894
|
my ($Self, @Data) = @_; |
1068
|
|
|
|
|
|
|
|
1069
|
4322
|
|
|
|
|
6442
|
my @ListOfStrings; |
1070
|
4322
|
|
|
|
|
11793
|
foreach my $item (@_) |
1071
|
|
|
|
|
|
|
{ |
1072
|
8644
|
|
|
|
|
18556
|
my $type = ref ($item); |
1073
|
8644
|
100
|
|
|
|
64628
|
if ($type eq '') |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
{ |
1075
|
4321
|
|
|
|
|
15269
|
push @ListOfStrings, $item; |
1076
|
|
|
|
|
|
|
} |
1077
|
|
|
|
|
|
|
elsif ($type eq 'SCALAR') |
1078
|
|
|
|
|
|
|
{ |
1079
|
1
|
|
|
|
|
4
|
push @ListOfStrings, $$item; |
1080
|
|
|
|
|
|
|
} |
1081
|
|
|
|
|
|
|
elsif ($type eq 'ARRAY') |
1082
|
|
|
|
|
|
|
{ |
1083
|
0
|
|
|
|
|
0
|
push @ListOfStrings, @{$Self->_flattenList (@$item)}; |
|
0
|
|
|
|
|
0
|
|
1084
|
|
|
|
|
|
|
} |
1085
|
|
|
|
|
|
|
elsif ($type eq 'REF') |
1086
|
|
|
|
|
|
|
{ |
1087
|
0
|
|
|
|
|
0
|
push @ListOfStrings, @{$Self->_flattenList ($$item)}; |
|
0
|
|
|
|
|
0
|
|
1088
|
|
|
|
|
|
|
} |
1089
|
|
|
|
|
|
|
} |
1090
|
4322
|
|
|
|
|
14194
|
return \@ListOfStrings; |
1091
|
|
|
|
|
|
|
} |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# this method takes no parameters. it reads in the information about the |
1095
|
|
|
|
|
|
|
# part of speech tags stored in the DATA section of this file. the information |
1096
|
|
|
|
|
|
|
# is stored in the object at posTags as an array reference containing the |
1097
|
|
|
|
|
|
|
# array [POS tag, description, examples] for each possible tag. |
1098
|
|
|
|
|
|
|
sub _getPartOfSpeechTags |
1099
|
|
|
|
|
|
|
{ |
1100
|
4
|
|
|
4
|
|
15
|
my $Self = shift; |
1101
|
|
|
|
|
|
|
|
1102
|
4
|
|
|
|
|
268
|
my $posTags = |
1103
|
|
|
|
|
|
|
[ |
1104
|
|
|
|
|
|
|
['CC', 'Conjunction, coordinating', 'and, or'], |
1105
|
|
|
|
|
|
|
['CD', 'Adjective, cardinal number', '3, fifteen'], |
1106
|
|
|
|
|
|
|
['DET', 'Determiner', 'this, each, some'], |
1107
|
|
|
|
|
|
|
['EX', 'Pronoun, existential there', 'there'], |
1108
|
|
|
|
|
|
|
['FW', 'Foreign words', ''], |
1109
|
|
|
|
|
|
|
['IN', 'Preposition / Conjunction', 'for, of, although, that'], |
1110
|
|
|
|
|
|
|
['JJ', 'Adjective', 'happy, bad'], |
1111
|
|
|
|
|
|
|
['JJR', 'Adjective, comparative', 'happier, worse'], |
1112
|
|
|
|
|
|
|
['JJS', 'Adjective, superlative', 'happiest, worst'], |
1113
|
|
|
|
|
|
|
['LS', 'Symbol, list item', 'A, A.'], |
1114
|
|
|
|
|
|
|
['MD', 'Verb, modal', "can, could, 'll"], |
1115
|
|
|
|
|
|
|
['NN', 'Noun', 'aircraft, data'], |
1116
|
|
|
|
|
|
|
['NNP', 'Noun, proper', 'London, Michael'], |
1117
|
|
|
|
|
|
|
['NNPS', 'Noun, proper, plural', 'Australians, Methodists'], |
1118
|
|
|
|
|
|
|
['NNS', 'Noun, plural', 'women, books'], |
1119
|
|
|
|
|
|
|
['PDT', 'Determiner, prequalifier', 'quite, all, half'], |
1120
|
|
|
|
|
|
|
['POS', 'Possessive', "'s"], |
1121
|
|
|
|
|
|
|
['PRP', 'Determiner, possessive second', 'mine, yours'], |
1122
|
|
|
|
|
|
|
['PRPS', 'Determiner, possessive', 'their, your'], |
1123
|
|
|
|
|
|
|
['RB', 'Adverb', 'often, not, very, here'], |
1124
|
|
|
|
|
|
|
['RBR', 'Adverb, comparative', 'faster'], |
1125
|
|
|
|
|
|
|
['RBS', 'Adverb, superlative', 'fastest'], |
1126
|
|
|
|
|
|
|
['RP', 'Adverb, particle', 'up, off, out'], |
1127
|
|
|
|
|
|
|
['SYM', 'Symbol', '*'], |
1128
|
|
|
|
|
|
|
['TO', 'Preposition', 'to'], |
1129
|
|
|
|
|
|
|
['UH', 'Interjection', 'oh, yes, mmm'], |
1130
|
|
|
|
|
|
|
['VB', 'Verb, infinitive', 'take, live'], |
1131
|
|
|
|
|
|
|
['VBD', 'Verb, past tense', 'took, lived'], |
1132
|
|
|
|
|
|
|
['VBG', 'Verb, gerund', 'taking, living'], |
1133
|
|
|
|
|
|
|
['VBN', 'Verb, past/passive participle', 'taken, lived'], |
1134
|
|
|
|
|
|
|
['VBP', 'Verb, base present form', 'take, live'], |
1135
|
|
|
|
|
|
|
['VBZ', 'Verb, present 3SG -s form', 'takes, lives'], |
1136
|
|
|
|
|
|
|
['WDT', 'Determiner, question', 'which, whatever'], |
1137
|
|
|
|
|
|
|
['WP', 'Pronoun, question', 'who, whoever'], |
1138
|
|
|
|
|
|
|
['WPS', 'Determiner, possessive & question', 'whose'], |
1139
|
|
|
|
|
|
|
['WRB', 'Adverb, question', 'when, how, however'], |
1140
|
|
|
|
|
|
|
['PP', 'Punctuation, sentence ender', '., !, ?'], |
1141
|
|
|
|
|
|
|
['PPC', 'Punctuation, comma', ','], |
1142
|
|
|
|
|
|
|
['PGP', 'Punctuation, whitespace, gap', ' '], |
1143
|
|
|
|
|
|
|
['PPD', 'Punctuation, dollar sign', '$'], |
1144
|
|
|
|
|
|
|
['PPL', 'Punctuation, quotation mark left', '``'], |
1145
|
|
|
|
|
|
|
['PPR', 'Punctuation, quotation mark right', "''"], |
1146
|
|
|
|
|
|
|
['PPS', 'Punctuation, colon, semicolon, elipsis', ':, ..., -'], |
1147
|
|
|
|
|
|
|
['LRB', 'Punctuation, left bracket', '(, {, ['], |
1148
|
|
|
|
|
|
|
['RRB', 'Punctuation, right bracket', '), }, ]'] |
1149
|
|
|
|
|
|
|
]; |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
# store the pos array. |
1152
|
4
|
|
|
|
|
23
|
$Self->{posTags} = $posTags; |
1153
|
4
|
|
|
|
|
16
|
return; |
1154
|
|
|
|
|
|
|
} |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
sub _getListOfPOSTagsFromPOSTypesList |
1158
|
|
|
|
|
|
|
{ |
1159
|
1
|
|
|
1
|
|
3
|
my ($Self, %Parameters) = @_; |
1160
|
|
|
|
|
|
|
|
1161
|
1
|
|
|
|
|
3
|
my $listOfPOSTypesToKeep = []; |
1162
|
1
|
50
|
|
|
|
4
|
$listOfPOSTypesToKeep = $Parameters{listOfPOSTypesToKeep} if exists $Parameters{listOfPOSTypesToKeep}; |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
# ALL, NOUNS, VERBS, PUNCTUATION, ADJECTIVES, ADVERBS, CONTENT_WORDS, TEXTRANK_WORDS, VERBS |
1165
|
1
|
|
|
|
|
2
|
my @listOfPOSTagsToKeep; |
1166
|
1
|
|
|
|
|
6
|
foreach my $type (@$listOfPOSTypesToKeep) |
1167
|
|
|
|
|
|
|
{ |
1168
|
0
|
|
|
|
|
0
|
my $ucType = uc $type; |
1169
|
0
|
|
|
|
|
0
|
$ucType =~ tr/A-Z\_//cd; |
1170
|
0
|
0
|
|
|
|
0
|
if (($ucType cmp 'NOUNS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_NOUN; } |
|
0
|
0
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1171
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'VERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_VERB; } |
1172
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'ADJECTIVES') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADJECTIVE; } |
1173
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ADVERB; } |
1174
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'CONTENT_ADVERBS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT_ADVERB; } |
1175
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'PUNCTUATION') == 0) { push @listOfPOSTagsToKeep, POSTAGS_PUNCTUATION; } |
1176
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'CONTENT_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_CONTENT; } |
1177
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'TEXTRANK_WORDS') == 0) { push @listOfPOSTagsToKeep, POSTAGS_TEXTRANK; } |
1178
|
0
|
|
|
|
|
0
|
elsif (($ucType cmp 'ALL') == 0) { push @listOfPOSTagsToKeep, POSTAGS_ALL; } |
1179
|
|
|
|
|
|
|
} |
1180
|
|
|
|
|
|
|
|
1181
|
1
|
|
|
|
|
4
|
return \@listOfPOSTagsToKeep; |
1182
|
|
|
|
|
|
|
} |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
# returns a hash reference of the part of speech tags that are |
1186
|
|
|
|
|
|
|
# provided in listOfPOSTagsToKeep. |
1187
|
|
|
|
|
|
|
sub _getHashOfPOSTagsToKeep |
1188
|
|
|
|
|
|
|
{ |
1189
|
304
|
|
|
304
|
|
1410
|
my ($Self, %Parameters) = @_; |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
# get the list of POS tags to keep. |
1192
|
304
|
|
|
|
|
910
|
my $listOfPOSTagsToKeep; |
1193
|
304
|
100
|
66
|
|
|
4823
|
if (!exists ($Parameters{listOfPOSTagsToKeep}) && !exists ($Parameters{listOfPOSTypesToKeep})) |
1194
|
|
|
|
|
|
|
{ |
1195
|
303
|
100
|
|
|
|
3525
|
return $Self->{hashOfPOSTagsToKeep} unless exists $Parameters{instantiation}; |
1196
|
3
|
|
|
|
|
22
|
$listOfPOSTagsToKeep = [POSTAGS_TEXTRANK]; |
1197
|
|
|
|
|
|
|
} |
1198
|
|
|
|
|
|
|
else |
1199
|
|
|
|
|
|
|
{ |
1200
|
1
|
|
|
|
|
6
|
$listOfPOSTagsToKeep = $Self->_getListOfPOSTagsFromPOSTypesList (%Parameters); |
1201
|
1
|
50
|
|
|
|
4
|
push @$listOfPOSTagsToKeep, @{$Parameters{listOfPOSTagsToKeep}} if (exists ($Parameters{listOfPOSTagsToKeep})); |
|
1
|
|
|
|
|
22
|
|
1202
|
|
|
|
|
|
|
} |
1203
|
|
|
|
|
|
|
|
1204
|
4
|
|
|
|
|
11
|
my %hashOfPOSTagsToKeep; |
1205
|
4
|
|
|
|
|
16
|
foreach my $pos (@$listOfPOSTagsToKeep) |
1206
|
|
|
|
|
|
|
{ |
1207
|
68
|
|
|
|
|
76
|
my $posClean = uc $pos; |
1208
|
68
|
|
|
|
|
72
|
$posClean =~ tr/A-Z0-9//cd; |
1209
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
# prefix with a slash. |
1211
|
68
|
|
|
|
|
172
|
$hashOfPOSTagsToKeep{'/' . $posClean} = 1; |
1212
|
|
|
|
|
|
|
} |
1213
|
|
|
|
|
|
|
|
1214
|
4
|
|
|
|
|
31
|
return \%hashOfPOSTagsToKeep; |
1215
|
|
|
|
|
|
|
} |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
=head2 C |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
The method C returns an array reference of the sorted stemmed |
1220
|
|
|
|
|
|
|
words in the text given by C. |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
=over |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=item C |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
listOfStemmedTaggedSentences => [...] |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
C is the array reference returned by C or |
1229
|
|
|
|
|
|
|
C of the text. |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=back |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
use Text::StemTagPOS; |
1234
|
|
|
|
|
|
|
use Data::Dump qw(dump); |
1235
|
|
|
|
|
|
|
my $stemTagger = Text::StemTagPOS->new; |
1236
|
|
|
|
|
|
|
my $text = 'The first sentence. Sentence number two.'; |
1237
|
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $stemTagger->getStemmedAndTaggedText ($text); |
1238
|
|
|
|
|
|
|
dump $listOfStemmedTaggedSentences; |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
=cut |
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
sub getListOfStemmedWordsInText |
1243
|
|
|
|
|
|
|
{ |
1244
|
0
|
|
|
0
|
1
|
|
my ($Self, %Parameters) = @_; |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
# get the hash of tags to keep. |
1247
|
0
|
|
|
|
|
|
my $hashOfPOSToKeep = $Self->_getHashOfPOSTagsToKeep (%Parameters); |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
# get the tagged text. |
1250
|
0
|
|
|
|
|
|
my $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences}; |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
# create a list of the filtered stemmed words of the text. |
1253
|
0
|
|
|
|
|
|
my %wordsOfText; |
1254
|
0
|
|
|
|
|
|
foreach my $sentence (@$listOfStemmedTaggedSentences) |
1255
|
|
|
|
|
|
|
{ |
1256
|
0
|
|
|
|
|
|
foreach my $word (@$sentence) |
1257
|
|
|
|
|
|
|
{ |
1258
|
|
|
|
|
|
|
# skip over the words types to ignore. |
1259
|
0
|
0
|
|
|
|
|
next unless (exists ($hashOfPOSToKeep->{$word->[WORD_POSTAG]})); |
1260
|
0
|
|
|
|
|
|
$wordsOfText{$word->[WORD_STEMMED]} = 1; |
1261
|
|
|
|
|
|
|
} |
1262
|
|
|
|
|
|
|
} |
1263
|
0
|
|
|
|
|
|
my @words = sort keys %wordsOfText; |
1264
|
0
|
|
|
|
|
|
return \@words; |
1265
|
|
|
|
|
|
|
} |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
=head2 C |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
The method C returns an array reference of the sorted stemmed |
1270
|
|
|
|
|
|
|
words of the intersection of all the words in the documents given by C; |
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
=over |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=item C |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
listOfStemmedTaggedDocuments => [...] |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
C is a list of document references returned by C or |
1279
|
|
|
|
|
|
|
C. |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
=back |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
=cut |
1284
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
sub getListOfStemmedWordsInAllDocuments |
1286
|
|
|
|
|
|
|
{ |
1287
|
0
|
|
|
0
|
1
|
|
my ($Self, %Parameters) = @_; |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
# get the list of documents to process. |
1290
|
0
|
|
|
|
|
|
my $listOfStemmedTaggedDocuments = $Parameters{listOfStemmedTaggedDocuments}; |
1291
|
0
|
|
|
|
|
|
my $totalDocuments = @$listOfStemmedTaggedDocuments; |
1292
|
0
|
|
|
|
|
|
my %wordOccurence; |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
# put the words of the first document into a hash. |
1295
|
0
|
|
|
|
|
|
my $sentences = $listOfStemmedTaggedDocuments->[0]; |
1296
|
0
|
|
|
|
|
|
foreach my $sentence (@$sentences) |
1297
|
|
|
|
|
|
|
{ |
1298
|
0
|
|
|
|
|
|
foreach my $word (@$sentence) |
1299
|
|
|
|
|
|
|
{ |
1300
|
0
|
|
|
|
|
|
$wordOccurence{$word->[WORD_STEMMED]} = 1; |
1301
|
|
|
|
|
|
|
} |
1302
|
|
|
|
|
|
|
} |
1303
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
# add the words from the remaining documents, only if the word |
1305
|
|
|
|
|
|
|
# occurred in all previous documents. |
1306
|
0
|
|
|
|
|
|
for (my $i = 1; $i < $totalDocuments; $i++) |
1307
|
|
|
|
|
|
|
{ |
1308
|
0
|
|
|
|
|
|
my $sentences = $listOfStemmedTaggedDocuments->[$i]; |
1309
|
0
|
|
|
|
|
|
foreach my $sentence (@$sentences) |
1310
|
|
|
|
|
|
|
{ |
1311
|
0
|
|
|
|
|
|
foreach my $word (@$sentence) |
1312
|
|
|
|
|
|
|
{ |
1313
|
|
|
|
|
|
|
# if the word has not occurred in a previous document, skip it. |
1314
|
0
|
0
|
|
|
|
|
next unless exists $wordOccurence{$word->[WORD_STEMMED]}; |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
# if the word has not occurred in all previous documents, delete it. |
1317
|
0
|
0
|
|
|
|
|
if ($wordOccurence{$word->[WORD_STEMMED]} < $i) |
1318
|
|
|
|
|
|
|
{ |
1319
|
0
|
|
|
|
|
|
delete $wordOccurence{$word->[WORD_STEMMED]}; |
1320
|
0
|
|
|
|
|
|
next; |
1321
|
|
|
|
|
|
|
} |
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
# set the words occurence. |
1324
|
0
|
|
|
|
|
|
$wordOccurence{$word->[WORD_STEMMED]} = $i + 1; |
1325
|
|
|
|
|
|
|
} |
1326
|
|
|
|
|
|
|
} |
1327
|
|
|
|
|
|
|
} |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
# return the words that occured in all the documents. |
1330
|
0
|
|
|
|
|
|
my @wordsInAll; |
1331
|
0
|
|
|
|
|
|
while (my ($word, $occurence) = each %wordOccurence) |
1332
|
|
|
|
|
|
|
{ |
1333
|
0
|
0
|
|
|
|
|
push @wordsInAll, $word if ($occurence == $totalDocuments); |
1334
|
|
|
|
|
|
|
} |
1335
|
0
|
|
|
|
|
|
@wordsInAll = sort @wordsInAll; |
1336
|
0
|
|
|
|
|
|
return \@wordsInAll; |
1337
|
|
|
|
|
|
|
} |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
=head1 INSTALLATION |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
To install the module run the following commands: |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
perl Makefile.PL |
1344
|
|
|
|
|
|
|
make |
1345
|
|
|
|
|
|
|
make test |
1346
|
|
|
|
|
|
|
make install |
1347
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
If you are on a windows box you should use 'nmake' rather than 'make'. |
1349
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
=head1 AUTHOR |
1351
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
Jeff Kubina |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
=head1 BUGS |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
Please email bugs reports or feature requests to C, or through |
1357
|
|
|
|
|
|
|
the web interface at L. The author |
1358
|
|
|
|
|
|
|
will be notified and you can be automatically notified of progress on the bug fix or feature request. |
1359
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
=head1 COPYRIGHT |
1361
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
Copyright (c) 2010 Jeff Kubina. All rights reserved. |
1363
|
|
|
|
|
|
|
This program is free software; you can redistribute |
1364
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
The full text of the license can be found in the |
1367
|
|
|
|
|
|
|
LICENSE file included with this module. |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
=head1 KEYWORDS |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
natural language processing, NLP, part of speech tagging, POS, stemming |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head1 SEE ALSO |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
L, L, L, L, |
1376
|
|
|
|
|
|
|
L, L |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
=begin html |
1379
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
See the Lingua::EN::Tagger README |
1381
|
|
|
|
|
|
|
file for a list of the part-of-speech tags. |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
=end html |
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
=cut |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
1; |
1388
|
|
|
|
|
|
|
# The preceding line will help the module return a true value |