| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Pod::Wordlist; |
|
2
|
7
|
|
|
7
|
|
223381
|
use 5.008; |
|
|
7
|
|
|
|
|
20
|
|
|
3
|
7
|
|
|
7
|
|
33
|
use strict; |
|
|
7
|
|
|
|
|
13
|
|
|
|
7
|
|
|
|
|
158
|
|
|
4
|
7
|
|
|
7
|
|
25
|
use warnings; |
|
|
7
|
|
|
|
|
8
|
|
|
|
7
|
|
|
|
|
547
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '1.27'; |
|
7
|
|
|
|
|
|
|
|
|
8
|
7
|
|
|
7
|
|
5612
|
use Lingua::EN::Inflect 'PL'; |
|
|
7
|
|
|
|
|
156246
|
|
|
|
7
|
|
|
|
|
1149
|
|
|
9
|
7
|
|
|
7
|
|
86
|
use File::Spec (); |
|
|
7
|
|
|
|
|
13
|
|
|
|
7
|
|
|
|
|
206
|
|
|
10
|
|
|
|
|
|
|
use constant { |
|
11
|
7
|
|
|
|
|
777
|
_MAXWORDLENGTH => 50, |
|
12
|
7
|
|
|
7
|
|
46
|
}; |
|
|
7
|
|
|
|
|
10
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Class::Tiny { |
|
15
|
7
|
|
|
|
|
58
|
wordlist => \&_copy_wordlist, |
|
16
|
|
|
|
|
|
|
_is_debug => 0, |
|
17
|
|
|
|
|
|
|
no_wide_chars => 0, |
|
18
|
7
|
|
|
7
|
|
4267
|
}; |
|
|
7
|
|
|
|
|
13338
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our %Wordlist; ## no critic ( Variables::ProhibitPackageVars ) |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub _copy_wordlist { |
|
23
|
18
|
|
|
18
|
|
147
|
my %copy; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# %Wordlist can be accessed externally, and users will often add terms in |
|
26
|
|
|
|
|
|
|
# encoded form |
|
27
|
18
|
|
|
|
|
11174
|
for my $word ( keys %Wordlist ) { |
|
28
|
43758
|
|
|
|
|
38324
|
my $decoded_word = $word; |
|
29
|
|
|
|
|
|
|
# if it was already decoded, this should do nothing |
|
30
|
43758
|
|
|
|
|
53916
|
utf8::decode($decoded_word); |
|
31
|
43758
|
|
|
|
|
63301
|
$copy{$decoded_word} = 1; |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
18
|
|
|
|
|
1844
|
return \%copy; |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
BEGIN { |
|
38
|
7
|
|
|
7
|
|
5317
|
my $file; |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# try to find wordlist in non-installed dist |
|
41
|
7
|
|
|
|
|
158
|
my ($d, $p) = File::Spec->splitpath(__FILE__); |
|
42
|
7
|
|
|
|
|
163
|
$p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share'); |
|
43
|
7
|
|
|
|
|
107
|
my $full_path = File::Spec->catpath($d, $p, 'wordlist'); |
|
44
|
7
|
50
|
33
|
|
|
209
|
if ($full_path && -e $full_path) { |
|
45
|
0
|
|
|
|
|
0
|
$file = $full_path; |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
|
|
48
|
7
|
50
|
|
|
|
28
|
if ( not defined $file ) { |
|
49
|
7
|
|
|
|
|
3350
|
require File::ShareDir; |
|
50
|
7
|
|
|
|
|
189029
|
$file = File::ShareDir::dist_file('Pod-Spell', 'wordlist'); |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
|
|
53
|
7
|
50
|
|
7
|
|
1593
|
open my $fh, '<:encoding(UTF-8)', $file |
|
|
7
|
|
|
|
|
3968
|
|
|
|
7
|
|
|
|
|
100
|
|
|
|
7
|
|
|
|
|
34
|
|
|
54
|
|
|
|
|
|
|
or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping) |
|
55
|
7
|
|
|
|
|
66345
|
while ( defined( my $line = readline $fh ) ) { |
|
56
|
8540
|
|
|
|
|
1748686
|
chomp $line; |
|
57
|
8540
|
|
|
|
|
17453
|
$Wordlist{$line} = 1; |
|
58
|
8540
|
|
|
|
|
14220
|
$Wordlist{PL($line)} = 1; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
7
|
|
|
|
|
8912
|
close $fh; |
|
61
|
|
|
|
|
|
|
} |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub learn_stopwords { |
|
64
|
21
|
|
|
21
|
1
|
295483
|
my ( $self, $text ) = @_; |
|
65
|
21
|
|
|
|
|
582
|
my $stopwords = $self->wordlist; |
|
66
|
|
|
|
|
|
|
|
|
67
|
21
|
|
|
|
|
266
|
while ( $text =~ m<(\S+)>g ) { |
|
68
|
45
|
|
|
|
|
492
|
my $word = $1; |
|
69
|
45
|
|
|
|
|
148
|
utf8::decode($word); |
|
70
|
45
|
100
|
|
|
|
103
|
if ( $word =~ m/^!(.+)/s ) { |
|
71
|
|
|
|
|
|
|
# "!word" deletes from the stopword list |
|
72
|
2
|
|
|
|
|
4
|
my $negation = $1; |
|
73
|
|
|
|
|
|
|
# different $1 from above |
|
74
|
2
|
|
|
|
|
4
|
delete $stopwords->{$negation}; |
|
75
|
2
|
|
|
|
|
6
|
delete $stopwords->{PL($negation)}; |
|
76
|
2
|
100
|
|
|
|
423
|
print "Unlearning stopword <$negation>\n" if $self->_is_debug; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
else { |
|
79
|
43
|
|
|
|
|
94
|
$word =~ s{'s$}{}; # we strip 's when checking so strip here, too |
|
80
|
43
|
|
|
|
|
96
|
$stopwords->{$word} = 1; |
|
81
|
43
|
|
|
|
|
170
|
$stopwords->{PL($word)} = 1; |
|
82
|
43
|
100
|
|
|
|
12464
|
print "Learning stopword <$word>\n" if $self->_is_debug; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} |
|
85
|
21
|
|
|
|
|
242
|
return; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub is_stopword { |
|
89
|
161
|
|
|
161
|
1
|
184
|
my ($self, $word) = @_; |
|
90
|
161
|
|
|
|
|
1692
|
my $stopwords = $self->wordlist; |
|
91
|
161
|
100
|
100
|
|
|
761
|
if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) { |
|
92
|
71
|
100
|
|
|
|
718
|
print " Rejecting <$word>\n" if $self->_is_debug; |
|
93
|
71
|
|
|
|
|
263
|
return 1; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
90
|
|
|
|
|
311
|
return; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub strip_stopwords { |
|
99
|
38
|
|
|
38
|
1
|
228
|
my ($self, $text) = @_; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Count the things in $text |
|
102
|
38
|
100
|
|
|
|
844
|
print "Content: <", $text, ">\n" if $self->_is_debug; |
|
103
|
|
|
|
|
|
|
|
|
104
|
38
|
|
|
|
|
395
|
my @words = grep { length($_) < _MAXWORDLENGTH } split " ", $text; |
|
|
211
|
|
|
|
|
391
|
|
|
105
|
|
|
|
|
|
|
|
|
106
|
38
|
|
|
|
|
108
|
for ( @words ) { |
|
107
|
211
|
100
|
|
|
|
2577
|
print "Parsing word: <$_>\n" if $self->_is_debug; |
|
108
|
|
|
|
|
|
|
# some spellcheckers can't cope with anything but Latin1 |
|
109
|
211
|
100
|
100
|
|
|
2714
|
$_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/; |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# strip leading punctuation |
|
112
|
211
|
|
|
|
|
885
|
s/^[\(\[\{\'\"\:\;\,\?\!\.]+//; |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# keep everything up to trailing punctuation, not counting |
|
115
|
|
|
|
|
|
|
# periods (for abbreviations like "Ph.D."), single-quotes |
|
116
|
|
|
|
|
|
|
# (for contractions like "don't") or colons (for package |
|
117
|
|
|
|
|
|
|
# names like "Foo::Bar") |
|
118
|
211
|
|
|
|
|
513
|
s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/; |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# strip trailing single-quote, periods or colons; after this |
|
121
|
|
|
|
|
|
|
# we have a word that could have internal periods or quotes |
|
122
|
211
|
|
|
|
|
320
|
s/[\.\'\:]+$//; |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# strip possessive |
|
125
|
211
|
|
|
|
|
217
|
s/'s$//i; |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# zero out variable names or things with internal symbols, |
|
128
|
|
|
|
|
|
|
# since those are probably code expressions outside a C<> |
|
129
|
211
|
|
|
|
|
270
|
my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/; |
|
130
|
211
|
|
|
|
|
251
|
my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/; |
|
131
|
211
|
100
|
100
|
|
|
473
|
$_ = '' if $is_sigil || $is_strange; |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# stop if there are no "word" characters left; if it's just |
|
134
|
|
|
|
|
|
|
# punctuation that we didn't happen to strip or it's weird glyphs, |
|
135
|
|
|
|
|
|
|
# the spellchecker won't do any good anyway |
|
136
|
211
|
100
|
|
|
|
366
|
next unless /\w/; |
|
137
|
|
|
|
|
|
|
|
|
138
|
156
|
100
|
|
|
|
1683
|
print " Checking as <$_>\n" if $self->_is_debug; |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# replace it with any stopword or stopword parts stripped |
|
141
|
156
|
|
|
|
|
558
|
$_ = $self->_strip_a_word($_); |
|
142
|
|
|
|
|
|
|
|
|
143
|
156
|
100
|
100
|
|
|
1086
|
print " Keeping as <$_>\n" if $_ && $self->_is_debug; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
38
|
50
|
|
|
|
128
|
return join(" ", grep { defined && length } @words ); |
|
|
211
|
|
|
|
|
512
|
|
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _strip_a_word { |
|
150
|
156
|
|
|
156
|
|
198
|
my ($self, $word) = @_; |
|
151
|
156
|
|
|
|
|
145
|
my $remainder; |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# try word as-is, including possible hyphenation vs stoplist |
|
154
|
156
|
100
|
|
|
|
237
|
if ($self->is_stopword($word) ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
155
|
69
|
|
|
|
|
71
|
$remainder = ''; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
# internal period could be abbreviations, so check with |
|
158
|
|
|
|
|
|
|
# trailing period restored and drop or keep on that basis |
|
159
|
|
|
|
|
|
|
elsif ( index($word, '.') >= 0 ) { |
|
160
|
2
|
|
|
|
|
5
|
my $abbr = "$word."; |
|
161
|
2
|
100
|
|
|
|
7
|
$remainder = $self->is_stopword($abbr) ? '' : $abbr; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
# check individual parts of hyphenated word, keep whatever isn't a |
|
164
|
|
|
|
|
|
|
# stopword as individual words |
|
165
|
|
|
|
|
|
|
elsif ( index($word, '-') >= 0 ) { |
|
166
|
1
|
|
|
|
|
2
|
my @keep; |
|
167
|
1
|
|
|
|
|
3
|
for my $part ( split /-/, $word ) { |
|
168
|
3
|
100
|
|
|
|
14
|
push @keep, $part if ! $self->is_stopword( $part ); |
|
169
|
|
|
|
|
|
|
} |
|
170
|
1
|
50
|
|
|
|
4
|
$remainder = join(" ", @keep) if @keep; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
# otherwise, we just keep it |
|
173
|
|
|
|
|
|
|
else { |
|
174
|
84
|
|
|
|
|
133
|
$remainder = $word; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
156
|
|
|
|
|
232
|
return $remainder; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
1; |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
__END__ |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=pod |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=encoding UTF-8 |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=for :stopwords Sean M. Burke Caleb Cushing Olivier Mengué |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 NAME |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Pod::Wordlist - English words that come up in Perl documentation |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 VERSION |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
version 1.27 |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Pod::Wordlist is used by L<Pod::Spell|Pod::Spell>, providing a set of words |
|
200
|
|
|
|
|
|
|
that are English jargon words that come up in Perl documentation, but which are |
|
201
|
|
|
|
|
|
|
not to be found in general English lexicons. (For example: autovivify, |
|
202
|
|
|
|
|
|
|
backreference, chroot, stringify, wantarray.) |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
You can also use this wordlist with your word processor by just |
|
205
|
|
|
|
|
|
|
pasting C<share/wordlist>'s content into your wordprocessor, deleting |
|
206
|
|
|
|
|
|
|
the leading Perl code so that only the wordlist remains, and then |
|
207
|
|
|
|
|
|
|
spellchecking this resulting list and adding every word in it to your |
|
208
|
|
|
|
|
|
|
private lexicon. |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 METHODS |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 learn_stopwords |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
$wordlist->learn_stopwords( $text ); |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
Modifies the stopword list based on a text block. See the rules |
|
217
|
|
|
|
|
|
|
for L<adding stopwords|Pod::Spell/ADDING STOPWORDS> for details. |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=head2 is_stopword |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
if ( $wordlist->is_stopword( $word ) ) { ... } |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Returns true if the word is found in the stopword list. |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head2 strip_stopwords |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
my $out = $wordlist->strip_stopwords( $text ); |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Returns a string with space separated words from the original |
|
230
|
|
|
|
|
|
|
text with stopwords removed. |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 wordlist |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
ref $self->wordlist eq 'HASH'; # true |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
This is the instance of the wordlist |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=head2 no_wide_chars |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
If true, words with characters outside the Latin-1 range C<0x00> to C<0xFF> will |
|
243
|
|
|
|
|
|
|
be stripped like stopwords. |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 WORDLIST |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Note that the scope of this file is only English, specifically American |
|
248
|
|
|
|
|
|
|
English. (But you may find in useful to incorporate into your own |
|
249
|
|
|
|
|
|
|
lexicons, even if they are for other dialects/languages.) |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
remove any q{'s} before adding to the list. |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
The list should be sorted and uniqued. The following will work (with GNU |
|
254
|
|
|
|
|
|
|
Coreutils ). |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sort share/wordlist -u > /tmp/sorted && mv /tmp/sorted share/wordlist |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head1 BUGS |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Please report any bugs or feature requests on the bugtracker website |
|
261
|
|
|
|
|
|
|
L<https://rt.cpan.org/Public/Dist/Display.html?Name=Pod-Spell> or by email |
|
262
|
|
|
|
|
|
|
to L<bug-Pod-Spell@rt.cpan.org|mailto:bug-Pod-Spell@rt.cpan.org>. |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
When submitting a bug or request, please include a test-file or a |
|
265
|
|
|
|
|
|
|
patch to an existing test-file that illustrates the bug or desired |
|
266
|
|
|
|
|
|
|
feature. |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
=head1 AUTHORS |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=over 4 |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=item * |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Sean M. Burke <sburke@cpan.org> |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=item * |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Caleb Cushing <xenoterracide@gmail.com> |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=back |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
This software is Copyright (c) 2024 by Olivier Mengué. |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
This is free software, licensed under: |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
The Artistic License 2.0 (GPL Compatible) |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |