line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Pod::Wordlist; |
2
|
6
|
|
|
6
|
|
57488
|
use 5.008; |
|
6
|
|
|
|
|
24
|
|
3
|
6
|
|
|
6
|
|
28
|
use strict; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
127
|
|
4
|
6
|
|
|
6
|
|
29
|
use warnings; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
269
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '1.24'; # TRIAL |
7
|
|
|
|
|
|
|
|
8
|
6
|
|
|
6
|
|
4080
|
use Lingua::EN::Inflect 'PL'; |
|
6
|
|
|
|
|
124430
|
|
|
6
|
|
|
|
|
679
|
|
9
|
6
|
|
|
6
|
|
63
|
use File::Spec (); |
|
6
|
|
|
|
|
37
|
|
|
6
|
|
|
|
|
134
|
|
10
|
|
|
|
|
|
|
use constant { |
11
|
6
|
|
|
|
|
628
|
MAXWORDLENGTH => 50, |
12
|
6
|
|
|
6
|
|
32
|
}; |
|
6
|
|
|
|
|
9
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Class::Tiny { |
15
|
6
|
|
|
|
|
46
|
wordlist => \&_copy_wordlist, |
16
|
|
|
|
|
|
|
_is_debug => 0, |
17
|
|
|
|
|
|
|
no_wide_chars => 0, |
18
|
6
|
|
|
6
|
|
2985
|
}; |
|
6
|
|
|
|
|
9460
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our %Wordlist; ## no critic ( Variables::ProhibitPackageVars ) |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub _copy_wordlist { |
23
|
8
|
|
|
8
|
|
79
|
my %copy; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# %Wordlist can be accessed externally, and users will often add terms in |
26
|
|
|
|
|
|
|
# encoded form |
27
|
8
|
|
|
|
|
4867
|
for my $word ( keys %Wordlist ) { |
28
|
19448
|
|
|
|
|
19364
|
my $decoded_word = $word; |
29
|
|
|
|
|
|
|
# if it was already decoded, this should do nothing |
30
|
19448
|
|
|
|
|
27243
|
utf8::decode($decoded_word); |
31
|
19448
|
|
|
|
|
30258
|
$copy{$decoded_word} = 1; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
8
|
|
|
|
|
789
|
return \%copy; |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
BEGIN { |
38
|
6
|
|
|
6
|
|
3946
|
my $file; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# try to find wordlist in non-installed dist |
41
|
6
|
|
|
|
|
128
|
my ($d, $p) = File::Spec->splitpath(__FILE__); |
42
|
6
|
|
|
|
|
85
|
$p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share'); |
43
|
6
|
|
|
|
|
76
|
my $full_path = File::Spec->catpath($d, $p, 'wordlist'); |
44
|
6
|
50
|
33
|
|
|
156
|
if ($full_path && -e $full_path) { |
45
|
0
|
|
|
|
|
0
|
$file = $full_path; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
6
|
50
|
|
|
|
26
|
if ( not defined $file ) { |
49
|
6
|
|
|
|
|
2753
|
require File::ShareDir; |
50
|
6
|
|
|
|
|
135362
|
$file = File::ShareDir::dist_file('Pod-Spell', 'wordlist'); |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
6
|
50
|
|
6
|
|
1229
|
open my $fh, '<:encoding(UTF-8)', $file |
|
6
|
|
|
|
|
37
|
|
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
62
|
|
54
|
|
|
|
|
|
|
or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping) |
55
|
6
|
|
|
|
|
62931
|
while ( defined( my $line = readline $fh ) ) { |
56
|
7320
|
|
|
|
|
1498139
|
chomp $line; |
57
|
7320
|
|
|
|
|
21276
|
$Wordlist{$line} = 1; |
58
|
7320
|
|
|
|
|
12083
|
$Wordlist{PL($line)} = 1; |
59
|
|
|
|
|
|
|
} |
60
|
6
|
|
|
|
|
6464
|
close $fh; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub learn_stopwords { |
64
|
11
|
|
|
11
|
1
|
2304
|
my ( $self, $text ) = @_; |
65
|
11
|
|
|
|
|
245
|
my $stopwords = $self->wordlist; |
66
|
|
|
|
|
|
|
|
67
|
11
|
|
|
|
|
92
|
while ( $text =~ m<(\S+)>g ) { |
68
|
25
|
|
|
|
|
487
|
my $word = $1; |
69
|
25
|
|
|
|
|
65
|
utf8::decode($word); |
70
|
25
|
100
|
|
|
|
62
|
if ( $word =~ m/^!(.+)/s ) { |
71
|
|
|
|
|
|
|
# "!word" deletes from the stopword list |
72
|
2
|
|
|
|
|
6
|
my $negation = $1; |
73
|
|
|
|
|
|
|
# different $1 from above |
74
|
2
|
|
|
|
|
5
|
delete $stopwords->{$negation}; |
75
|
2
|
|
|
|
|
8
|
delete $stopwords->{PL($negation)}; |
76
|
2
|
100
|
|
|
|
419
|
print "Unlearning stopword <$negation>\n" if $self->_is_debug; |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
else { |
79
|
23
|
|
|
|
|
41
|
$word =~ s{'s$}{}; # we strip 's when checking so strip here, too |
80
|
23
|
|
|
|
|
55
|
$stopwords->{$word} = 1; |
81
|
23
|
|
|
|
|
73
|
$stopwords->{PL($word)} = 1; |
82
|
23
|
100
|
|
|
|
5795
|
print "Learning stopword <$word>\n" if $self->_is_debug; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
} |
85
|
11
|
|
|
|
|
234
|
return; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub is_stopword { |
89
|
56
|
|
|
56
|
1
|
77
|
my ($self, $word) = @_; |
90
|
56
|
|
|
|
|
863
|
my $stopwords = $self->wordlist; |
91
|
56
|
100
|
100
|
|
|
363
|
if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) { |
92
|
21
|
100
|
|
|
|
264
|
print " Rejecting <$word>\n" if $self->_is_debug; |
93
|
21
|
|
|
|
|
255
|
return 1; |
94
|
|
|
|
|
|
|
} |
95
|
35
|
|
|
|
|
143
|
return; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub strip_stopwords { |
99
|
18
|
|
|
18
|
1
|
99
|
my ($self, $text) = @_; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Count the things in $text |
102
|
18
|
100
|
|
|
|
383
|
print "Content: <", $text, ">\n" if $self->_is_debug; |
103
|
|
|
|
|
|
|
|
104
|
18
|
|
|
|
|
537
|
my @words = grep { length($_) < MAXWORDLENGTH } split " ", $text; |
|
71
|
|
|
|
|
154
|
|
105
|
|
|
|
|
|
|
|
106
|
18
|
|
|
|
|
44
|
for ( @words ) { |
107
|
71
|
100
|
|
|
|
1463
|
print "Parsing word: <$_>\n" if $self->_is_debug; |
108
|
|
|
|
|
|
|
# some spellcheckers can't cope with anything but Latin1 |
109
|
71
|
100
|
100
|
|
|
1776
|
$_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/; |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# strip leading punctuation |
112
|
71
|
|
|
|
|
414
|
s/^[\(\[\{\'\"\:\;\,\?\!\.]+//; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# keep everything up to trailing punctuation, not counting |
115
|
|
|
|
|
|
|
# periods (for abbreviations like "Ph.D."), single-quotes |
116
|
|
|
|
|
|
|
# (for contractions like "don't") or colons (for package |
117
|
|
|
|
|
|
|
# names like "Foo::Bar") |
118
|
71
|
|
|
|
|
243
|
s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/; |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# strip trailing single-quote, periods or colons; after this |
121
|
|
|
|
|
|
|
# we have a word that could have internal periods or quotes |
122
|
71
|
|
|
|
|
146
|
s/[\.\'\:]+$//; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# strip possessive |
125
|
71
|
|
|
|
|
115
|
s/'s$//i; |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# zero out variable names or things with internal symbols, |
128
|
|
|
|
|
|
|
# since those are probably code expressions outside a C<> |
129
|
71
|
|
|
|
|
125
|
my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/; |
130
|
71
|
|
|
|
|
102
|
my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/; |
131
|
71
|
100
|
100
|
|
|
207
|
$_ = '' if $is_sigil || $is_strange; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# stop if there are no "word" characters left; if it's just |
134
|
|
|
|
|
|
|
# punctuation that we didn't happen to strip or it's weird glyphs, |
135
|
|
|
|
|
|
|
# the spellchecker won't do any good anyway |
136
|
71
|
100
|
|
|
|
148
|
next unless /\w/; |
137
|
|
|
|
|
|
|
|
138
|
51
|
100
|
|
|
|
684
|
print " Checking as <$_>\n" if $self->_is_debug; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# replace it with any stopword or stopword parts stripped |
141
|
51
|
|
|
|
|
580
|
$_ = $self->_strip_a_word($_); |
142
|
|
|
|
|
|
|
|
143
|
51
|
100
|
100
|
|
|
591
|
print " Keeping as <$_>\n" if $_ && $self->_is_debug; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
18
|
50
|
|
|
|
97
|
return join(" ", grep { defined && length } @words ); |
|
71
|
|
|
|
|
214
|
|
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _strip_a_word { |
150
|
51
|
|
|
51
|
|
89
|
my ($self, $word) = @_; |
151
|
51
|
|
|
|
|
55
|
my $remainder; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# try word as-is, including possible hyphenation vs stoplist |
154
|
51
|
100
|
|
|
|
98
|
if ($self->is_stopword($word) ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
155
|
19
|
|
|
|
|
29
|
$remainder = ''; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
# internal period could be abbreviations, so check with |
158
|
|
|
|
|
|
|
# trailing period restored and drop or keep on that basis |
159
|
|
|
|
|
|
|
elsif ( index($word, '.') >= 0 ) { |
160
|
2
|
|
|
|
|
6
|
my $abbr = "$word."; |
161
|
2
|
100
|
|
|
|
6
|
$remainder = $self->is_stopword($abbr) ? '' : $abbr; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
# check individual parts of hyphenated word, keep whatever isn't a |
164
|
|
|
|
|
|
|
# stopword as individual words |
165
|
|
|
|
|
|
|
elsif ( index($word, '-') >= 0 ) { |
166
|
1
|
|
|
|
|
2
|
my @keep; |
167
|
1
|
|
|
|
|
5
|
for my $part ( split /-/, $word ) { |
168
|
3
|
100
|
|
|
|
7
|
push @keep, $part if ! $self->is_stopword( $part ); |
169
|
|
|
|
|
|
|
} |
170
|
1
|
50
|
|
|
|
18
|
$remainder = join(" ", @keep) if @keep; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
# otherwise, we just keep it |
173
|
|
|
|
|
|
|
else { |
174
|
29
|
|
|
|
|
40
|
$remainder = $word; |
175
|
|
|
|
|
|
|
} |
176
|
51
|
|
|
|
|
81
|
return $remainder; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
1; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
__END__ |