File Coverage

blib/lib/Pod/Wordlist.pm
Criterion Covered Total %
statement 96 97 98.9
branch 39 44 88.6
condition 13 15 86.6
subroutine 14 14 100.0
pod 3 3 100.0
total 165 173 95.3


line stmt bran cond sub pod time code
1             package Pod::Wordlist;
2 7     7   223381 use 5.008;
  7         20  
3 7     7   33 use strict;
  7         13  
  7         158  
4 7     7   25 use warnings;
  7         8  
  7         547  
5              
6             our $VERSION = '1.27';
7              
8 7     7   5612 use Lingua::EN::Inflect 'PL';
  7         156246  
  7         1149  
9 7     7   86 use File::Spec ();
  7         13  
  7         206  
10             use constant {
11 7         777 _MAXWORDLENGTH => 50,
12 7     7   46 };
  7         10  
13              
14             use Class::Tiny {
15 7         58 wordlist => \&_copy_wordlist,
16             _is_debug => 0,
17             no_wide_chars => 0,
18 7     7   4267 };
  7         13338  
19              
20             our %Wordlist; ## no critic ( Variables::ProhibitPackageVars )
21              
22             sub _copy_wordlist {
23 18     18   147 my %copy;
24              
25             # %Wordlist can be accessed externally, and users will often add terms in
26             # encoded form
27 18         11174 for my $word ( keys %Wordlist ) {
28 43758         38324 my $decoded_word = $word;
29             # if it was already decoded, this should do nothing
30 43758         53916 utf8::decode($decoded_word);
31 43758         63301 $copy{$decoded_word} = 1;
32             }
33              
34 18         1844 return \%copy;
35             }
36              
37             BEGIN {
38 7     7   5317 my $file;
39              
40             # try to find wordlist in non-installed dist
41 7         158 my ($d, $p) = File::Spec->splitpath(__FILE__);
42 7         163 $p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share');
43 7         107 my $full_path = File::Spec->catpath($d, $p, 'wordlist');
44 7 50 33     209 if ($full_path && -e $full_path) {
45 0         0 $file = $full_path;
46             }
47              
48 7 50       28 if ( not defined $file ) {
49 7         3350 require File::ShareDir;
50 7         189029 $file = File::ShareDir::dist_file('Pod-Spell', 'wordlist');
51             }
52              
53 7 50   7   1593 open my $fh, '<:encoding(UTF-8)', $file
  7         3968  
  7         100  
  7         34  
54             or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping)
55 7         66345 while ( defined( my $line = readline $fh ) ) {
56 8540         1748686 chomp $line;
57 8540         17453 $Wordlist{$line} = 1;
58 8540         14220 $Wordlist{PL($line)} = 1;
59             }
60 7         8912 close $fh;
61             }
62              
63             sub learn_stopwords {
64 21     21 1 295483 my ( $self, $text ) = @_;
65 21         582 my $stopwords = $self->wordlist;
66              
67 21         266 while ( $text =~ m<(\S+)>g ) {
68 45         492 my $word = $1;
69 45         148 utf8::decode($word);
70 45 100       103 if ( $word =~ m/^!(.+)/s ) {
71             # "!word" deletes from the stopword list
72 2         4 my $negation = $1;
73             # different $1 from above
74 2         4 delete $stopwords->{$negation};
75 2         6 delete $stopwords->{PL($negation)};
76 2 100       423 print "Unlearning stopword <$negation>\n" if $self->_is_debug;
77             }
78             else {
79 43         94 $word =~ s{'s$}{}; # we strip 's when checking so strip here, too
80 43         96 $stopwords->{$word} = 1;
81 43         170 $stopwords->{PL($word)} = 1;
82 43 100       12464 print "Learning stopword <$word>\n" if $self->_is_debug;
83             }
84             }
85 21         242 return;
86             }
87              
88             sub is_stopword {
89 161     161 1 184 my ($self, $word) = @_;
90 161         1692 my $stopwords = $self->wordlist;
91 161 100 100     761 if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) {
92 71 100       718 print " Rejecting <$word>\n" if $self->_is_debug;
93 71         263 return 1;
94             }
95 90         311 return;
96             }
97              
98             sub strip_stopwords {
99 38     38 1 228 my ($self, $text) = @_;
100              
101             # Count the things in $text
102 38 100       844 print "Content: <", $text, ">\n" if $self->_is_debug;
103              
104 38         395 my @words = grep { length($_) < _MAXWORDLENGTH } split " ", $text;
  211         391  
105              
106 38         108 for ( @words ) {
107 211 100       2577 print "Parsing word: <$_>\n" if $self->_is_debug;
108             # some spellcheckers can't cope with anything but Latin1
109 211 100 100     2714 $_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/;
110              
111             # strip leading punctuation
112 211         885 s/^[\(\[\{\'\"\:\;\,\?\!\.]+//;
113              
114             # keep everything up to trailing punctuation, not counting
115             # periods (for abbreviations like "Ph.D."), single-quotes
116             # (for contractions like "don't") or colons (for package
117             # names like "Foo::Bar")
118 211         513 s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/;
119              
120             # strip trailing single-quote, periods or colons; after this
121             # we have a word that could have internal periods or quotes
122 211         320 s/[\.\'\:]+$//;
123              
124             # strip possessive
125 211         217 s/'s$//i;
126              
127             # zero out variable names or things with internal symbols,
128             # since those are probably code expressions outside a C<>
129 211         270 my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/;
130 211         251 my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/;
131 211 100 100     473 $_ = '' if $is_sigil || $is_strange;
132              
133             # stop if there are no "word" characters left; if it's just
134             # punctuation that we didn't happen to strip or it's weird glyphs,
135             # the spellchecker won't do any good anyway
136 211 100       366 next unless /\w/;
137              
138 156 100       1683 print " Checking as <$_>\n" if $self->_is_debug;
139              
140             # replace it with any stopword or stopword parts stripped
141 156         558 $_ = $self->_strip_a_word($_);
142              
143 156 100 100     1086 print " Keeping as <$_>\n" if $_ && $self->_is_debug;
144             }
145              
146 38 50       128 return join(" ", grep { defined && length } @words );
  211         512  
147             }
148              
149             sub _strip_a_word {
150 156     156   198 my ($self, $word) = @_;
151 156         145 my $remainder;
152              
153             # try word as-is, including possible hyphenation vs stoplist
154 156 100       237 if ($self->is_stopword($word) ) {
    100          
    100          
155 69         71 $remainder = '';
156             }
157             # internal period could be abbreviations, so check with
158             # trailing period restored and drop or keep on that basis
159             elsif ( index($word, '.') >= 0 ) {
160 2         5 my $abbr = "$word.";
161 2 100       7 $remainder = $self->is_stopword($abbr) ? '' : $abbr;
162             }
163             # check individual parts of hyphenated word, keep whatever isn't a
164             # stopword as individual words
165             elsif ( index($word, '-') >= 0 ) {
166 1         2 my @keep;
167 1         3 for my $part ( split /-/, $word ) {
168 3 100       14 push @keep, $part if ! $self->is_stopword( $part );
169             }
170 1 50       4 $remainder = join(" ", @keep) if @keep;
171             }
172             # otherwise, we just keep it
173             else {
174 84         133 $remainder = $word;
175             }
176 156         232 return $remainder;
177             }
178              
179             1;
180              
181             __END__
182              
183             =pod
184              
185             =encoding UTF-8
186              
187             =for :stopwords Sean M. Burke Caleb Cushing Olivier Mengué
188              
189             =head1 NAME
190              
191             Pod::Wordlist - English words that come up in Perl documentation
192              
193             =head1 VERSION
194              
195             version 1.27
196              
197             =head1 DESCRIPTION
198              
199             Pod::Wordlist is used by L<Pod::Spell|Pod::Spell>, providing a set of words
200             that are English jargon words that come up in Perl documentation, but which are
201             not to be found in general English lexicons. (For example: autovivify,
202             backreference, chroot, stringify, wantarray.)
203              
204             You can also use this wordlist with your word processor by just
205             pasting C<share/wordlist>'s content into your wordprocessor, deleting
206             the leading Perl code so that only the wordlist remains, and then
207             spellchecking this resulting list and adding every word in it to your
208             private lexicon.
209              
210             =head1 METHODS
211              
212             =head2 learn_stopwords
213              
214             $wordlist->learn_stopwords( $text );
215              
216             Modifies the stopword list based on a text block. See the rules
217             for L<adding stopwords|Pod::Spell/ADDING STOPWORDS> for details.
218              
219             =head2 is_stopword
220              
221             if ( $wordlist->is_stopword( $word ) ) { ... }
222              
223             Returns true if the word is found in the stopword list.
224              
225             =head2 strip_stopwords
226              
227             my $out = $wordlist->strip_stopwords( $text );
228              
229             Returns a string with space separated words from the original
230             text with stopwords removed.
231              
232             =head1 ATTRIBUTES
233              
234             =head2 wordlist
235              
236             ref $self->wordlist eq 'HASH'; # true
237              
238             This is the instance of the wordlist
239              
240             =head2 no_wide_chars
241              
242             If true, words with characters outside the Latin-1 range C<0x00> to C<0xFF> will
243             be stripped like stopwords.
244              
245             =head1 WORDLIST
246              
247             Note that the scope of this file is only English, specifically American
248             English. (But you may find in useful to incorporate into your own
249             lexicons, even if they are for other dialects/languages.)
250              
251             remove any q{'s} before adding to the list.
252              
253             The list should be sorted and uniqued. The following will work (with GNU
254             Coreutils ).
255              
256             sort share/wordlist -u > /tmp/sorted && mv /tmp/sorted share/wordlist
257              
258             =head1 BUGS
259              
260             Please report any bugs or feature requests on the bugtracker website
261             L<https://rt.cpan.org/Public/Dist/Display.html?Name=Pod-Spell> or by email
262             to L<bug-Pod-Spell@rt.cpan.org|mailto:bug-Pod-Spell@rt.cpan.org>.
263              
264             When submitting a bug or request, please include a test-file or a
265             patch to an existing test-file that illustrates the bug or desired
266             feature.
267              
268             =head1 AUTHORS
269              
270             =over 4
271              
272             =item *
273              
274             Sean M. Burke <sburke@cpan.org>
275              
276             =item *
277              
278             Caleb Cushing <xenoterracide@gmail.com>
279              
280             =back
281              
282             =head1 COPYRIGHT AND LICENSE
283              
284             This software is Copyright (c) 2024 by Olivier Mengué.
285              
286             This is free software, licensed under:
287              
288             The Artistic License 2.0 (GPL Compatible)
289              
290             =cut