File Coverage

blib/lib/Pod/Wordlist.pm

Criterion	Covered	Total	%
statement	96	97	98.9
branch	39	44	88.6
condition	13	15	86.6
subroutine	14	14	100.0
pod	3	3	100.0
total	165	173	95.3

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Wordlist;
2	7			7		223381	use 5.008;
	7					20
3	7			7		33	use strict;
	7					13
	7					158
4	7			7		25	use warnings;
	7					8
	7					547
5
6							our $VERSION = '1.27';
7
8	7			7		5612	use Lingua::EN::Inflect 'PL';
	7					156246
	7					1149
9	7			7		86	use File::Spec ();
	7					13
	7					206
10							use constant {
11	7					777	_MAXWORDLENGTH => 50,
12	7			7		46	};
	7					10
13
14							use Class::Tiny {
15	7					58	wordlist => \&_copy_wordlist,
16							_is_debug => 0,
17							no_wide_chars => 0,
18	7			7		4267	};
	7					13338
19
20							our %Wordlist; ## no critic ( Variables::ProhibitPackageVars )
21
22							sub _copy_wordlist {
23	18			18		147	my %copy;
24
25							# %Wordlist can be accessed externally, and users will often add terms in
26							# encoded form
27	18					11174	for my $word ( keys %Wordlist ) {
28	43758					38324	my $decoded_word = $word;
29							# if it was already decoded, this should do nothing
30	43758					53916	utf8::decode($decoded_word);
31	43758					63301	$copy{$decoded_word} = 1;
32							}
33
34	18					1844	return \%copy;
35							}
36
37							BEGIN {
38	7			7		5317	my $file;
39
40							# try to find wordlist in non-installed dist
41	7					158	my ($d, $p) = File::Spec->splitpath(__FILE__);
42	7					163	$p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share');
43	7					107	my $full_path = File::Spec->catpath($d, $p, 'wordlist');
44	7	50	33			209	if ($full_path && -e $full_path) {
45	0					0	$file = $full_path;
46							}
47
48	7	50				28	if ( not defined $file ) {
49	7					3350	require File::ShareDir;
50	7					189029	$file = File::ShareDir::dist_file('Pod-Spell', 'wordlist');
51							}
52
53	7	50		7		1593	open my $fh, '<:encoding(UTF-8)', $file
	7					3968
	7					100
	7					34
54							or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping)
55	7					66345	while ( defined( my $line = readline $fh ) ) {
56	8540					1748686	chomp $line;
57	8540					17453	$Wordlist{$line} = 1;
58	8540					14220	$Wordlist{PL($line)} = 1;
59							}
60	7					8912	close $fh;
61							}
62
63							sub learn_stopwords {
64	21			21	1	295483	my ( $self, $text ) = @_;
65	21					582	my $stopwords = $self->wordlist;
66
67	21					266	while ( $text =~ m<(\S+)>g ) {
68	45					492	my $word = $1;
69	45					148	utf8::decode($word);
70	45	100				103	if ( $word =~ m/^!(.+)/s ) {
71							# "!word" deletes from the stopword list
72	2					4	my $negation = $1;
73							# different $1 from above
74	2					4	delete $stopwords->{$negation};
75	2					6	delete $stopwords->{PL($negation)};
76	2	100				423	print "Unlearning stopword <$negation>\n" if $self->_is_debug;
77							}
78							else {
79	43					94	$word =~ s{'s$}{}; # we strip 's when checking so strip here, too
80	43					96	$stopwords->{$word} = 1;
81	43					170	$stopwords->{PL($word)} = 1;
82	43	100				12464	print "Learning stopword <$word>\n" if $self->_is_debug;
83							}
84							}
85	21					242	return;
86							}
87
88							sub is_stopword {
89	161			161	1	184	my ($self, $word) = @_;
90	161					1692	my $stopwords = $self->wordlist;
91	161	100	100			761	if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) {
92	71	100				718	print " Rejecting <$word>\n" if $self->_is_debug;
93	71					263	return 1;
94							}
95	90					311	return;
96							}
97
98							sub strip_stopwords {
99	38			38	1	228	my ($self, $text) = @_;
100
101							# Count the things in $text
102	38	100				844	print "Content: <", $text, ">\n" if $self->_is_debug;
103
104	38					395	my @words = grep { length($_) < _MAXWORDLENGTH } split " ", $text;
	211					391
105
106	38					108	for ( @words ) {
107	211	100				2577	print "Parsing word: <$_>\n" if $self->_is_debug;
108							# some spellcheckers can't cope with anything but Latin1
109	211	100	100			2714	$_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/;
110
111							# strip leading punctuation
112	211					885	s/^[\(\[\{\'\"\:\;\,\?\!\.]+//;
113
114							# keep everything up to trailing punctuation, not counting
115							# periods (for abbreviations like "Ph.D."), single-quotes
116							# (for contractions like "don't") or colons (for package
117							# names like "Foo::Bar")
118	211					513	s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/;
119
120							# strip trailing single-quote, periods or colons; after this
121							# we have a word that could have internal periods or quotes
122	211					320	s/[\.\'\:]+$//;
123
124							# strip possessive
125	211					217	s/'s$//i;
126
127							# zero out variable names or things with internal symbols,
128							# since those are probably code expressions outside a C<>
129	211					270	my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/;
130	211					251	my $is_strange = /[\%\^\&\#\$\@\_\<\>\[\]\{\}\\\*\:\+\/\=\\|\`\~]/;
131	211	100	100			473	$_ = '' if $is_sigil \|\| $is_strange;
132
133							# stop if there are no "word" characters left; if it's just
134							# punctuation that we didn't happen to strip or it's weird glyphs,
135							# the spellchecker won't do any good anyway
136	211	100				366	next unless /\w/;
137
138	156	100				1683	print " Checking as <$_>\n" if $self->_is_debug;
139
140							# replace it with any stopword or stopword parts stripped
141	156					558	$_ = $self->_strip_a_word($_);
142
143	156	100	100			1086	print " Keeping as <$_>\n" if $_ && $self->_is_debug;
144							}
145
146	38	50				128	return join(" ", grep { defined && length } @words );
	211					512
147							}
148
149							sub _strip_a_word {
150	156			156		198	my ($self, $word) = @_;
151	156					145	my $remainder;
152
153							# try word as-is, including possible hyphenation vs stoplist
154	156	100				237	if ($self->is_stopword($word) ) {
		100
		100
155	69					71	$remainder = '';
156							}
157							# internal period could be abbreviations, so check with
158							# trailing period restored and drop or keep on that basis
159							elsif ( index($word, '.') >= 0 ) {
160	2					5	my $abbr = "$word.";
161	2	100				7	$remainder = $self->is_stopword($abbr) ? '' : $abbr;
162							}
163							# check individual parts of hyphenated word, keep whatever isn't a
164							# stopword as individual words
165							elsif ( index($word, '-') >= 0 ) {
166	1					2	my @keep;
167	1					3	for my $part ( split /-/, $word ) {
168	3	100				14	push @keep, $part if ! $self->is_stopword( $part );
169							}
170	1	50				4	$remainder = join(" ", @keep) if @keep;
171							}
172							# otherwise, we just keep it
173							else {
174	84					133	$remainder = $word;
175							}
176	156					232	return $remainder;
177							}
178
179							1;
180
181							__END__
182
183							=pod
184
185							=encoding UTF-8
186
187							=for :stopwords Sean M. Burke Caleb Cushing Olivier Mengué
188
189							=head1 NAME
190
191							Pod::Wordlist - English words that come up in Perl documentation
192
193							=head1 VERSION
194
195							version 1.27
196
197							=head1 DESCRIPTION
198
199							Pod::Wordlist is used by L<Pod::Spell\|Pod::Spell>, providing a set of words
200							that are English jargon words that come up in Perl documentation, but which are
201							not to be found in general English lexicons. (For example: autovivify,
202							backreference, chroot, stringify, wantarray.)
203
204							You can also use this wordlist with your word processor by just
205							pasting C<share/wordlist>'s content into your wordprocessor, deleting
206							the leading Perl code so that only the wordlist remains, and then
207							spellchecking this resulting list and adding every word in it to your
208							private lexicon.
209
210							=head1 METHODS
211
212							=head2 learn_stopwords
213
214							$wordlist->learn_stopwords( $text );
215
216							Modifies the stopword list based on a text block. See the rules
217							for L<adding stopwords\|Pod::Spell/ADDING STOPWORDS> for details.
218
219							=head2 is_stopword
220
221							if ( $wordlist->is_stopword( $word ) ) { ... }
222
223							Returns true if the word is found in the stopword list.
224
225							=head2 strip_stopwords
226
227							my $out = $wordlist->strip_stopwords( $text );
228
229							Returns a string with space separated words from the original
230							text with stopwords removed.
231
232							=head1 ATTRIBUTES
233
234							=head2 wordlist
235
236							ref $self->wordlist eq 'HASH'; # true
237
238							This is the instance of the wordlist
239
240							=head2 no_wide_chars
241
242							If true, words with characters outside the Latin-1 range C<0x00> to C<0xFF> will
243							be stripped like stopwords.
244
245							=head1 WORDLIST
246
247							Note that the scope of this file is only English, specifically American
248							English. (But you may find in useful to incorporate into your own
249							lexicons, even if they are for other dialects/languages.)
250
251							remove any q{'s} before adding to the list.
252
253							The list should be sorted and uniqued. The following will work (with GNU
254							Coreutils ).
255
256							sort share/wordlist -u > /tmp/sorted && mv /tmp/sorted share/wordlist
257
258							=head1 BUGS
259
260							Please report any bugs or feature requests on the bugtracker website
261							L<https://rt.cpan.org/Public/Dist/Display.html?Name=Pod-Spell> or by email
262							to L<bug-Pod-Spell@rt.cpan.org\|mailto:bug-Pod-Spell@rt.cpan.org>.
263
264							When submitting a bug or request, please include a test-file or a
265							patch to an existing test-file that illustrates the bug or desired
266							feature.
267
268							=head1 AUTHORS
269
270							=over 4
271
272							=item *
273
274							Sean M. Burke <sburke@cpan.org>
275
276							=item *
277
278							Caleb Cushing <xenoterracide@gmail.com>
279
280							=back
281
282							=head1 COPYRIGHT AND LICENSE
283
284							This software is Copyright (c) 2024 by Olivier Mengué.
285
286							This is free software, licensed under:
287
288							The Artistic License 2.0 (GPL Compatible)
289
290							=cut