File Coverage

blib/lib/Lingua/EN/CommonMistakes.pm

Criterion	Covered	Total	%
statement	63	66	95.4
branch	19	22	86.3
condition	2	3	66.6
subroutine	9	9	100.0
pod			n/a
total	93	100	93.0

line	stmt	bran	cond	sub	time	code
1						package Lingua::EN::CommonMistakes;
2
3	2			2	21023	use 5.006;
	2				8
	2				73
4	2			2	18	use strict;
	2				4
	2				69
5	2			2	9	use warnings FATAL => 'all';
	2				8
	2				85
6	2			2	11	use warnings::register;
	2				3
	2				283
7	2			2	11	use Carp;
	2				3
	2				1336
8
9						our $VERSION = 20130425;
10
11						my %MISTAKES;
12
13						# reads data from __DATA__ section into %MISTAKES
14						sub _read_mistakes {
15	2			2	4	my $in_tag = ':common';
16
17	2				10	while ( my $line = <DATA> ) {
18	4406				4693	chomp $line;
19	4406				5341	$line =~ s{#.*\z}{};
20	4406				7219	$line =~ s{\s+\z}{};
21	4406				5973	$line =~ s{\A\s+}{};
22	4406				8046	$line =~ s{ {2,}}{ };
23	4406	100			7101	next unless $line;
24
25	4400	100			7900	if ( $line =~ m{\A:[^\s]+\z} ) {
26	6				10	$in_tag = $line;
27	6				17	next;
28						}
29
30	4394				10874	my ( $word, $correction ) = split( /\s/, $line, 2 );
31	4394				24630	$MISTAKES{$in_tag}{$word} = $correction;
32						}
33	2				77	close(Lingua::EN::CommonMistakes::DATA);
34
35	2				7	return;
36						}
37
38						sub import {
39	7			7	4853	my ( $package, @args ) = @_;
40	7				13	my @out_name;
41	7				17	my %tags = map { $_ => 1 } qw(:common :punct);
	14				49
42	7				17	foreach my $arg (@args) {
43	10	100			35	if ( substr( $arg, 0, 1 ) eq '%' ) {
		50
44	5				17	push @out_name, substr( $arg, 1 );
45						}
46						elsif ( substr( $arg, 0, 1 ) eq ':' ) {
47	5	50			26	if ($arg eq ':no-defaults') {
		100
48	0				0	%tags = ();
49						} elsif ($arg =~ m{\A:no-(.+)\z}) {
50	1				8	delete $tags{ ":$1" };
51						} else {
52	4				10	$tags{$arg}++;
53						}
54						}
55						else {
56	0				0	croak __PACKAGE__ . ": import argument $arg is not understood";
57						}
58						}
59
60	7	100			25	if ( !@out_name ) {
61	2				4	push @out_name, 'MISTAKES';
62						}
63
64	7	50	66		30	if ( $tags{':american'} && $tags{':british'} ) {
65	0				0	croak __PACKAGE__ . ": can't use both :american and :british";
66						}
67
68	7	100			35	if ( !%MISTAKES ) {
69	2				5	_read_mistakes();
70						}
71
72	7				11	my %out;
73	7				26	foreach my $tag ( keys %tags ) {
74	17	100			640	if ( !$MISTAKES{$tag} ) {
75	2	100			82	if (warnings::enabled( __PACKAGE__ )) {
76	1				275	carp __PACKAGE__ . ": import argument $tag is not understood";
77						}
78						}
79						else {
80	15				947	(%out) = ( %out, %{ $MISTAKES{$tag} } );
	15				16659
81						}
82						}
83
84	7				275	my ($caller_package) = caller();
85	7				18	foreach my $out_name (@out_name) {
86	2			2	12	no strict 'refs';
	2				4
	2				64
87	2			2	11	no warnings 'once';
	2				4
	2				252
88	7				14	*{ $caller_package . '::' . $out_name } = \%out;
	7				64
89						}
90	7				224	return;
91						}
92
93						=head1 NAME
94
95						Lingua::EN::CommonMistakes - map of common English spelling errors
96
97						=head1 SYNOPSIS
98
99						use Lingua::EN::CommonMistakes qw(%MISTAKES);
100
101						foreach my $word (split /\b/, $text) {
102						if (my $correction = $MISTAKES{lc $word}) {
103						warn "Likely spelling error: $word (-> $correction)\n";
104						}
105						}
106
107						# or use a different flavor of English
108						use Lingua::EN::CommonMistakes qw(:no-punct :british %MISTAKES);
109						...
110
111						Provides a customizable map of common English spelling errors with their
112						respective corrections.
113
114						=head1 USAGE
115
116						The behavior of this package is customized at import time.
117
118						By default, importing this package will create a hash named
119						C<%MISTAKES> in the calling package, containing most corrections, but
120						not containing either American English or British English corrections.
121
122						This behavior may be customized by providing the following parameters
123						when importing:
124
125						=over
126
127						=item %I<name> [default: C<%MISTAKES>]
128
129						The map will be imported with the given name.
130
131						=item C<:common>, C<:no-common> [default: C<:common>]
132
133						If enabled, include the base set of corrections common among all
134						English variants. This is the largest set of corrections.
135
136						=item C<:american>, C<:no-american> [default: C<:no-american>]
137
138						If enabled, American English is desirable; include corrections from
139						British English to American English. For example, "colour" should be
140						replaced with "color".
141
142						=item C<:british>, C<:no-british> [default: C<:no-british>]
143
144						If enabled, British English is desirable; include corrections from
145						American English to British English. For example, "recognized" should
146						be replaced with "recognised".
147
148						=item C<:punct>, C<:no-punct> [default: C<:punct>]
149
150						If enabled, include corrections which introduce punctuation characters;
151						for example, "dont" should be replaced with "don't".
152
153						C<:no-punct> is often useful when scanning input text where
154						punctuation characters have special meaning, such as in most
155						programming languages.
156
157						=item C<:no-defaults>
158
159						If set, the corrections map only includes sets which have been
160						explicitly enabled.
161
162						=back
163
164						It's possible to C<use> the package several times if multiple mappings are
165						needed, as in the following example:
166
167						# one map for common mistakes, another for british->american only
168						use Lingua::EN::CommonMistakes qw(%MISTAKES_COMMON);
169						use Lingua::EN::CommonMistakes qw(:no-defaults :american %MISTAKES_GB_TO_US);
170
171						=head1 WHY?
172
173						One might justifiably wonder why it would make sense to use a list of
174						mistakes rather than a full dictionary when spell checking.
175
176						Spell checking typically uses a whitelist approach: all words are
177						considered incorrect unless they can be found in the whitelist
178						(dictionary). This module instead facilitates a blacklist approach:
179						words are considered correct unless they can be found in the blacklist
180						(map of mistakes).
181
182						A blacklist approach to spell-checking is often more suitable than a
183						whitelist approach when scanning text which is partly but not entirely
184						English.
185
186						Computer programs are a prime example of semi-English documents;
187						comments and identifiers may be written in English, with additional
188						restrictions (such as no punctuation characters permitted in
189						identifiers) and often contain words which are intentionally not
190						spelled correctly (abbreviations or corruptions of valid English
191						words, e.g. "int" for "integer").
192
193						Other examples include mixed language documents or documents which are
194						ostensibly English but contain a lot of domain-specific jargon
195						unlikely to be found in an English dictionary.
196
197						Despite the fact that such bodies of text are only partly English, any
198						occurrences of words in the blacklist are likely to be genuine errors.
199
200						A blacklist approach also makes sense when it is more important to
201						have a low rate of false positives than it is to find every error (for
202						example, an automated system which risks being ignored if it generates
203						too many reports of dubious value).
204
205						=head1 AUTHOR
206
207						Rohan McGovern, C
208
209						=head1 BUGS
210
211						Please view and report any bugs here:
212						L
213
214						=head1 ACKNOWLEDGEMENTS
215
216						Most of the word list has been sourced from other projects, including:
217
218						=over
219
220						=item *
221
222						I<krazy> code checker tool, written for KDE:
223						L
224
225						=item *
226
227						I<lintian> package checker tool, written for Debian:
228						L
229
230						=back
231
232						=head1 LICENSE AND COPYRIGHT
233
234						Copyright 2012 Rohan McGovern.
235
236						Incorporated word lists may be Copyright their respective authors.
237
238						This program is free software; you can redistribute it and/or modify
239						it under the terms of the GNU General Public License as published by
240						the Free Software Foundation; version 2 dated June, 1991 or at your option
241						any later version.
242
243						This program is distributed in the hope that it will be useful,
244						but WITHOUT ANY WARRANTY; without even the implied warranty of
245						MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
246						GNU General Public License for more details.
247
248						A copy of the GNU General Public License is available in the source tree;
249						if not, write to the Free Software Foundation, Inc.,
250						59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
251
252
253						=cut
254
255						1;
256
257						__DATA__