File Coverage

lib/Lingua/Ispell.pm

Criterion	Covered	Total	%
statement	18	121	14.8
branch	0	26	0.0
condition	0	4	0.0
subroutine	6	24	25.0
pod	11	12	91.6
total	35	187	18.7

line	stmt	bran	cond	sub	pod	time	code
1
2							#(@) Lingua::Ispell.pm - a module encapsulating access to the Ispell program.
3
4							=head1 NAME
5
6							Lingua::Ispell.pm - a module encapsulating access to the Ispell program.
7
8							Note: this module was previously known as Text::Ispell; if you have
9							Text::Ispell installed on your system, it is now obsolete and should be
10							replaced by Lingua::Ispell.
11
12							=head1 NOTA BENE
13
14							ispell, when reporting on misspelled words, indicates the string it was unable
15							to verify, as well as its starting offset in the input line.
16							No such information is returned for words which are deemed to be correctly spelled.
17							For example, in a line like "Can't buy a thrill", ispell simply reports that the
18							line contained four correctly spelled words.
19
20							Lingua::Ispell would like to identify which substrings of the input
21							line are words -- correctly spelled or otherwise. It used to attempt to split
22							the input line into words according to the same rules ispell uses; but that has
23							proven to be very difficult, resulting in both slow and error-prone code.
24
25							=head2 Consequences
26
27							Lingua::Ispell now operates only in "terse" mode.
28							In this mode, only misspelled words are reported.
29							Words which ispell verifies as correctly spelled are silently accepted.
30
31							In the report structures returned by C<spellcheck()>, the C<'term'> member
32							is now always identical to the C<'original'> member; of the two, you should
33							probably use the C<'term'> member. (Also consider the C<'offset'> member.)
34							ispell does not report this information for correctly spelled words; if at
35							some point in the future this capability is added to ispell, Lingua::Ispell
36							will be updated to take advantage of it.
37
38							Use of the C<$word_chars> variable has been removed; setting it no longer
39							has any effect.
40
41							C<terse_mode()> now does nothing.
42
43							=cut
44
45
46							package Lingua::Ispell;
47	1			1		1909	use Exporter;
	1					1
	1					69
48							@Lingua::Ispell::ISA = qw(Exporter);
49							@Lingua::Ispell::EXPORT_OK = qw(
50							spellcheck
51							add_word
52							add_word_lc
53							accept_word
54							parse_according_to
55							set_params_by_language
56							save_dictionary
57							allow_compounds
58							make_wild_guesses
59							use_dictionary
60							use_personal_dictionary
61							);
62							%Lingua::Ispell::EXPORT_TAGS = (
63							'all' => \@Lingua::Ispell::EXPORT_OK,
64							);
65
66
67	1			1		750	use FileHandle;
	1					11148
	1					5
68	1			1		1222	use IPC::Open2;
	1					3705
	1					47
69	1			1		7	use Carp;
	1					1
	1					41
70
71	1			1		4	use strict;
	1					1
	1					26
72
73	1			1		4	use vars qw( $VERSION );
	1					1
	1					1539
74							$VERSION = '0.07';
75
76
77							=head1 SYNOPSIS
78
79							# Brief:
80							use Lingua::Ispell;
81							Lingua::Ispell::spellcheck( $string );
82							# or
83							use Lingua::Ispell qw( spellcheck ); # import the function
84							spellcheck( $string );
85
86							# Useful:
87							use Lingua::Ispell qw( :all ); # import all symbols
88							for my $r ( spellcheck( "hello hacking perl shrdlu 42" ) ) {
89							print "$r->{'type'}: $r->{'term'}\n";
90							}
91
92
93							=head1 DESCRIPTION
94
95							Lingua::Ispell::spellcheck() takes one argument. It must be a
96							string, and it should contain only printable characters.
97							One allowable exception is a terminal newline, which will be
98							chomped off anyway. The line is fed to a coprocess running
99							ispell for analysis. ispell parses the line into "terms"
100							according to the language-specific rules in effect.
101
102							The result of ispell's analysis of each term is a categorization
103							of the term into one of six types: ok, compound, root, miss, none,
104							and guess. Some of these carry additional information.
105							The first three types are "correctly" spelled terms, and the last
106							three are for "incorrectly" spelled terms.
107
108							Lingua::Ispell::spellcheck returns a list of objects, each
109							corresponding to a term in the spellchecked string. Each object
110							is a hash (hash-ref) with at least two entries: 'term' and 'type'.
111							The former contains the term ispell is reporting on, and the latter
112							is ispell's determination of that term's type (see above).
113							For types 'ok' and 'none', that is all the information there is.
114							For the type 'root', an additional hash entry is present: 'root'.
115							Its value is the word which ispell identified in the dictionary
116							as being the likely root of the current term.
117							For the type 'miss', an additional hash entry is present: 'misses'.
118							Its value is a ref to an array of words which ispell
119							identified as being "near-misses" of the current term, when
120							scanning the dictionary.
121
122							=head2 NOTE
123
124							As mentioned above, C<spellcheck()> currently only reports on misspelled terms.
125
126							=head2 EXAMPLE
127
128							use Lingua::Ispell qw( spellcheck );
129							Lingua::Ispell::allow_compounds(1);
130							for my $r ( spellcheck( "hello hacking perl salmoning fruithammer shrdlu 42" ) ) {
131							if ( $r->{'type'} eq 'ok' ) {
132							# as in the case of 'hello'
133							print "'$r->{'term'}' was found in the dictionary.\n";
134							}
135							elsif ( $r->{'type'} eq 'root' ) {
136							# as in the case of 'hacking'
137							print "'$r->{'term'}' can be formed from root '$r->{'root'}'\n";
138							}
139							elsif ( $r->{'type'} eq 'miss' ) {
140							# as in the case of 'perl'
141							print "'$r->{'term'}' was not found in the dictionary;\n";
142							print "Near misses: @{$r->{'misses'}}\n";
143							}
144							elsif ( $r->{'type'} eq 'guess' ) {
145							# as in the case of 'salmoning'
146							print "'$r->{'term'}' was not found in the dictionary;\n";
147							print "Root/affix Guesses: @{$r->{'guesses'}}\n";
148							}
149							elsif ( $r->{'type'} eq 'compound' ) {
150							# as in the case of 'fruithammer'
151							print "'$r->{'term'}' is a valid compound word.\n";
152							}
153							elsif ( $r->{'type'} eq 'none' ) {
154							# as in the case of 'shrdlu'
155							print "No match for term '$r->{'term'}'\n";
156							}
157							# and numbers are skipped entirely, as in the case of 42.
158							}
159
160
161							=head2 ERRORS
162
163							C<Lingua::Ispell::spellcheck()> starts the ispell coprocess
164							if the coprocess seems not to exist. Ordinarily this is simply
165							the first time it's called.
166
167							ispell is spawned via the C<IPC::Open2::open2()> function, which
168							throws an exception (i.e. dies) if the spawn fails. The caller
169							should be prepared to catch this exception -- unless, of course,
170							the default behavior of die is acceptable.
171
172							=head2 Nota Bene
173
174							The full location of the ispell executable is stored
175							in the variable C<$Lingua::Ispell::path>. The default
176							value is F</usr/local/bin/ispell>.
177							If your ispell executable has some name other than
178							this, then you must set C<$Lingua::Ispell::path> accordingly
179							before you call C<spellcheck()> (or any other function
180							in the module) for the first time!
181
182							=cut
183
184
185							sub _init {
186	0	0		0			unless ( $Lingua::Ispell::pid ) {
187	0						my @options;
188	0						while ( my( $k, $ar ) = each %Lingua::Ispell::options ) {
189	0	0					if ( @$ar ) {
190	0						for ( @$ar ) {
191							#push @options, "$k $_";
192	0						push @options, $k, $_;
193							}
194							}
195							else {
196	0						push @options, $k;
197							}
198							}
199
200	0		0				$Lingua::Ispell::path ||= '/usr/local/bin/ispell';
201
202	0						$Lingua::Ispell::pid = undef; # so that it's still undef if open2 fails.
203	0						$Lingua::Ispell::pid = open2( # if open2 fails, it throws, but doesn't return.
204							*Reader,
205							*Writer,
206							$Lingua::Ispell::path,
207							'-a', '-S',
208							@options,
209							);
210
211	0						my $hdr = scalar(<Reader>);
212
213							# must be the same as ispell:
214	0						$Lingua::Ispell::terse = 0;
215							{
216							# set up permanent terse mode:
217	0						local $/ = "\n";
	0
218	0						local $\ = '';
219	0						print Writer "!\n";
220	0						$Lingua::Ispell::terse = 1;
221							}
222							}
223
224							$Lingua::Ispell::pid
225	0						}
226
227							sub _exit {
228	0	0		0			if ( $Lingua::Ispell::pid ) {
229	0						close Reader;
230	0						close Writer;
231	0						kill $Lingua::Ispell::pid;
232	0						$Lingua::Ispell::pid = undef;
233							}
234							}
235
236
237							sub spellcheck {
238	0	0		0	0		_init() or return(); # caller should really catch the exception from a failed open2.
239	0						my $line = shift;
240	0						local $/ = "\n"; local $\ = '';
	0
241	0						chomp $line;
242	0						$line =~ s/\r//g; # kill the hate
243	0	0					$line =~ /\n/ and croak "newlines not allowed in arguments to Lingua::Ispell::spellcheck!";
244	0						print Writer "^$line\n";
245	0						my @commentary;
246	0						local $_;
247	0						while ( <Reader> ) {
248	0						chomp;
249	0	0					last unless $_ gt '';
250	0						push @commentary, $_;
251							}
252
253	0						my %types = (
254							# correct words:
255							'*' => 'ok',
256							'-' => 'compound',
257							'+' => 'root',
258
259							# misspelled words:
260							'#' => 'none',
261							'&' => 'miss',
262							'?' => 'guess',
263							);
264							# and there's one more type, unknown, which is
265							# used when the first char is not in the above set.
266
267							my %modisp = (
268							'root' => sub {
269	0			0			my $h = shift;
270	0						$h->{'root'} = shift;
271							},
272							'none' => sub {
273	0			0			my $h = shift;
274	0						$h->{'original'} = shift;
275	0						$h->{'offset'} = shift;
276							},
277							'miss' => sub { # also used for 'guess'
278	0			0			my $h = shift;
279	0						$h->{'original'} = shift;
280	0						$h->{'count'} = shift; # count will always be 0, when $c eq '?'.
281	0						$h->{'offset'} = shift;
282
283	0						my @misses = splice @_, 0, $h->{'count'};
284	0						my @guesses = @_;
285
286	0						$h->{'misses'} = \@misses;
287	0						$h->{'guesses'} = \@guesses;
288							},
289	0						);
290	0						$modisp{'guess'} = $modisp{'miss'}; # same handler.
291
292	0						my @results;
293	0						for my $i ( 0 .. $#commentary ) {
294	0						my %h = (
295							'commentary' => $commentary[$i],
296							);
297
298	0						my @tail; # will get stuff after a colon, if any.
299
300	0	0					if ( $h{'commentary'} =~ s/:\s+(.*)// ) {
301	0						my $tail = $1;
302	0						@tail = split /, /, $tail;
303							}
304
305	0						my( $c, @args ) = split ' ', $h{'commentary'};
306
307	0		0				my $type = $types{$c} || 'unknown';
308
309	0	0					$modisp{$type} and $modisp{$type}->( \%h, @args, @tail );
310
311	0						$h{'type'} = $type;
312	0						$h{'term'} = $h{'original'};
313
314	0						push @results, \%h;
315							}
316
317							@results
318	0						}
319
320							sub _send_command($$) {
321	0			0			my( $cmd, $arg ) = @_;
322	0	0					defined $arg or $arg = '';
323	0						local $/ = "\n"; local $\ = '';
	0
324	0						chomp $arg;
325	0						_init();
326	0						print Writer "$cmd$arg\n";
327							}
328
329
330							=head1 AUX FUNCTIONS
331
332							=head2 add_word(word)
333
334							Adds a word to the personal dictionary. Be careful of capitalization.
335							If you want the word to be added "case-insensitively", you should
336							call C<add_word_lc()>.
337
338							=cut
339
340							sub add_word($) {
341	0			0	1		_send_command "\*", $_[0];
342							}
343
344							=head2 add_word_lc(word)
345
346							Adds a word to the personal dictionary, in lower-case form.
347							This allows ispell to match it in a case-insensitive manner.
348
349							=cut
350
351							sub add_word_lc($) {
352	0			0	1		_send_command "\&", $_[0];
353							}
354
355							=head2 accept_word(word)
356
357							Similar to adding a word to the dictionary, in that it causes
358							ispell to accept the word as valid, but it does not actually
359							add it to the dictionary. Presumably the effects of this only
360							last for the current ispell session, which will mysteriously
361							end if any of the coprocess-restarting functions are called...
362
363							=cut
364
365							sub accept_word($) {
366	0			0	1		_send_command "\@", $_[0];
367							}
368
369							=head2 parse_according_to(formatter)
370
371							Causes ispell to parse subsequent input lines according to
372							the specified formatter. As of ispell v. 3.1.20, only
373							'tex' and 'nroff' are supported.
374
375							=cut
376
377							sub parse_according_to($) {
378							# must be one of 'tex' or 'nroff'
379	0			0	1		_send_command "\-", $_[0];
380							}
381
382							=head2 set_params_by_language(language)
383
384							Causes ispell to set its internal operational parameters
385							according to the given language. Legal arguments to this
386							function, and its effects, are currently unknown by the
387							author of Lingua::Ispell.
388
389							=cut
390
391							sub set_params_by_language($) {
392	0			0	1		_send_command "\~", $_[0];
393							}
394
395							=head2 save_dictionary()
396
397							Causes ispell to save the current state of the dictionary
398							to its disk file. Presumably ispell would ordinarily
399							only do this upon exit.
400
401							=cut
402
403							sub save_dictionary() {
404	0			0	1		_send_command "\#", '';
405							}
406
407							=head2 terse_mode(bool:terse)
408
409							I<This function has been disabled!
410							Lingua::Ispell now always operates in terse mode.>
411
412							In terse mode, ispell will not produce reports for "correct" words.
413							This means that the calling program will not receive results of the
414							types 'ok', 'root', and 'compound'.
415
416							=cut
417
418	0			0	1		sub terse_mode($) {
419							# my $bool = shift;
420							# my $cmd = $bool ? "\!" : "\%";
421							# _send_command $cmd, '';
422							# $Lingua::Ispell::terse = $bool;
423							}
424
425
426							=head1 FUNCTIONS THAT RESTART ISPELL
427
428							The following functions cause the current ispell coprocess, if any, to terminate.
429							This means that all the changes to the state of ispell made by the above
430							functions will be lost, and their respective values reset to their defaults.
431							The only function above whose effect is persistent is C<save_dictionary()>.
432
433							Perhaps in the future we will figure out a good way to make this
434							state information carry over from one instantiation of the coprocess
435							to the next.
436
437							=head2 allow_compounds(bool)
438
439							When this value is set to True, compound words are
440							accepted as legal -- as long as both words are found in the
441							dictionary; more than two words are always illegal.
442							When this value is set to False, run-together words are
443							considered spelling errors.
444
445							The default value of this setting is dictionary-dependent,
446							so the caller should set it explicitly if it really matters.
447
448							=cut
449
450							sub allow_compounds {
451	0			0	1		my $bool = shift;
452	0						_exit();
453	0	0					if ( $bool ) {
454	0						$Lingua::Ispell::options{'-C'} = [];
455	0						delete $Lingua::Ispell::options{'-B'};
456							}
457							else {
458	0						$Lingua::Ispell::options{'-B'} = [];
459	0						delete $Lingua::Ispell::options{'-C'};
460							}
461							}
462
463							=head2 make_wild_guesses(bool)
464
465							This setting controls when ispell makes "wild" guesses.
466
467							If False, ispell only makes "sane" guesses, i.e. possible
468							root/affix combinations that match the current dictionary;
469							only if it can find none will it make "wild" guesses,
470							which don't match the dictionary, and might in fact
471							be illegal words.
472
473							If True, wild guesses are always made, along with any "sane" guesses.
474							This feature can be useful if the dictionary has a limited word list,
475							or a word list with few suffixes.
476
477							The default value of this setting is dictionary-dependent,
478							so the caller should set it explicitly if it really matters.
479
480							=cut
481
482							sub make_wild_guesses {
483	0			0	1		my $bool = shift;
484	0						_exit();
485	0	0					if ( $bool ) {
486	0						$Lingua::Ispell::options{'-m'} = [];
487	0						delete $Lingua::Ispell::options{'-P'};
488							}
489							else {
490	0						$Lingua::Ispell::options{'-P'} = [];
491	0						delete $Lingua::Ispell::options{'-m'};
492							}
493							}
494
495							=head2 use_dictionary([dictionary])
496
497							Specifies what dictionary to use instead of the
498							default. Dictionary names are actually file
499							names, and are searched for according to the
500							following rule: if the name does not contain a slash,
501							it is looked for in the directory containing the
502							default dictionary, typically /usr/local/lib.
503							Otherwise, it is used as is: if it does not begin
504							with a slash, it is construed from the current
505							directory.
506
507							If no argument is given, the default dictionary will be used.
508
509							=cut
510
511							sub use_dictionary {
512	0			0	1		_exit();
513	0	0					if ( @_ ) {
514	0						$Lingua::Ispell::options{'-d'} = [ @_ ];
515							}
516							else {
517	0						delete $Lingua::Ispell::options{'-d'};
518							}
519							}
520
521							=head2 use_personal_dictionary([dictionary])
522
523							Specifies what personal dictionary to use
524							instead of the default.
525
526							Dictionary names are actually file names, and are
527							searched for according to the following rule:
528							if the name begins with a slash, it is used as
529							is (i.e. it is an absolute path name). Otherwise,
530							it is construed as relative to the user's home
531							directory ($HOME).
532
533							If no argument is given, the default personal
534							dictionary will be used.
535
536							=cut
537
538							sub use_personal_dictionary {
539	0			0	1		_exit();
540	0	0					if ( @_ ) {
541	0						$Lingua::Ispell::options{'-p'} = [ @_ ];
542							}
543							else {
544	0						delete $Lingua::Ispell::options{'-p'};
545							}
546							}
547
548
549
550							1;
551
552
553							=head1 FUTURE ENHANCEMENTS
554
555							ispell options:
556
557							-w chars
558							Specify additional characters that can be part of a word.
559
560							=head1 DEPENDENCIES
561
562							Lingua::Ispell uses the external program ispell, which is
563							the "International Ispell", available at
564
565							http://fmg-www.cs.ucla.edu/geoff/ispell.html
566
567							as well as various archives and mirrors, such as
568
569							ftp://ftp.math.orst.edu/pub/ispell-3.1/
570
571							This is a very popular program, and may already be
572							installed on your system.
573
574							Lingua::Ispell also uses the standard perl modules FileHandle,
575							IPC::Open2, and Carp.
576
577							=head1 AUTHOR
578
579							jdporter@min.net (John Porter)
580
581							=head1 COPYRIGHT
582
583							This module is free software; you may redistribute it and/or
584							modify it under the same terms as Perl itself.
585
586							=cut
587