File Coverage

blib/lib/FAQ/OMatic/Words.pm

Criterion	Covered	Total	%
statement	7	71	9.8
branch	1	12	8.3
condition			n/a
subroutine	3	11	27.2
pod	0	8	0.0
total	11	102	10.7

line	stmt	bran	sub	pod	time	code
1						##############################################################################
2						# The Faq-O-Matic is Copyright 1997 by Jon Howell, all rights reserved. #
3						# #
4						# This program is free software; you can redistribute it and/or #
5						# modify it under the terms of the GNU General Public License #
6						# as published by the Free Software Foundation; either version 2 #
7						# of the License, or (at your option) any later version. #
8						# #
9						# This program is distributed in the hope that it will be useful, #
10						# but WITHOUT ANY WARRANTY; without even the implied warranty of #
11						# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
12						# GNU General Public License for more details. #
13						# #
14						# You should have received a copy of the GNU General Public License #
15						# along with this program; if not, write to the Free Software #
16						# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#
17						# #
18						# Jon Howell can be contacted at: #
19						# 6211 Sudikoff Lab, Dartmouth College #
20						# Hanover, NH 03755-3510 #
21						# jonh@cs.dartmouth.edu #
22						# #
23						# An electronic copy of the GPL is available at: #
24						# http://www.gnu.org/copyleft/gpl.html #
25						# #
26						##############################################################################
27
28	1		1		5	use strict;
	1				1
	1				30
29	1		1		733	use locale;
	1				199
	1				4
30
31						### Words.pm
32						###
33						### Support for extracting "words" from strings
34						###
35						### To change these routines to support other character sets,
36						### copy this file to a location outside of the FAQ::OMatic tree and
37						### add the following lines to the start of your cgi-bin/fom file:
38						### use lib '/Whatever/your/directory/path/is';
39						### require Words;
40						### #existing use lib line
41						### use FAQ::OMatic::Words
42						### This will override the definitions in this file.
43
44
45						package FAQ::OMatic::Words;
46
47						BEGIN {
48						# This code use Japanese environment only.
49						# see http://chasen.aist-nara.ac.jp/index.html.en
50						#
51	1	50	1		75	if (FAQ::OMatic::I18N::language() eq 'ja_JP.EUC') {
52	0					require Text::ChaSen; import Text::ChaSen;
	0
53	0					&Text::ChaSen::getopt_argv('faq-omatic', '-j', '-F', '%m ');
54						}
55						}
56
57						sub cannonical {
58	0		0	0		my $string = shift;
59
60						# convert the input string into cannonical form.
61						#
62						# The default is to strip parenthesis and apostrophies, and
63						# convert to ASCII lower case.
64						#
65						# If you use another character set (e.g. ISO-8859-?), you'll want
66						# to override to do correct lower case handling.
67						#
68						# This routine is called both when the indicies are created and
69						# when the search pattern is formed, so things will be done
70						# consistantly.
71
72						# convert
73						# timer(s) to timers
74						# timer's to timers
75						# e-mail to email
76	0					$string =~ s/[()'-]//g;
77	0					$string = lc($string); # convert to lower case
78
79	0	0				if (FAQ::OMatic::I18N::language() eq 'hu') {
80						# Accentuated lc(),
81	0					$string =~ tr/\301\311\315\323\326\325\332\334\333/\341\351\355\363\366\365\372\374\373/;
82						}
83
84	0					$string;
85						}
86
87						sub getWords {
88	0		0	0		my $string = shift;
89	0					my $encode_lang = FAQ::OMatic::I18N::language();
90						#EUC-JP case
91	0	0				return getWordsEUCJP($string) if($encode_lang eq "ja_JP.EUC");
92						# Hungarian case
93	0	0				return getWordshu($string) if($encode_lang eq 'hu');
94						#normal case
95	0					return getWordsSB($string);
96						}
97
98						sub getWordsSB {
99	0		0	0		my $string = shift;
100
101						# given a user-input string, we break it into "legal" words
102						# and return an array of them
103
104	0					$string = cannonical( $string );
105
106	0					my $wordPattern = '[\w-]'; # alphanumeric + '_' + '-'
107
108						#my @words = ($string =~ m/($wordPattern+)/gso);
109						# /gso seems to break in some circumstances. :v(
110	0					my @wordspl = split(/($wordPattern+)/, $string);
111	0					my @words=();
112	0					my $i;
113	0					for ($i=1; $i<@wordspl; $i+=2) {
114	0					push (@words, $wordspl[$i]);
115						}
116	0					return @words;
117
118						}
119
120						sub getWordsEUCJP {
121	0		0	0		require Text::ChaSen; import Text::ChaSen;
	0
122	0					require NKF; import NKF;
	0
123
124	0					my $string = shift;
125
126						# given a user-input string, we break it into "legal" words
127						# and return an array of them
128
129	0					$string = nkf('-e', $string);
130	0					$string = cannonical( $string );
131
132	0					my $wordPattern = '[\w-]'; # alphanumeric + '_' + '-'
133
134	0					my $s = &Text::ChaSen::sparse_tostr($string);
135	0					chomp $s;
136	0					my @words = split / /, $s;
137	0					return @words;
138
139						}
140
141						sub getWordshu {
142	0		0	0		my $string = shift;
143
144						# given a user-input string, we break it into "legal" words
145						# and return an array of them
146
147	0					$string = cannonical( $string );
148
149						# pattern for hungarian language:
150	0					my $wordPattern = '[\w\341\351\355\363\366\365\372\374\373-]';
151
152						#my @words = ($string =~ m/($wordPattern+)/gso);
153						# /gso seems to break in some circumstances. :v(
154	0					my @wordspl = split(/($wordPattern+)/, $string);
155	0					my @words=();
156	0					my $i;
157	0					for ($i=1; $i<@wordspl; $i+=2) {
158	0					push (@words, $wordspl[$i]);
159						}
160	0					return @words;
161						}
162
163						sub getPrefixes {
164	0		0	0		my $word = shift;
165	0					my $encode_lang = FAQ::OMatic::I18N::language();
166						#EUC-JP case
167	0	0				return getPrefixesEUCJP($word) if($encode_lang eq "ja_JP.EUC");
168						#normal case
169	0					return getPrefixesSB($word);
170						}
171
172						sub getPrefixesSB {
173	0		0	0		my $word = shift;
174
175						# given a word, return an array of prefixes which should be
176						# indexed.
177						#
178						# default routine returns all substrings
179	0					my @prefix=();
180	0					my $i = length( $word );
181	0					while( $i ) {
182	0					push @prefix, substr( $word, 0, $i-- );
183						}
184
185	0					@prefix;
186						}
187
188						## Japanese EUC-JP multibyte extended getPrefixes by oota ##
189						sub getPrefixesEUCJP {
190	0		0	0		my $word = shift;
191
192						# given a word, return an array of prefixes which should be
193						# indexed.
194						#
195						# default routine returns all substrings
196	0					my @prefix=();
197	0					my $i = 1;
198	0					while( $i <= length( $word )) {
199	0	0				if(ord(substr($word,$i-1,1)) >= 128) {
200	0					push @prefix, substr( $word, 0, $i+1 );
201	0					$i += 2;
202						} else {
203	0					push @prefix, substr( $word, 0, $i );
204	0					$i += 1;
205						}
206						}
207
208	0					reverse @prefix;
209						}
210
211						'true';
212