File Coverage

blib/lib/Lingua/JA/Gairaigo/Fuzzy.pm

Criterion	Covered	Total	%
statement	120	134	89.5
branch	55	76	72.3
condition	36	51	70.5
subroutine	14	14	100.0
pod	1	8	12.5
total	226	283	79.8

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Gairaigo::Fuzzy;
2							require Exporter;
3							@ISA = qw(Exporter);
4							@EXPORT_OK = qw/same_gairaigo/;
5							%EXPORT_TAGS = (
6							all => \@EXPORT_OK,
7							);
8	1			1		25123	use warnings;
	1					2
	1					35
9	1			1		5	use strict;
	1					2
	1					35
10	1			1		6	use Carp;
	1					6
	1					107
11							our $VERSION = 0.06;
12	1			1		1268	use utf8;
	1					11
	1					5
13
14	1			1		930	use Text::Fuzzy 'fuzzy_index';
	1					1551
	1					71
15	1			1		1219	use Lingua::JA::Moji ':all';
	1					55785
	1					2120
16
17							sub same_gairaigo
18							{
19	13			13	1	95	my ($kana, $n, $debug) = @_;
20	13	100				39	if ($kana eq $n) {
21	1					5	return 1;
22							}
23	12	100				26	if (chouon ($kana, $n)) {
24	11					22	my $gotcha = usual_suspect ($kana, $n, $debug);
25	11	100				24	if ($gotcha) {
26	10					50	return 1;
27							}
28							}
29	2					8	return undef;
30							}
31
32							# Check a few likely things
33
34							sub usual_suspect
35							{
36	11			11	0	18	my ($kana, $n, $debug) = @_;
37
38							# The following is an undocumented routine in Text::Fuzzy.
39
40	11					35	my ($dist, $edits) = fuzzy_index ($kana, $n, 1);
41
42							# Is this a likely candidate?
43
44	11					7163	my $gotcha;
45
46	11	100				55	if ($edits =~ /ii\|dd\|rr/) {
47
48							# A double delete, double insertion, or double replace means
49							# this is unlikely to be the same word.
50
51	1					3	return;
52							}
53	10					39	my @kana = split //, $kana;
54	10					36	my @nkana = split //, $n;
55	10					36	my @edits = split //, $edits;
56
57	10	50				25	if ($debug) {
58	0					0	printf ("%d %d\n", scalar (@kana), scalar (@nkana));
59							}
60
61							# $i is the offset in @kana, and $j is the offset in @nkana. Note
62							# that @kana and @nkana may have different lengths and the offsets
63							# are adjusted as we look though what edit is necessary to change
64							# "$kana" to "$n".
65
66	10					12	my $i = 0;
67	10					13	my $j = 0;
68
69	10					15	for my $edit (@edits) {
70
71	62	50				120	if ($debug) {
72	0					0	print "i = $i, j = $j, edit = $edit\n";
73							}
74	62	100				216	if ($edit eq 'r') {
		100
		50
		50
75
76							# Replaced $k with $q.
77
78	6					12	my $k = $kana[$i];
79	6					8	my $q = $nkana[$j];
80	6	50				34	if ($debug) {
81	0					0	print "Replace $k with $q\n";
82							}
83	6	100	66			56	if ($k =~ /[ーィイ]/ && $q =~ /[ーィイ]/) {
84
85							# Check whether the previous kana ends in "e", so it
86							# is something like "ヘイ" and "ヘー".
87
88	3	100				10	if (ends_in_e (\@kana, $i)) {
89	2					4	$gotcha = 1;
90							}
91	3	50	66			39	if (($k eq 'ー' && $q eq 'イ') \|\|
			33
			66
92							($q eq 'ー' && $k eq 'イ')) {
93	3	100				12	if (ends_in_i (\@kana, $i)) {
94	1					2	$gotcha = 1;
95							}
96							}
97							}
98	6	100	100			44	if ($k =~ /[ーッ]/ && $q =~ /[ーッ]/) {
99
100							# A chouon has been replaced with a sokuon, or
101							# vice-versa.
102
103	1					2	$gotcha = 1;
104							}
105	6	100	100			58	if (($k eq 'ー' && $q eq 'ウ') \|\|
			66
			66
106							($q eq 'ー' && $k eq 'ウ')) {
107	2	50				7	if (ends_in_ou (\@kana, $i)) {
108	2					3	$gotcha = 1;
109							}
110							}
111
112							# Whatever we had, increment $i and $j equally because a
113							# character was replaced.
114
115	6					10	$i++;
116	6					10	$j++;
117							}
118							elsif ($edit eq 'd') {
119
120							# Character $k was deleted from $kana to get $n, so we
121							# just increment $i.
122
123	5					6	my $k = $kana[$i];
124	5	50	100			32	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			66
125
126							# A chouon, nakaguro, or sokuon was deleted from $kana
127							# to get $n.
128
129	5					6	$gotcha = 1;
130							}
131							# Check we are not at the end of the string.
132	5	50				13	if ($j < scalar (@kana)) {
133	5					7	my $q = $kana[$j];
134	5	50				9	if (! defined $q) {
135	0					0	warn "baba";
136							}
137	5	100				23	if ($q =~ /[ーィイ]/) {
138	2	50				6	if (ends_in_e (\@kana, $i)) {
139	0					0	$gotcha = 1;
140							}
141							}
142							}
143	5					11	$i++;
144							}
145							elsif ($edit eq 'i') {
146
147							# Character $k was inserted into $n, so we just increment
148							# $j, not $i.
149
150	0					0	my $k = $nkana[$j];
151	0	0	0			0	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			0
152
153							# A chouon, nakaguro, or sokuon was inserted into
154							# $kana to get $n.
155
156	0					0	$gotcha = 1;
157							}
158	0					0	$j++;
159							}
160							elsif ($edit eq 'k') {
161
162							# The two strings are the same at this point, so do not do
163							# any checking but just increment the offsets.
164
165	51					48	$i++;
166	51					68	$j++;
167							}
168							}
169
170							# Check we did not make a mistake scanning the two strings.
171
172	10	50				28	if ($i != scalar @kana) {
173	0					0	warn "Bug: Mismatch $i";
174							}
175	10	50				21	if ($j != scalar @nkana) {
176	0					0	warn "Bug: Mismatch $j";
177							}
178	10					44	return $gotcha;
179							}
180
181							# Work out whether the kana before the one at $i ends in "e".
182
183							sub ends_in_e
184							{
185	5			5	0	9	my ($kana_ref, $i) = @_;
186	5					6	my $prev;
187	5	50				17	if ($i >= 1) {
188	5					11	$prev = $kana_ref->[$i - 1];
189	5					18	$prev = kana2romaji ($prev);
190	5	100				3528	if ($prev =~ /e$/) {
191	2					9	return 1;
192							}
193							}
194	3					10	return undef;
195							}
196
197							# Work out whether the kana before the one at $i ends in "ou".
198
199							sub ends_in_ou
200							{
201	2			2	0	5	my ($kana_ref, $i) = @_;
202	2					3	my $prev;
203	2	50				6	if ($i >= 1) {
204	2					6	$prev = $kana_ref->[$i - 1];
205	2					7	$prev = kana2romaji ($prev);
206	2	50				851	if ($prev =~ /[ou]$/) {
207	2					8	return 1;
208							}
209							}
210	0					0	return undef;
211							}
212							# Work out whether the kana before the one at $i ends in "e".
213
214							sub ends_in_i
215							{
216	3			3	0	5	my ($kana_ref, $i) = @_;
217	3					4	my $prev;
218	3	50				9	if ($i >= 1) {
219	3					7	$prev = $kana_ref->[$i - 1];
220	3					11	$prev = kana2romaji ($prev);
221	3	100				1161	if ($prev =~ /i$/) {
222	1					4	return 1;
223							}
224							}
225	2					7	return undef;
226							}
227							# Work out whether $x and $y differ in the ways we expect.
228
229							# The name "chouon" is a misnomer.
230
231							sub chouon
232							{
233	12			12	0	16	my ($x, $y) = @_;
234	12					26	my %xa = alph ($x);
235	12					33	my %ya = alph ($y);
236	12					21	my $found;
237	12					32	my $mismatch = check (\%xa, \%ya, \$found);
238	12	100				29	if ($mismatch) {
239	1					6	return undef;
240							}
241	11					25	$mismatch = check (\%ya, \%xa, \$found);
242	11	50				26	if ($mismatch) {
243	0					0	return undef;
244							}
245	11	50				23	if ($found) {
246	11					64	return 1;
247							}
248	0					0	return undef;
249							}
250
251							# Given a word $x, make an alphabet of its consituent letters.
252
253							sub alph
254							{
255	24			24	0	32	my ($x) = @_;
256	24					24	my %xa;
257	24					94	my @xl = split //, $x;
258	24					152	@xa{@xl} = @xl;
259	24					187	return %xa;
260							}
261
262							# Go through the keys of $ya, and check whether the keys which are not
263							# in $xa are the right kind of keys.
264
265							sub check
266							{
267	23			23	0	31	my ($xa, $ya, $found) = @_;
268	23					25	my $ok;
269	23					67	for my $k (keys %$ya) {
270	128	100				305	next if $xa->{$k};
271	18	100	100			154	if ($k eq 'ー' \|\|
			66
			100
			100
			100
272							$k eq 'イ' \|\|
273							$k eq 'ィ' \|\|
274							$k eq '・' \|\|
275							$k eq 'ッ' \|\|
276							$k eq 'ウ') {
277	17					19	$ok = 1;
278	17					29	next;
279							}
280	1					4	return $k;
281							}
282	22	100				58	if ($ok) {
283	16					24	$$found = $ok;
284							}
285	22					37	return;
286							}
287
288							1;