File Coverage

blib/lib/CEDict/Pinyin.pm

Criterion	Covered	Total	%
statement	59	108	54.6
branch	19	50	38.0
condition	2	11	18.1
subroutine	10	14	71.4
pod	4	5	80.0
total	94	188	50.0

line	stmt	bran	cond	sub	pod	time	code
1							package CEDict::Pinyin;
2							# Copyright (c) 2009 Christopher Davaz. All rights reserved.
3							# This program is free software; you can redistribute it and/or
4							# modify it under the same terms as Perl itself.
5	1			1		55045	use strict;
	1					2
	1					46
6	1			1		6	use warnings;
	1					13
	1					42
7	1			1		15	use vars qw($VERSION);
	1					5
	1					61
8	1			1		5	use base qw(Class::Light);
	1					3
	1					2330
9	1			1		2020	use Carp;
	1					3
	1					65
10	1			1		39081	use Encode;
	1					65330
	1					1573
11
12							$VERSION = '0.02004';
13
14							=encoding utf8
15
16							=head1 NAME
17
18							CEDict::Pinyin - Validates pinyin strings
19
20							=head1 SYNOPSIS
21
22							# This is from the test case provided with this module:
23
24							use CEDict::Pinyin;
25
26							my @good = ("ji2 - rui4 cheng2", "xi'an", "dian4 nao3, yuyan2", "kongzi");
27							my @bad = ("123", "not pinyin", "gu1 gu1 fsck4 fu3");
28							my $py = CEDict::Pinyin->new;
29
30							for (@good) {
31							$py->setSource($_);
32							ok($py->isPinyin, "correctly validated good pinyin");
33							print "pinyin: " . $py->getSource . "\n";
34							print "parts: " . join(', ', @{$py->getParts}) . "\n";
35							}
36
37							for (@bad) {
38							$py->setSource($_);
39							ok(!$py->isPinyin, "correctly invalidated bad pinyin");
40							print "pinyin: " . $py->getSource . "\n";
41							print "parts: " . join(', ', @{$py->getParts}) . "\n";
42							}
43
44							=head1 DESCRIPTION
45
46							This class helps you validate and parse pinyin. Currently the pinyin must follow
47							some rules about how it is formatted before being considered "valid" by this
48							class's validation method. All valid pinyin syllables are expressed by characters
49							within the 7-bit ASCII range. That means the validation method will fail on a
50							string like "nán nǚ lǎo shào". The pinyin should instead contain numbers after
51							the letter to represent tones. Instead of the string above we should use
52							"nan2 nv lao3 shao4". Being able to accept a string with accented characters
53							that represent the tone of the syllable is a feature I hope to add to a future
54							version of this module. The parser first takes a look at the entires string
55							you pass it to see if it is even worth parsing. The regular expression used
56							is shown below.
57
58							C<< /^[A-Za-z]+[A-Za-z1-5,'\- ]*$/ >>
59
60							If the pinyin doesn't match this regex, then isPinyin returns false and stops
61							parsing the string. All this means is that if you want to use this module to
62							validate your pinyin but your pinyin is not exactly in the same format as
63							just described then you need cleanup your pinyin strings a little bit first.
64
65							Again, hopefully future versions of this class will be more flexible in what
66							is accepted as valid pinyin. However we want to be sure that what we are looking
67							at is really pinyin and not some English words as this module was originally
68							written in part to distinguish between a pinyin string and English. I would
69							also like to keep this idea in future versions, so if you update the class
70							with your own code, please keep that in mind.
71
72							=cut
73
74							# Class Data defined here
75							my %ValidPinyin;
76							$ValidPinyin{$_} = chomp $_ while ;
77							close DATA;
78
79							# Characters containing the diacritic marks used in pinyin. The array is ordered
80							# by tone number (however since arrays are 0-indexed the first-tone diacritic
81							# information is at $Tones[0]).
82							my @Tones = (
83							{a=>"\x{0101}",e=>"\x{0113}",i=>"\x{012B}",o=>"\x{014D}",u=>"\x{016B}",v=>"\x{01D6}"},
84							{a=>"\x{00E1}",e=>"\x{00E9}",i=>"\x{00ED}",o=>"\x{00F3}",u=>"\x{00FA}",v=>"\x{01D8}"},
85							{a=>"\x{01CE}",e=>"\x{0115}",i=>"\x{01D0}",o=>"\x{01D2}",u=>"\x{01D4}",v=>"\x{01DA}"},
86							{a=>"\x{00E0}",e=>"\x{00E8}",i=>"\x{00EC}",o=>"\x{00F2}",u=>"\x{00F9}",v=>"\x{01DC}"}
87							);
88
89							sub getValidPinyin {
90	0			0	0	0	return \%ValidPinyin;
91							}
92
93							=head2 Methods
94
95							=over 4
96
97							=item C<< CEDict::Pinyin->new( >>IC<)>
98
99							Creates a new CEDict::Pinyin object. I should be a string containing
100							the pinyin you want to work with. If I is ommited it can be set later
101							using the C method.
102
103							=cut
104
105							sub _init {
106	1			1		44	my $self = shift;
107	1		50			7	my $source = shift \|\| '';
108	1					7	$self->{source} = $source;
109	1					4	$self->{parts} = [];
110							}
111
112							=item C<< $obj->setSource( >>IC<)>
113
114							Sets the source string to work with. Currently only the C method accesses
115							this attribute. Returns the previously set pinyin string.
116
117							=cut
118
119							sub setSource {
120	7			7	1	261	my $self = shift;
121	7					7	my $new = shift;
122	7					13	my $old = $self->{source};
123
124	7					12	$self->{source} = $new;
125	7					13	$self->{parts} = [];
126
127	7					19	return $old;
128							}
129
130							=item C<< $obj->diacritic >>
131
132							Returns the string of pinyin using diacritic marks instead of numbers to represent
133							tone information. For example, if the source pinyin is "nan2 nv lao3 shao4" then
134							C<< $obj->diacritic >> will return "nán nǚ lǎo shào".
135
136							=cut
137
138							# See http://www.pinyin.info/rules/where.html for a description of the
139							# rules regarding where tone marks go. These rules are copied here:
140							#
141							# * a and e trump all other vowels and always take the tone mark.
142							# There are no Mandarin syllables in Hanyu Pinyin that contain
143							# both a and e.
144							# * In the combination ou, o takes the mark.
145							# * In all other cases, the final vowel takes the mark.
146							#
147							sub diacritic {
148	0			0	1	0	my $self = shift;
149	0					0	my $source = $self->{source};
150	0					0	my $parts = [];
151	0	0				0	return unless $self->isPinyin($parts);
152	0					0	my @parts = @$parts;
153	0	0				0	return unless @parts;
154	0					0	my @string;
155
156	0					0	for my $part (@parts) {
157	0					0	$part =~ s/(.*)([1-5])/$1/;
158	0	0				0	my $tone = ($2 ? $2 : 5) - 1;
159	0	0				0	if ($tone < 4) {
160	0	0				0	if ($part =~ /([ae])/) {
		0
		0
		0
161	0					0	my $vowel = $1;
162	0					0	$part =~ s/$vowel/$Tones[$tone]{$vowel}/;
163							} elsif ($part =~ /ou/) {
164	0					0	$part =~ s/o/$Tones[$tone]{o}/;
165							} elsif ($part =~ /v/) {
166	0					0	$part =~ s/v/$Tones[$tone]{v}/;
167							} elsif (reverse $part =~ /([aeiou])/) {
168	0					0	my $vowel = $1;
169	0					0	$part =~ s/$vowel/$Tones[$tone]{$vowel}/;
170							}
171							}
172	0					0	push @string, $part;
173							}
174
175	0					0	return encode('UTF-8', join (' ', @string));
176							}
177
178							=item C<< $obj->getParts >>
179
180							Returns the individually parsed elements of the pinyin source string.
181
182							=cut
183
184							# defined by Class::Light
185
186							=item C<< $obj->getSource >>
187
188							Returns the pinyin source string that was supplied earlier either via the
189							constructor or C<< $obj->setSource >>.
190
191							=cut
192
193							# defined by Class::Light
194
195							=item C<< $obj->isPinyin >> I C<< $obj>->isPinyin( >>IC<)>
196
197							Validates the pinyin supplied to the constructor or to C<< $obj->setSource(SCALAR) >>.
198							If an I is supplied as an argument, adds each syllable of the parsed pinyin
199							to the array. If a syllable is considered invalid then the method stops parsing and
200							immediately returns false. Returns true otherwise.
201
202							=cut
203
204							sub isPinyin {
205	7			7	1	29	my $self = shift;
206	7		33			31	my $parts = shift \|\| $self->{parts};
207	7					11	my $source = $self->{source};
208	7	50				12	return unless $source;
209	7					13	$source = lc $source;
210	7	100				37	return 0 unless $source =~ /^[a-z]+[a-z1-5,'\- ]*$/;
211							# Find all the alphabetic characters before a syllable boundary ([1-5,'\- ]).
212							# The matched group may still consist of many syllables (for example, the
213							# string "shenjingbing"). So we still need to split this string into its
214							# constituent syllables.
215	6					36	my @result = $source =~ /([a-z]+[1-5]?)/g;
216	6					13	for my $validSubstring (@result) {
217	13					12	my $lastValidSubstring;
218							my $tone;
219
220	13	100				51	$tone = $1 if $validSubstring =~ /([1-5])/;
221	13					34	$validSubstring =~ s/[1-5]//;
222
223	13					31	while (1) {
224	15					18	$lastValidSubstring = $validSubstring;
225	15					26	$validSubstring = _getValidSubstring($validSubstring);
226	15	100				28	unless ($validSubstring) {
227	13	100				37	push @$parts, $lastValidSubstring . ($tone ? $tone : "");
228	13					15	last;
229							}
230	2					7	push @$parts, substr(
231							$lastValidSubstring,
232							0,
233							length($lastValidSubstring) - length($validSubstring)
234							);
235							}
236	13	100				39	return 0 unless defined $validSubstring;
237							}
238	4					21	return 1;
239							}
240
241							=item C<< CEDict::Pinyin->buildRegex( >>IC<)>
242
243							Takes a string containing pinyin and returns a regular expression that can be used with
244							the MySQL database (so far only tested against the 5.1 series). Accepts an asterisk ("*")
245							as a wildcard. Note that the C method will return false when validating such
246							a string, so if you plan on first validating the pinyin then generating the regex, make
247							sure you are validating the string without the asterisks C<($string =~ s/\*//g)>.
248
249							=back
250
251							=cut
252
253							sub buildRegex {
254	0			0	1	0	my $self = shift;
255	0	0				0	my $source = shift or return;
256	0					0	$source =~ s/\+/\/g; # Collapse redundant wildcards into one
257	0					0	my @reParts = ($source =~ /(\\|[^]+)/g);
258	0					0	my $regex = '^';
259
260	0					0	for (my $h = 0; $h < @reParts; $h++) {
261	0					0	$_ = $reParts[$h];
262	0	0				0	if ($_ eq '*') {
263	0					0	$regex .= '.*';
264	0					0	next;
265							}
266	0					0	my $pinyin = __PACKAGE__->new($_);
267	0					0	my $parts = []; $pinyin->isPinyin($parts);
	0					0
268	0					0	my @parts = @$parts;
269	0	0				0	return unless @parts;
270
271							# Check if last part is a valid pinyin substring
272	0	0				0	return unless _isValidInitialSubstring($parts[$#parts]);
273
274							# Use parts to construct a MySQL regular expression
275	0					0	for (my $i = 0; $i < @parts; $i++) {
276	0					0	$regex .= $parts[$i];
277	0	0				0	$regex .= '[1-5]?' unless $parts[$i] =~ /[1-5]/;
278	0	0	0			0	unless (
			0
279							($h == $#reParts
280							\|\| ($h == ($#reParts - 1) && $reParts[$#reParts] eq '*'))
281							&& $i == $#parts) {
282	0					0	$regex .= "[,' -]";
283							}
284							}
285							}
286
287	0					0	$regex .= '$';
288	0					0	return $regex;
289							}
290
291							# Private Static Method _isValidInitialSubstring
292							#
293							# Checks if $startsWith is the beginning of a valid pinyin string
294							sub _isValidInitialSubstring {
295	0			0		0	my $startsWith = shift;
296	0					0	$startsWith =~ s/[1-5]//;
297	0					0	for (keys %ValidPinyin) {
298	0	0				0	return 1 if /^$startsWith/;
299							}
300	0					0	return 0;
301							}
302
303							# Private Static Method _getValidSubstring
304							#
305							# _getValidSubstring returns the empty string if the entire $syllable matches
306							# a valid pinyin syllable. If only a portion of the string (starting from the
307							# beginning of the string) matches, then the rest of the string that didn't
308							# match is returned. If a match can't be found undef is returned.
309							sub _getValidSubstring {
310	15			15		24	my $syllable = shift;
311	15					17	my $part = undef;
312	15					15	my $valid = 0;
313	15	100				32	my $max = length($syllable) < 6 ? length($syllable) + 1 : 7;
314
315							# Do a quick lookup to see if the whole $syllable matches
316	15	100				50	return "" if exists $ValidPinyin{$syllable};
317
318							# Find the longest valid syllable
319	4					10	for (my $i = 1; $i < $max; $i++) {
320	18					25	$part = substr $syllable, 0, $i;
321	18	100				59	$valid = $i if exists $ValidPinyin{$part};
322							}
323
324							# $syllable is invalid so return undef
325	4	100				11	return undef unless $valid;
326
327							# Get only the valid part of $syllable
328	2					5	$part = substr $syllable, 0, $valid;
329
330	2					6	return substr $syllable, length($part), length($syllable);
331							}
332
333							1;
334
335							=head1 AUTHOR
336
337							Christopher Davaz www.chrisdavaz.com cdavaz@gmail.com
338
339							=head1 VERSION
340
341							Version 0.02004 (Mar 01 2010)
342
343							=head1 COPYRIGHT
344
345							Copyright (c) 2009 Christopher Davaz. All rights reserved.
346							This program is free software; you can redistribute it and/or
347							modify it under the same terms as Perl itself.
348
349							=cut
350
351							__DATA__