File Coverage

blib/lib/Lingua/ZH/CCDICT.pm

Criterion	Covered	Total	%
statement	109	111	98.2
branch	42	50	84.0
condition	2	3	66.6
subroutine	27	27	100.0
pod	10	10	100.0
total	190	201	94.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::ZH::CCDICT;
2
3	1			1		6	use strict;
	1					1
	1					31
4	1			1		4	use warnings;
	1					3
	1					26
5
6	1			1		29	use 5.006001;
	1					3
	1					34
7
8	1			1		5	use vars qw ($VERSION);
	1					2
	1					67
9
10							$VERSION = '0.05';
11
12	1			1		7	use File::Spec;
	1					2
	1					24
13	1			1		302713	use Params::Validate qw(:all);
	1					421955
	1					321
14	1			1		12	use Sub::Name qw( subname );
	1					2
	1					53
15	1			1		741	use Lingua::ZH::CCDICT::Romanization;
	1					3
	1					27
16	1			1		607	use Lingua::ZH::CCDICT::Romanization::Pinyin;
	1					3
	1					50
17
18							my %storage;
19	1			1		2	BEGIN { %storage = map { lc $_ => $_ } ( qw( InMemory XML BerkeleyDB ) ) }
	3					1386
20
21
22							sub new
23							{
24	1			1	1	13	my $class = shift;
25							my %p = validate_with( params => \@_,
26							spec =>
27							{ storage =>
28							{ callbacks =>
29							{ 'is a valid storage type' =>
30	1			1		15	sub { $storage{ lc $_[0] } } },
31							},
32							},
33	1					53	allow_extra => 1,
34							);
35
36	1					14	my $storage_class = __PACKAGE__ . '::Storage::' . $storage{ lc delete $p{storage} };
37
38	1			1		8	eval "use $storage_class";
	1					2
	1					16
	1					98
39	1	50				5	die $@ if $@;
40
41	1					8	return $storage_class->new(%p);
42							}
43
44							my %Ignore = map { $_ => 1 } qw( fUTF8 fCNS11643 fGB fBig5 );
45
46							my %CCDICTToInternal =
47							( fTotalStrokes => 'stroke_count',
48							fCangjie => 'cangjie',
49							fFourCorner => 'four_corner',
50							);
51
52							my %RomanizationToInternal =
53							( fHakka => 'pinjim',
54							fCantonese => 'jyutping',
55							fMandarin => 'pinyin',
56							);
57
58							my %RomanizationType = map { $_ => 1 } values %RomanizationToInternal;
59
60							sub parse_source_file
61							{
62	1			1	1	3	my $self = shift;
63	1					3	my $file = shift;
64	1	50				5	my $status_fh = $ENV{CCDICT_VERBOSE} ? \*STDERR : undef;
65	1					2	my $lines_per = 5000;
66
67	1	50				6	unless ( defined $file )
68							{
69	1					31	my $pack_as_file = File::Spec->catfile( split /::/, __PACKAGE__ );
70	1					3	$pack_as_file .= '.pm';
71
72	1					7	(my $dir = $INC{$pack_as_file}) =~ s/\.pm$//;
73
74	1					9	$file = File::Spec->catfile( $dir, 'Data.pm' );
75							}
76
77	1	50				71	open my $fh, '<', $file
78							or die "Cannot read $file: $!";
79
80	1					2	my $last_char;
81							my %entry;
82	1					52	while (<$fh>)
83							{
84	298555					359846	chomp;
85
86	298555	100				703149	next unless substr( $_, 0, 1 ) eq 'U';
87
88	298541					864822	my ( $unicode, $type, $data ) = split /\t/, $_, 3;
89
90	298541					3137863	s/^\s+\|\s+$//g for $unicode, $type, $data;
91
92	298541	100				904113	next if $Ignore{$type};
93
94	204180					919148	my ( $codepoint, $homograph ) = $unicode =~ /U\+([\dABCDEF]+)\.(\d)/;
95
96	204180	50				471488	die "Bad line (line #$.):\n$_\n\n" unless $codepoint;
97
98							# not sure how to handle this, to be honest.
99	204180	100				397277	next if $homograph;
100
101							# generate a real Unicode character in Perl
102	197284					448090	my $unicode_char = chr( hex($codepoint) );
103
104	197284	100				403431	$last_char = $unicode_char unless defined $last_char;
105
106							# this relies on the fact that data for each char is grouped
107							# together on consecutive lines in the ccdict.txt file.
108	197284	100				420691	if ( $unicode_char ne $last_char )
109							{
110	27493					316238	$self->_add_entry( $last_char, { %entry, unicode => $last_char } );
111
112	27493					129590	%entry = ();
113
114	27493					43437	$last_char = $unicode_char;
115							}
116
117	197284	100				531646	if ( exists $CCDICTToInternal{$type} )
		100
		100
		50
118							{
119	79290					105209	my $internal = $CCDICTToInternal{$type};
120
121	79290					185006	$entry{$internal} = $data;
122							}
123							elsif ( exists $RomanizationToInternal{$type} )
124							{
125	68974					98637	my $internal = $RomanizationToInternal{$type};
126
127	68974					166604	my $class = $self->_romanization_class($internal);
128
129	68974					94736	my $is_obsolete = 0;
130	68974					256916	foreach my $syl ( split /[\s;]+/, $data )
131							{
132							# Deal with various mysterious bits and mistakes
133	134089	100				271639	next if $syl eq '}';
134	134086	100				266721	next if $syl eq '(also)';
135	128294	100				245365	next if $syl eq '(old)';
136
137	128293	100				375107	next if $syl =~ /^p?\d+$/;
138
139	128287	100				283974	next if $syl =~ /^\(coll/;
140
141	128285					192995	$syl =~ s/^{?([^}]+)}$/$1/;
142	128285					148913	$syl =~ s/{[^}]+}?$//;
143
144	128285	100				264807	if ( $syl eq 'obs' )
145							{
146	23					42	$is_obsolete = 1;
147	23					65	next;
148							}
149
150	128262					489025	my $romanized = $class->new( syllable => $syl,
151							obsolete => $is_obsolete,
152							);
153
154	128262	100				399903	next unless $romanized;
155
156	128140					151566	push @{ $entry{$internal} }, $romanized;
	128140					520078
157							}
158							}
159							elsif ( $type eq 'fR/S' )
160							{
161	27495					71459	my ( $radical, $index ) = split /\./, $data;
162
163	27495					54776	$entry{radical} = $radical;
164	27495					63466	$entry{index} = $index;
165							}
166							elsif ( $type eq 'fEnglish' )
167							{
168	41503	50				211722	$entry{english} =
169	21525					104006	[ grep { defined && length } split /\s\[\d\d?\]\s/, $data ];
170							}
171							else
172							{
173	0					0	die "Invalid line: $_\n";
174							}
175
176	197284	50	66			1125478	if ( ! ( $. % $lines_per ) && $status_fh )
177							{
178	0					0	print $status_fh "$. lines processed\n";
179							}
180							}
181
182	1					9	$self->_add_entry( $last_char, { %entry, unicode => $last_char } );
183							}
184
185							sub _add_entry
186							{
187	27494			27494		42612	my $self = shift;
188	27494					37786	my $unicode = shift;
189	27494					28460	my $entry = shift;
190
191	27494	100				64282	return unless defined $entry->{radical};
192
193	27484					95436	$self->_real_add_entry( $unicode, $entry );
194							}
195
196							sub _romanization_class
197							{
198							return
199	68974	100		68974		181562	( $_[1] eq 'pinyin' ?
200							'Lingua::ZH::CCDICT::Romanization::Pinyin' :
201							'Lingua::ZH::CCDICT::Romanization'
202							);
203							}
204
205							sub _is_romanization_type
206							{
207	8			8		41	return $RomanizationType{ $_[1] };
208							}
209
210							sub _InternalTypes
211							{
212	1			1		5	return values %CCDICTToInternal, values %RomanizationToInternal, 'index', 'radical';
213							}
214
215							# Some of these may be overridden in subclasses, but they provide an
216							# easy default
217							foreach my $type ( __PACKAGE__->_InternalTypes() )
218							{
219							my $sub =
220							__PACKAGE__->_is_romanization_type($type)
221	1			1	1	656	? sub { my $s = shift;
				1	1
				1	1
222	1					3	$s->_match( $type => map { lc } @_ ) }
	1					9
223	2			2	1	838	: sub { my $s = shift;
				2	1
				2	1
				2	1
				2	1
224	2					7	$s->_match( $type => @_ ) };
225
226							my $sub_name = "match_$type";
227	1			1		8	no strict 'refs';
	1					2
	1					74
228
229							*{$sub_name} = subname $sub_name => $sub;
230							}
231
232
233							1;
234
235							__END__