File Coverage

blib/lib/Lingua/ZH/Toke.pm

Criterion	Covered	Total	%
statement	4	6	66.6
branch			n/a
condition			n/a
subroutine	2	2	100.0
pod			n/a
total	6	8	75.0

line	stmt	sub	time	code
1				# $File: //member/autrijus/Lingua-ZH-Toke/lib/Lingua/ZH/Toke.pm $ $Author: autrijus $
2				# $Revision: #2 $ $Change: 9667 $ $DateTime: 2004/01/11 12:56:49 $
3
4				package Lingua::ZH::Toke;
5				$Lingua::ZH::Toke::VERSION = '0.02';
6
7	3	3	2655	use strict;
	3		86
	3		135
8	3	3	2172	use Lingua::ZH::TaBE ();
	0
	0
9
10				=encoding big5
11
12				=head1 NAME
13
14				Lingua::ZH::Toke - Chinese Tokenizer
15
16				=head1 VERSION
17
18				This document describes version 0.02 of Lingua::ZH::Toke, released
19				January 11, 2004.
20
21				=head1 SYNOPSIS
22
23				use Lingua::ZH::Toke;
24
25				# -- if inputs are unicode strings, use the two lines below instead
26				# use utf8;
27				# use Lingua::ZH::Toke 'utf8';
28
29				# Create Lingua::ZH::Toke::Sentence object (->Sentence also works)
30				my $token = Lingua::ZH::Toke->new( '��H�o�b/�O��B/�q�o�N��' );
31
32				# Easy tokenization via array deferencing
33				print $token->[0] # Fragment - ��H�o�b
34				->[2] # Phrase - �o�b
35				->[0] # Character - �o
36				->[0] # Pronounciation - ��
37				->[2]; # Phonetic - ��
38
39				# Magic histogram via hash deferencing
40				print $token->{'��H�o�b'}; # 1 - One such fragment there
41				print $token->{'�N��'}; # 1 - One such phrase there
42				print $token->{'�o�N��'}; # undef - That's not a phrase
43				print $token->{'��'}; # 2 - Two such character there
44				print $token->{'��'}; # 2 - Two such pronounciation: �q�N
45				print $token->{'��'}; # 3 - Three such phonetics: ��B
46
47				# Iteration over fragments
48				while (my $fragment = <$token>) {
49				# Iteration over phrases
50				while (my $phrase = <$fragment>) {
51				# ...
52				}
53				}
54
55				=head1 DESCRIPTION
56
57				This module puts a thin wrapper around L, by blessing
58				refereces to B's objects into its English counterparts.
59
60				Besides offering more readable class names, this module also offers
61				various overloaded methods for tokenization; please see L for
62				the three major ones.
63
64				Since L is a Big5-oriented module, we also provide a
65				simple utf8 layer around it; if you have Perl version 5.6.1 or later,
66				just use this:
67
68				use utf8;
69				use Lingua::ZH::Toke 'utf8';
70
71				With the C flag set, all B objects will stringify to unicode
72				strings, and constructors will take either unicode strings, or
73				big5-encoded bytestrings.
74
75				Note that on Perl 5.6.x, L is needed for the C
76				feature to work.
77
78				=head1 METHODS
79
80				The constructor methods correspond to the six object levels:
81				C<-ESentence>, C<-EFragment>, C<-EPhrase>, C<-ECharacter>,
82				C<-EPronounciation> and C<-EPhonetic>. Each of them takes one
83				string argument, representing the string to be tokenized.
84
85				The C<-Enew> method is an alias to C<-E>Sentence>.
86
87				All object methods, except C<-Enew>, are passed to the underlying
88				B object.
89
90				=head1 CAVEATS
91
92				Under I mode, you may sometimes need to explicitly stringify
93				the return values, so their utf8 flag can be properly set:
94
95				$value = $token->[0]; # this may or may not work
96				$value = "$token->[0]"; # this is guaranteed to work
97
98				This module does not care about efficiency or memory consumption yet,
99				hence it's likely to fail miserably if you demand either of them.
100				Patches welcome.
101
102				As the name suggests, the chosen interface is very bizzare. Use it at
103				the risk of your own sanity.
104
105				=cut
106
107				use vars '$AUTOLOAD';
108
109				my @hier = qw(Chu Chunk Tsi Zhi Yin ZuYin);
110				my @name = qw(Sentence Fragment Phrase Character Pronounciation Phonetic);
111
112				my %next; @next{'', @hier} = (@hier, '');
113				my %tabe; @tabe{@hier, @name} = (@hier, @hier);
114				my %toke; @toke{@name, @hier} = (@name, @name);
115
116				for my $h (\%next, \%tabe, \%toke) {
117				$h->{_tabe($_)} = $h->{_toke($_)} = $h->{$_} for grep $_, keys %$h;
118				}
119
120				{ no strict 'refs'; @{_toke($_) . '::ISA'} = __PACKAGE__ for @name }
121
122				my (%hist, %iter, $_b2u, $_u2b);
123
124				BEGIN { $_b2u = $_u2b = sub { ${$_[0]} } }
125
126				sub import {
127				my $class = shift;
128				my $encoding = shift;
129				if ($encoding eq 'utf8') {
130				if ($] < 5.007) {
131				eval { require Encode::compat; 1 }
132				or die "Pre-5.8 perls needs Encode::compat to use the 'utf8' feature";
133				}
134
135				require Encode;
136
137				$_b2u = sub {
138				Encode::decode( big5 => ${$_[0]} )
139				};
140				$_u2b = sub {
141				Encode::is_utf8(${$_[0]})
142				? Encode::encode( big5 => ${$_[0]} )
143				: ${$_[0]};
144				};
145				}
146				}
147
148				use overload (
149				'""' => sub { $_b2u->(@_) },
150				'0+' => sub {
151				# scalar @{$_[0]}
152				# ... somehow look up freq ...
153				},
154				'@{}' => sub {
155				my $meth = ${$_[0]}->can(lc("$next{_tabe($_[0])}s")) or return [];
156				[ map bless(\$_, _toke($_)), $meth->(${$_[0]}) ]
157				},
158				'%{}' => sub {
159				$hist{overload::StrVal(${$_[0]})} \|\|= do {
160				my %o; $o{"$_"}++ for @{$_[0]};
161				my %h;
162				for my $c (@{$_[0]}) {
163				$h{"$_"} += $c->{"$_"} for keys %$c;
164				}
165				+{ %o, %h };
166				};
167				},
168				'<>' => sub {
169				$_[0]->[$iter{overload::StrVal($_[0])}++];
170				},
171				'fallback' => 1,
172				);
173
174				my $Tabe;
175
176				sub new {
177				my $class = shift;
178				my $child = $_[1] \|\| $class;
179				my $method = $tabe{ref($child) \|\| $child} \|\| $hier[0];
180				my $obj = ($Tabe \|\|= Lingua::ZH::TaBE->new)->$method($_u2b->(\$_[0]));
181				my $self = bless(\$obj, _toke($obj));
182				}
183
184				sub AUTOLOAD {
185				no strict 'refs';
186				$AUTOLOAD =~ s/.*:://;
187
188				my $name = _toke($AUTOLOAD)
189				or return ${$_[0]}->$AUTOLOAD(@_[1..$#_]);
190
191				return $name->new(@_[1..$#_]);
192				}
193
194				sub CLONE { }
195				sub DESTROY { }
196
197				sub _tabe { 'Lingua::ZH::TaBE::' . ($tabe{ref($_[0]) \|\| $_[0]} \|\| die $_[0]) }
198				sub _toke { 'Lingua::ZH::Toke::' . ($toke{ref($_[0]) \|\| $_[0]} \|\| die $_[0]) }
199
200				1;
201
202				=head1 SEE ALSO
203
204				L, L, L
205
206				=head1 AUTHORS
207
208				Autrijus Tang Eautrijus@autrijus.orgE
209
210				=head1 COPYRIGHT
211
212				Copyright 2003, 2004 by Autrijus Tang Eautrijus@autrijus.orgE.
213
214				This program is free software; you can redistribute it and/or modify it
215				under the same terms as Perl itself.
216
217				See L
218
219				=cut