line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::LO::NLP::Romanize; |
2
|
5
|
|
|
5
|
|
406083
|
use strict; |
|
5
|
|
|
|
|
21
|
|
|
5
|
|
|
|
|
146
|
|
3
|
5
|
|
|
5
|
|
27
|
use warnings; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
302
|
|
4
|
5
|
|
|
5
|
|
176
|
use 5.012000; |
|
5
|
|
|
|
|
22
|
|
5
|
5
|
|
|
5
|
|
27
|
use utf8; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
34
|
|
6
|
5
|
|
|
5
|
|
1020
|
use version 0.77; our $VERSION = version->declare('v1.0.1'); |
|
5
|
|
|
|
|
6099
|
|
|
5
|
|
|
|
|
43
|
|
7
|
5
|
|
|
5
|
|
554
|
use Carp; |
|
5
|
|
|
|
|
13
|
|
|
5
|
|
|
|
|
386
|
|
8
|
5
|
|
|
5
|
|
33
|
use Scalar::Util 'blessed'; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
251
|
|
9
|
5
|
|
|
5
|
|
854
|
use Class::Accessor::Fast 'antlers'; |
|
5
|
|
|
|
|
8306
|
|
|
5
|
|
|
|
|
45
|
|
10
|
5
|
|
|
5
|
|
1578
|
use Lingua::LO::NLP::Syllabify; |
|
5
|
|
|
|
|
28
|
|
|
5
|
|
|
|
|
61
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=encoding utf8 |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Lingua::LO::NLP::Romanize - Romanize Lao syllables |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 FUNCTION |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
This is a factory class for C. Currently there |
21
|
|
|
|
|
|
|
are the following romanization modules: |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=over 4 |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=item L |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
for the standard set by the |
28
|
|
|
|
|
|
|
L |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=item L |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
for the International Phonetic Alphabet |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=back |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 SYNOPSIS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
my $o = Lingua::LO::NLP::Romanize->new( |
39
|
|
|
|
|
|
|
variant => 'PCGN', |
40
|
|
|
|
|
|
|
hyphen => 1, |
41
|
|
|
|
|
|
|
); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 METHODS |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 new |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The constructor takes any number of hash-style named arguments. The following |
50
|
|
|
|
|
|
|
ones are always recognized: |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=over 4 |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item C |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Standard according to which to romanize; this determines the |
57
|
|
|
|
|
|
|
L subclass to actually instantiate. This argument is |
58
|
|
|
|
|
|
|
mandatory. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item C |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Separate runs of Lao syllables with "hyphens". Set this to the character you |
63
|
|
|
|
|
|
|
would like to use as a hyphen - usually this will be the ASCII "hyphen minus" |
64
|
|
|
|
|
|
|
(U+002D) but it can be the unambiguous Unicode hyphen ("โ", U+2010), a slash or |
65
|
|
|
|
|
|
|
anything you like (except for the special-cased '0' and '1' - but you wouldn't |
66
|
|
|
|
|
|
|
want those between your syllables anyway!). As a special case, you can pass a 1 |
67
|
|
|
|
|
|
|
to use the ASCII version. If this argument is missing, C or C<0>, blanks |
68
|
|
|
|
|
|
|
are used. Syllables duplicated using "เป" are always joined with a hyphen: |
69
|
|
|
|
|
|
|
either the one you specify or the ASCII one. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item C |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Run text through tone mark order normalization; see |
74
|
|
|
|
|
|
|
L. If your text looks fine but |
75
|
|
|
|
|
|
|
syllables are not recognized, you may need this. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=back |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Subclasses may specify additional arguments, such as |
80
|
|
|
|
|
|
|
L's C that controls the rendering of |
81
|
|
|
|
|
|
|
IPA diacritics for tonal languages. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub new { |
86
|
11
|
|
|
11
|
1
|
3036
|
my ($class, %args) = @_; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Allow subclasses to omit a constructor |
89
|
11
|
100
|
|
|
|
45
|
return bless {}, $class if $class ne __PACKAGE__; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# If we've been called on Lingua::LO::NLP::Romanize, require a variant |
92
|
10
|
100
|
|
|
|
233
|
my $variant = delete $args{variant} or croak("`variant' argument missing or undefined"); |
93
|
9
|
|
|
|
|
21
|
my $hyphen = delete $args{hyphen}; |
94
|
9
|
|
|
|
|
17
|
my $normalize = delete $args{normalize}; |
95
|
|
|
|
|
|
|
|
96
|
9
|
|
|
|
|
64
|
my $subclass = __PACKAGE__ . "::$variant"; |
97
|
9
|
|
|
|
|
67
|
(my $module = $subclass) =~ s!::!/!g; |
98
|
9
|
|
|
|
|
1954
|
require "$module.pm"; ## no critic (BarewordIncludes) |
99
|
|
|
|
|
|
|
|
100
|
9
|
|
|
|
|
213
|
my $self = $subclass->new(%args); |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Pass an explicit false if hyphen arg was unset |
103
|
9
|
|
100
|
|
|
80
|
$self->hyphen($hyphen // 0); |
104
|
9
|
|
|
|
|
57
|
$self->normalize($normalize); |
105
|
9
|
|
|
|
|
124
|
return $self; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 romanize |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
romanize( $text ) |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Return the romanization of C<$text> according to the standard passed to the |
113
|
|
|
|
|
|
|
constructor. Text is split up by |
114
|
|
|
|
|
|
|
L; Lao syllables are processed |
115
|
|
|
|
|
|
|
and everything else is passed through unchanged save for possible conversion of |
116
|
|
|
|
|
|
|
combining characters to a canonically equivalent form by |
117
|
|
|
|
|
|
|
L. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub romanize { |
122
|
50
|
|
|
50
|
1
|
16971
|
my ($self, $text) = @_; |
123
|
50
|
|
|
|
|
86
|
my $result = ''; |
124
|
|
|
|
|
|
|
|
125
|
50
|
|
|
|
|
128
|
my @frags = Lingua::LO::NLP::Syllabify->new( $text, normalize => $self->normalize )->get_fragments; |
126
|
50
|
|
|
|
|
172
|
while(@frags) { |
127
|
51
|
|
|
|
|
65
|
my @lao; |
128
|
51
|
|
100
|
|
|
312
|
push @lao, shift @frags while @frags and $frags[0]->{is_lao}; |
129
|
51
|
|
|
|
|
117
|
$result .= join($self->{hyphen}, map { $self->romanize_syllable( $_->{text} ) } @lao); |
|
100
|
|
|
|
|
251
|
|
130
|
50
|
|
100
|
|
|
256
|
$result .= (shift @frags)->{text} while @frags and not $frags[0]->{is_lao}; |
131
|
|
|
|
|
|
|
} |
132
|
49
|
|
|
|
|
304
|
return $result; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=head2 romanize_syllable |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
romanize_syllable( $syllable | $analysis ) |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Return the romanization of a single C<$syllable> according to the standard |
140
|
|
|
|
|
|
|
passed to the constructor. This method accepts either a plain string or an |
141
|
|
|
|
|
|
|
analysis result from L. The latter helps avoid |
142
|
|
|
|
|
|
|
redundant parsing if you need both an analysis and a romanization. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=cut |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub romanize_syllable { |
147
|
100
|
|
|
100
|
1
|
181
|
my ($self, $thing) = @_; |
148
|
100
|
50
|
|
|
|
252
|
unless( blessed($thing) ) { |
149
|
|
|
|
|
|
|
# Analyze syllable first unless we got an analysis result already |
150
|
|
|
|
|
|
|
# (we just assume it is one if we have an object) |
151
|
100
|
|
|
|
|
261
|
$thing = Lingua::LO::NLP::Analyze->new($thing); |
152
|
|
|
|
|
|
|
} |
153
|
100
|
|
|
|
|
284
|
return $self->_romanize_syllable( $thing ); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 _romanize_syllable |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
_romanize_syllable( $analysis ) |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Return the romanization of a syllable passed in as a 'Lingua::LO::NLP::Analyze' |
161
|
|
|
|
|
|
|
result, according to the standard passed to the constructor. This is a virtual |
162
|
|
|
|
|
|
|
method that must be implemented by subclasses. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub _romanize_syllable { |
167
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
168
|
1
|
|
|
|
|
23
|
die blessed($self) . " must implement _romanize_syllable()"; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head2 hyphen |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
my $hyphen = $o->hyphen; |
174
|
|
|
|
|
|
|
$o->hyphen( '-' ); # Use ASCII hyphen |
175
|
|
|
|
|
|
|
$o->hyphen( 1 ); # Dito |
176
|
|
|
|
|
|
|
$o->hyphen( 0 ); # No hyphenation, separate syllables with spaces |
177
|
|
|
|
|
|
|
$o->hyphen( 'โ' ); # Unicode hyphen U+2010 |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Accessor for the C attribute, see L. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub hyphen { |
184
|
12
|
|
|
12
|
1
|
33
|
my ($self, $hyphen) = @_; |
185
|
12
|
50
|
|
|
|
35
|
if(defined $hyphen) { |
186
|
12
|
100
|
|
|
|
61
|
if($hyphen eq '1') { |
|
|
100
|
|
|
|
|
|
187
|
2
|
|
|
|
|
13
|
$self->{hyphen} = '-'; |
188
|
|
|
|
|
|
|
} elsif($hyphen eq '0') { |
189
|
7
|
|
|
|
|
44
|
$self->{hyphen} = ' '; |
190
|
|
|
|
|
|
|
} else { |
191
|
3
|
|
|
|
|
7
|
$self->{hyphen} = $hyphen; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
12
|
|
|
|
|
29
|
return $self->{hyphen}; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 normalize |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
my $normalization = $o->normalize; |
200
|
|
|
|
|
|
|
$o->normalize( $bool ); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Accessor for the C attribute, see L. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
has normalize => (is => 'rw'); |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
1; |
209
|
|
|
|
|
|
|
|