line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::LO::NLP; |
2
|
2
|
|
|
2
|
|
149656
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
59
|
|
3
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
63
|
|
4
|
2
|
|
|
2
|
|
65
|
use 5.012000; |
|
2
|
|
|
|
|
7
|
|
5
|
2
|
|
|
2
|
|
9
|
use utf8; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
13
|
|
6
|
2
|
|
|
2
|
|
55
|
use feature 'unicode_strings'; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
184
|
|
7
|
2
|
|
|
2
|
|
316
|
use version 0.77; our $VERSION = version->declare('v1.0.1'); |
|
2
|
|
|
|
|
1918
|
|
|
2
|
|
|
|
|
12
|
|
8
|
2
|
|
|
2
|
|
550
|
use Lingua::LO::NLP::Syllabify; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
20
|
|
9
|
2
|
|
|
2
|
|
515
|
use Lingua::LO::NLP::Analyze; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
24
|
|
10
|
2
|
|
|
2
|
|
615
|
use Lingua::LO::NLP::Romanize; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
14
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=encoding utf8 |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Lingua::LO::NLP - Various Lao text processing functions |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use utf8; |
21
|
|
|
|
|
|
|
use 5.10.1; |
22
|
|
|
|
|
|
|
use open qw/ :std :encoding(UTF-8) /; |
23
|
|
|
|
|
|
|
use Lingua::LO::NLP; |
24
|
|
|
|
|
|
|
use Data::Dumper; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $lao = Lingua::LO::NLP->new; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my @syllables = $lao->split_to_syllables("ສະບາຍດີ"); # qw( ສະ ບາຍ ດີ ) |
29
|
|
|
|
|
|
|
print Dumper(\@syllables); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
for my $syl (@syllables) { |
32
|
|
|
|
|
|
|
my $analysis = $lao->analyze_syllable($syl); |
33
|
|
|
|
|
|
|
printf "%s: %s\n", $analysis->syllable, $analysis->tone; |
34
|
|
|
|
|
|
|
# ສະ: TONE_HIGH_STOP |
35
|
|
|
|
|
|
|
# ບາຍ: TONE_LOW |
36
|
|
|
|
|
|
|
# ດີ: TONE_LOW |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
say $lao->romanize("ສະບາຍດີ", variant => 'PCGN', hyphen => "\N{HYPHEN}"); # sa‐bay‐di |
40
|
|
|
|
|
|
|
say $lao->romanize("ສະບາຍດີ", variant => 'IPA'); # saʔ baːj diː |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 DESCRIPTION |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
This module provides various functions for processing Lao text. Currently it can |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over 4 |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
split Lao text (usually written without blanks between words) into syllables |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
analyze syllables with regards to core and end consonants, vowels, tone and |
55
|
|
|
|
|
|
|
other properties |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=item |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
romanize Lao text according to the PCGN standard or to IPA (experimental) |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=back |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
These functions are basically just shortcuts to the functionality of some |
64
|
|
|
|
|
|
|
specialized modules: L<Lingua::LO::NLP::Syllabify>, |
65
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Analyze> and L<Lingua::LO::NLP::Romanize>. If |
66
|
|
|
|
|
|
|
you need only one of them, you can shave off a little overhead by using those |
67
|
|
|
|
|
|
|
directly. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 METHODS |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 new |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
new(option => value, ...) |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head3 Options |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=over 4 |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=item * C<normalize>: passed to L<Lingua::LO::NLP::Syllabify/new> and L<Lingua::LO::NLP::Analyze/new>. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=back |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
# Constructor. Takes a flat list of option => value pairs and keeps them
# verbatim in the blessed hash that becomes the object. The other methods of
# this class only ever read the "normalize" entry from it.
sub new {
    my ($class, %opts) = @_;
    my $self = bless {%opts}, $class;
    return $self;
}
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head2 split_to_syllables |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
my @syllables = $object->split_to_syllables( $text, %options ); |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Split Lao text into its syllables using a regexp modelled after PHISSAMAY, |
95
|
|
|
|
|
|
|
DALALOY and DURRANI: I<Syllabification of Lao Script for Line Breaking>. Takes |
96
|
|
|
|
|
|
|
as its only mandatory parameter a character string to split and optionally a |
97
|
|
|
|
|
|
|
number of named options; see L<Lingua::LO::NLP::Syllabify/new> for those. |
98
|
|
|
|
|
|
|
Returns a list of syllables. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=cut |
101
|
|
|
|
|
|
|
# Thin wrapper around Lingua::LO::NLP::Syllabify: constructs a syllabifier for
# $text and returns whatever its get_syllables method returns.
#
# The object's own "normalize" setting is placed before the caller's options,
# so a "normalize" among those options comes later in the argument list
# (presumably letting it win if the constructor reads them as a hash).
sub split_to_syllables {
    my ($self, $text, @options) = @_;

    my $syllabifier = Lingua::LO::NLP::Syllabify->new(
        $text,
        normalize => $self->{normalize},
        @options,
    );
    return $syllabifier->get_syllables;
}
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head2 analyze_syllable |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
my $classified = $object->analyze_syllable( $syllable, %options ); |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Returns a L<Lingua::LO::NLP::Analyze> object that allows you to query |
117
|
|
|
|
|
|
|
various syllable properties such as core consonant, tone mark, vowel length and |
118
|
|
|
|
|
|
|
tone. See there for details. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=cut |
121
|
|
|
|
|
|
|
# Thin wrapper around Lingua::LO::NLP::Analyze: returns the object produced by
# its constructor for $syllable.
#
# The object's own "normalize" setting is placed before the caller's options,
# so a "normalize" among those options comes later in the argument list.
sub analyze_syllable {
    my ($self, $syllable, @options) = @_;

    my @ctor_args = (
        $syllable,
        normalize => $self->{normalize},
        @options,
    );
    return Lingua::LO::NLP::Analyze->new(@ctor_args);
}
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 romanize |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
$object->romanize( $lao, %options ); |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Returns a romanized version of the text passed in as C<$lao>. See |
136
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Romanize/new> for options. The default C<variant> is 'PCGN'. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
139
|
|
|
|
|
|
|
# Romanize $lao via Lingua::LO::NLP::Romanize. All named options are handed to
# that class's constructor; "variant" falls back to 'PCGN' when it is missing
# or undefined. The invocant is not consulted.
sub romanize {
    shift;    # discard the invocant
    my ($lao, %options) = @_;

    $options{variant} = 'PCGN' unless defined $options{variant};

    my $romanizer = Lingua::LO::NLP::Romanize->new(%options);
    return $romanizer->romanize($lao);
}
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=head2 analyze_text |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
my @syllables = $object->analyze_text( $text, %options ); |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Split Lao text into its syllables and analyze them, returning an array of |
150
|
|
|
|
|
|
|
hashes. Each hash has at least a key 'analysis' with a |
151
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Analyze> object as a value. If the C<romanize> option is set |
152
|
|
|
|
|
|
|
to a true value, it also has a "romanization" key. In this case, the C<variant> |
153
|
|
|
|
|
|
|
option (see L<Lingua::LO::NLP::Romanize/new>) is also required. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=cut |
156
|
|
|
|
|
|
|
# Split $text into syllables and return one hash reference per syllable.
#
# Parameters:
#   $text  - the text to split
#   %opts  - named options. A true "romanize" switches on romanization; it is
#            removed from %opts before anything else sees it. The remaining
#            options go to the Lingua::LO::NLP::Romanize constructor (only
#            when romanizing) and to the Lingua::LO::NLP::Syllabify
#            constructor.
#
# Returns a list of hash references, each with:
#   analysis     - a Lingua::LO::NLP::Analyze object for the syllable, as the
#                  method's documentation promises. This used to be the bare
#                  syllable string.
#   romanization - result of the romanizer's romanize_syllable for the
#                  syllable; present only when "romanize" was true.
sub analyze_text {
    my ($self, $text, %opts) = @_;

    # The condition runs first, so "romanize" is already gone from %opts by
    # the time it is handed to the Romanize constructor.
    my $romanizer;
    $romanizer = Lingua::LO::NLP::Romanize->new(%opts) if delete $opts{romanize};

    # NOTE(review): romanizer options such as "variant" are still in %opts
    # here and reach Syllabify->new as well -- confirm that constructor
    # tolerates them.
    my @syllables = Lingua::LO::NLP::Syllabify->new(
        $text,
        normalize => $self->{normalize},
        %opts
    )->get_syllables;

    return map {
        my $syllable = $_;

        # Constructed the same way analyze_syllable does it.
        my $analysis = Lingua::LO::NLP::Analyze->new(
            $syllable,
            normalize => $self->{normalize},
        );

        my %entry = ( analysis => $analysis );
        $entry{romanization} = $romanizer->romanize_syllable($syllable)
            if $romanizer;
        \%entry;
    } @syllables;
}
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 SEE ALSO |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
L<Lingua::LO::Romanize> is the module that inspired this one. It has some |
184
|
|
|
|
|
|
|
issues with ambiguous syllable boundaries as in "ໃນວົງ" though. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 AUTHOR |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Matthias Bethke, E<lt>matthias@towiski.deE<gt> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Copyright (C) 2016-2017 by Matthias Bethke |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it under |
195
|
|
|
|
|
|
|
the same terms as Perl itself, either Perl version 5.14.2 or, at your option, |
196
|
|
|
|
|
|
|
any later version of Perl 5 you may have available. Significant portions of the |
197
|
|
|
|
|
|
|
code are (C) PostgreSQL Global Development Group and The Regents of the |
198
|
|
|
|
|
|
|
University of California. All modified versions must retain the file COPYRIGHT |
199
|
|
|
|
|
|
|
included in the distribution. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=cut |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1; |