|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Lingua::LO::NLP::Syllabify;  | 
| 
2
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
125151
 | 
 use strict;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
163
 | 
    | 
| 
3
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
28
 | 
 use warnings;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137
 | 
    | 
| 
4
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
87
 | 
 use 5.012000;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
    | 
| 
5
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
30
 | 
 use utf8;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
    | 
| 
6
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
168
 | 
 use feature 'unicode_strings';  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
513
 | 
    | 
| 
7
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
227
 | 
 use version 0.77; our $VERSION = version->declare('v1.0.1');  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1589
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
    | 
| 
8
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
474
 | 
 use charnames qw/ :full lao /;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
42
 | 
    | 
| 
9
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
2980
 | 
 use Carp;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
326
 | 
    | 
| 
10
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
1826
 | 
 use Unicode::Normalize qw/ NFC /;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7629
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
406
 | 
    | 
| 
11
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
903
 | 
 use Class::Accessor::Fast 'antlers';  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5922
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
    | 
| 
12
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
1996
 | 
 use Lingua::LO::NLP::Data ':all';  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2726
 | 
    | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =encoding utf8  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Lingua::LO::NLP::Syllabify - Segment Lao or mixed-script text into syllables.  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 FUNCTION  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This implements a purely regular expression based algorithm to segment Lao text  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 into syllables, based on the one described in PHISSAMAY et al:  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 I.  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has text => (is => 'ro');  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $syl_re = Lingua::LO::NLP::Data::get_sylre_basic;  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $complete_syl_re = Lingua::LO::NLP::Data::get_sylre_full;  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 METHODS  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 new  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The constructor takes a mandatory argument containing the text to split, and  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 any number of hash-style named options. Currently, the only such option is  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C which takes a boolean argument and indicates whether to run the  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 text though a normalization function that swaps tone marks and vowels appearing  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 in the wrong order.  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Note that in any case text is passed through L first  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to obtain the Composed Normal Form. In pure Lao text, this affects only the  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 decomposed form of LAO VOWEL SIGN AM that will be transformed from C,  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C to C.  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
53
 | 
77
 | 
 
 | 
 
 | 
  
77
  
 | 
  
1
  
 | 
22615
 | 
     my $class = shift;  | 
| 
54
 | 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
165
 | 
     my $text = shift;  | 
| 
55
 | 
77
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
458
 | 
     croak("`text' argument missing or undefined") unless defined $text;  | 
| 
56
 | 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
204
 | 
     my %opts = @_;  | 
| 
57
 | 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
655
 | 
     $text = NFC( $text );  | 
| 
58
 | 
76
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
261
 | 
     normalize_tone_marks($text) if $opts{normalize};  | 
| 
59
 | 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
391
 | 
     return bless { text => $text }, $class  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 get_syllables  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns a list of Lao syllables found in the text passed to the constructor. If  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 there are any blanks, non-Lao parts etc. mixed in, they will be silently  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 dropped.  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_syllables {  | 
| 
73
 | 
22
 | 
 
 | 
 
 | 
  
22
  
 | 
  
1
  
 | 
890
 | 
     return shift->text =~ m/($complete_syl_re)/og;  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 get_fragments  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns a complete segmentation of the text passed to the constructor as an  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 array of hashes. Each hash has two keys:  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item C  | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The text of the respective fragment  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item C  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 If true, the fragment is a single valid Lao syllable. If  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 false, it may be whitespace, non-Lao script, Lao characters that don't  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 constitute valid syllables - basically anything at all that's I a valid  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 syllable.  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_fragments {  | 
| 
101
 | 
53
 | 
 
 | 
 
 | 
  
53
  
 | 
  
1
  
 | 
153
 | 
     my $self = shift;  | 
| 
102
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1126
 | 
     my $t = $self->text;  | 
| 
103
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
318
 | 
     my @matches;  | 
| 
104
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3201
 | 
     while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {  | 
| 
105
 | 
122
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4732
 | 
         unless($1 eq "\N{ZERO WIDTH SPACE}") {  | 
| 
106
 | 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
268
 | 
             my $match = $1;  | 
| 
107
 | 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4151
 | 
             push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return @matches  | 
| 
111
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
248
 | 
 }  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |