File Coverage

blib/lib/Lingua/LO/Transform/Syllables.pm
Criterion Covered Total %
statement 44 44 100.0
branch 1 2 50.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 62 63 98.4


line stmt bran cond sub pod time code
1             package Lingua::LO::Transform::Syllables;
2 3     3   222696 use strict;
  3         3  
  3         75  
3 3     3   10 use warnings;
  3         4  
  3         72  
4 3     3   65 use 5.012000;
  3         9  
5 3     3   10 use utf8;
  3         5  
  3         13  
6 3     3   55 use feature 'unicode_strings';
  3         3  
  3         262  
7 3     3   505 use version 0.77; our $VERSION = version->declare('v0.0.1');
  3         1472  
  3         15  
8 3     3   663 use charnames qw/ :full lao /;
  3         22716  
  3         17  
9 3     3   19988 use Carp;
  3         3  
  3         207  
10 3     3   1130 use Unicode::Normalize qw/ NFC /;
  3         312834  
  3         367  
11 3     3   1819 use Class::Accessor::Fast 'antlers';
  3         6351  
  3         24  
12 3     3   1042 use Lingua::LO::Transform::Data;
  3         4  
  3         884  
13              
14             =encoding UTF-8
15              
16             =head1 NAME
17              
18             Lingua::LO::Transform::Syllables - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text into syllables, based
23             on the one described in PHISSAMAY et al: I.
24              
25             =cut
26              
27             has text => (is => 'ro');
28              
29             my $syl_re = Lingua::LO::Transform::Data::get_sylre_basic;
30             my $complete_syl_re = Lingua::LO::Transform::Data::get_sylre_full;
31              
32             =head1 METHODS
33              
34             =head2 new
35              
36             C $text, ... )>
37              
38             The constructor takes hash-style named arguments. The only one defined so far
39             is C whose value is obviously the text to be segmented.
40              
41             Note that text is passed through L<"Unicode::Normalize"/NFC> first to obtain the Composed Normal Form. In pure Lao text, this affects only the decomposed form of LAO VOWEL SIGN AM that will be transformed from C,C to C.
42              
43             =cut
44              
45             sub new {
46 41     41 1 9569 my $class = shift;
47 41         78 my %opts = @_;
48 41 50       89 croak("`text' key missing or undefined") unless defined $opts{text};
49             return bless {
50 41         99 text => NFC( $opts{text} ),
51             }, $class;
52             }
53              
54             =head2 get_syllables
55              
56             C
57              
58             Returns a list of Lao syllables found in the text passed to the constructor. If
59             there are any blanks, non-Lao parts etc. mixed in, they will be silently
60             dropped.
61              
62             =cut
63              
64             sub get_syllables {
65 20     20 1 4006 return shift->text =~ m/($complete_syl_re)/og;
66             }
67              
68             =head2 get_fragments
69              
70             C
71              
72             Returns a complete segmentation of the text passed to the constructor as an
73             array of hashes. Each hash has two keys:
74              
75             =over 4
76              
77             =item C: the text of the respective fragment
78              
79             =item C: if true, the fragment is a single valid Lao syllable. If
80             false, it may be whitespace, non-Lao script, Lao characters that don't
81             constitute valid syllables - basically anything at all that's I a valid
82             syllable.
83              
84             =back
85              
86             =cut
87              
88             sub get_fragments {
89 20     20 1 2501 my $self = shift;
90 20         46 my $t = $self->text;
91 20         70 my @matches;
92 20         1259 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
93 45         2457 my $match = $1;
94 45         1364 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
95             }
96             return @matches
97 20         86 }
98              
99             1;