File Coverage

blib/lib/Lingua/LO/NLP/Syllabify.pm

Criterion	Covered	Total	%
statement	48	48	100.0
branch	6	6	100.0
condition			n/a
subroutine	14	14	100.0
pod	3	3	100.0
total	71	71	100.0

line	stmt	bran	sub	pod	time	code
1						package Lingua::LO::NLP::Syllabify;
2	6		6		111519	use strict;
	6				15
	6				149
3	6		6		28	use warnings;
	6				10
	6				131
4	6		6		84	use 5.012000;
	6				21
5	6		6		30	use utf8;
	6				11
	6				45
6	6		6		175	use feature 'unicode_strings';
	6				11
	6				501
7	6		6		220	use version 0.77; our $VERSION = version->declare('v1.0.1');
	6				1341
	6				33
8	6		6		488	use charnames qw/ :full lao /;
	6				10
	6				40
9	6		6		2761	use Carp;
	6				11
	6				320
10	6		6		1690	use Unicode::Normalize qw/ NFC /;
	6				7073
	6				374
11	6		6		790	use Class::Accessor::Fast 'antlers';
	6				5329
	6				38
12	6		6		1978	use Lingua::LO::NLP::Data ':all';
	6				14
	6				2045
13
14						=encoding utf8
15
16						=head1 NAME
17
18						Lingua::LO::NLP::Syllabify - Segment Lao or mixed-script text into syllables.
19
20						=head1 FUNCTION
21
22						This implements a purely regular expression based algorithm to segment Lao text
23						into syllables, based on the one described in PHISSAMAY et al:
24						I<Syllabification of Lao Script for Line Breaking>.
25
26						=cut
27
28						has text => (is => 'ro');
29
30						my $syl_re = Lingua::LO::NLP::Data::get_sylre_basic;
31						my $complete_syl_re = Lingua::LO::NLP::Data::get_sylre_full;
32
33						=head1 METHODS
34
35						=head2 new
36
37						C<< new( $text, %options ) >>
38
39						The constructor takes a mandatory argument containing the text to split, and
40						any number of hash-style named options. Currently, the only such option is
41						C<normalize> which takes a boolean argument and indicates whether to run the
42						text through a normalization function that swaps tone marks and vowels appearing
43						in the wrong order.
44
45						Note that in any case text is passed through L<Unicode::Normalize/NFC> first
46						to obtain the Composed Normal Form. In pure Lao text, this affects only the
47						decomposed form of LAO VOWEL SIGN AM that will be transformed from C<U+0ECD>,
48						C<U+0EB2> to C<U+0EB3>.
49
50						=cut
51
52						sub new {
53	77		77	1	15425	my $class = shift;
54	77				139	my $text = shift;
55	77	100			368	croak("`text' argument missing or undefined") unless defined $text;
56	76				188	my %opts = @_;
57	76				3494	$text = NFC( $text );
58	76	100			259	normalize_tone_marks($text) if $opts{normalize};
59	76				424	return bless { text => $text }, $class
60						}
61
62						=head2 get_syllables
63
64						C<< get_syllables() >>
65
66						Returns a list of Lao syllables found in the text passed to the constructor. If
67						there are any blanks, non-Lao parts etc. mixed in, they will be silently
68						dropped.
69
70						=cut
71
72						sub get_syllables {
73	22		22	1	632	return shift->text =~ m/($complete_syl_re)/og;
74						}
75
76						=head2 get_fragments
77
78						C<< get_fragments() >>
79
80						Returns a complete segmentation of the text passed to the constructor as an
81						array of hashes. Each hash has two keys:
82
83						=over 4
84
85						=item C<text>
86
87						The text of the respective fragment
88
89						=item C<is_lao>
90
91						If true, the fragment is a single valid Lao syllable. If
92						false, it may be whitespace, non-Lao script, Lao characters that don't
93						constitute valid syllables - basically anything at all that's I<not> a valid
94						syllable.
95
96						=back
97
98						=cut
99
100						sub get_fragments {
101	53		53	1	141	my $self = shift;
102	53				1088	my $t = $self->text;
103	53				363	my @matches;
104	53				2979	while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
105	122	100			4122	unless($1 eq "\N{ZERO WIDTH SPACE}") {
106	120				254	my $match = $1;
107	120				3788	push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
108						}
109						}
110						return @matches
111	53				262	}
112
113						1;