File Coverage

blib/lib/Lingua/Sentence.pm

Criterion	Covered	Total	%
statement	77	84	91.6
branch	24	36	66.6
condition	11	24	45.8
subroutine	11	11	100.0
pod	3	3	100.0
total	126	158	79.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Sentence;
2
3	1			1		392	use strict;
	1					1
	1					21
4	1			1		2	use warnings;
	1					1
	1					17
5
6	1			1		2	use Carp ();
	1					1
	1					8
7	1			1		509	use File::ShareDir ();
	1					4801
	1					16
8	1			1		5	use File::Spec ();
	1					1
	1					9
9	1			1		718	use Path::Tiny ();
	1					9383
	1					417
10
11							our $VERSION = '1.100';
12							$VERSION = eval $VERSION;
13
14							sub new {
15	7			7	1	1043	my ($class, $lang_id, $prefix_file) = @_;
16	7	50	33			62	Carp::croak("Invalid language id: $lang_id")
17							unless ($lang_id && $lang_id =~ /^[a-z][a-z]$/i);
18
19							# Try loading nonbreaking prefix file specified in constructor
20	7					35	my $dir = File::ShareDir::dist_dir('Lingua-Sentence');
21	7					936	my $fallback = File::Spec->catfile($dir, 'nonbreaking_prefix.' . $lang_id);
22	7					45	my $fallback_en = File::Spec->catfile($dir, 'nonbreaking_prefix.en');
23	7	100				21	if (defined($prefix_file)) {
24	1	50				17	unless (-e $prefix_file) {
25	0					0	warn
26							"WARNING: Specified prefix file '$prefix_file' does not exist, attempting fall-back to $lang_id version...\n";
27	0					0	$prefix_file = $fallback;
28							}
29							}
30							else {
31	6					8	$prefix_file = $fallback;
32							}
33
34							#default back to English if we don't have a language-specific prefix file
35	7	100				216	unless (-e $prefix_file) {
36	1					3	$prefix_file = $fallback_en;
37	1					26	warn
38							"WARNING: No known abbreviations for language '$lang_id', attempting fall-back to English version...\n";
39							}
40
41							# grab all non-breaking prefixes and store them in a hashref
42	7					13	my $nb_prefix = {};
43	7					34	my $pt = Path::Tiny::path($prefix_file);
44	7	50				208	if ($pt->is_file) {
45	7					193	for my $line ($pt->lines_utf8({chomp => 1})) {
46	2887	100				8137	next unless $line;
47	2856	100				4126	next if substr($line, 0, 1) eq '#';
48	2796	100				2359	if ($line =~ /^(.*?)\s+#NUMERIC_ONLY#/) {
49	10					20	$nb_prefix->{$1} = 2;
50							}
51							else {
52	2786					3274	$nb_prefix->{$line} = 1;
53							}
54							}
55							}
56							else {
57	0					0	die("ERROR: No abbreviations files found in $dir\n");
58							}
59
60	7					363	return bless {LangID => $lang_id, Nonbreaking => $nb_prefix,}, $class;
61							}
62
63							sub split {
64	8			8	1	2075	my $self = shift;
65	8	50				26	if (!ref $self) {
66	0					0	return "Unnamed $self";
67							}
68	8					14	my $text = shift;
69	8	50				14	if (!$text) {
70	0					0	return '';
71							}
72	8					18	return _preprocess($self, $text);
73							}
74
75							sub split_array {
76	5			5	1	4400	my $self = shift;
77	5	50				17	if (!ref $self) {
78	0					0	return "Unnamed $self";
79							}
80	5					8	my $text = shift;
81	5	50				12	if (!$text) {
82	0					0	return ();
83							}
84	5					11	my $splittext = _preprocess($self, $text);
85	5					10	chomp $splittext;
86	5					28	return split(/\n/, $splittext);
87							}
88
89							sub _preprocess {
90	13			13		22	my ($self, $text) = @_;
91
92							#####add sentence breaks as needed#####
93
94							#non-period end of sentence markers (?!) followed by sentence starters.
95	1			1		7	$text =~ s/([?!]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
	1					4
	1					13
	13					48
96
97							#multi-dots followed by sentence starters
98	13					38	$text
99							=~ s/(\.[\.]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
100
101							# add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
102	13					61	$text
103							=~ s/([?!\.][\ ]['")\]\p{IsPf}]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}][\ ]*[\p{IsUpper}])/$1\n$2/g;
104
105							# add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
106	13					68	$text
107							=~ s/([?!\.]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
108
109							# special punctuation cases are covered. Check all remaining periods.
110	13					17	my $word;
111							my $i;
112	13					111	my @words = split(/ +/, $text);
113	13					20	$text = "";
114	13					40	for ($i = 0; $i < (scalar(@words) - 1); $i++) {
115	135	100				317	if ($words[$i] =~ /([\p{IsAlnum}\.\-])([\'\"\)\]\%\p{IsPf}])(\.+)$/) {
116
117							#check if $1 is a known honorific and $2 is empty, never break
118	13					27	my $prefix = $1;
119	13					20	my $starting_punct = $2;
120	13	100	66			146	if ( $prefix
		50	66
		50	66
121							&& $self->{Nonbreaking}{$prefix}
122							&& $self->{Nonbreaking}{$prefix} == 1
123							&& !$starting_punct)
124							{
125							#not breaking;
126							}
127							elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
128
129							#not breaking - upper case acronym
130							}
131							elsif ($words[$i + 1]
132							=~ /^([ ]['"([\x{00bf}\x{00A1}\p{IsPi}][ ]*[\p{IsUpper}0-9])/)
133							{
134							#the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
135							$words[$i] = $words[$i] . "\n"
136							unless ($prefix
137							&& $self->{Nonbreaking}{$prefix}
138	11	0	66			71	&& $self->{Nonbreaking}{$prefix} == 2
			33
			33
			0
139							&& !$starting_punct
140							&& ($words[$i + 1] =~ /^[0-9]+/));
141
142							#we always add a return for these unless we have a numeric non-breaker and a number start
143							}
144
145							}
146	135					447	$text = $text . $words[$i] . " ";
147							}
148
149							#we stopped one token from the end to allow for easy look-ahead. Append it now.
150	13					24	$text = $text . $words[$i];
151
152							# clean up spaces at head and tail of each line as well as any double-spacing
153	13					141	$text =~ s/ +/ /g;
154	13					40	$text =~ s/\n /\n/g;
155	13					19	$text =~ s/ \n/\n/g;
156	13					17	$text =~ s/^ //g;
157	13					17	$text =~ s/ $//g;
158
159							#add trailing break
160	13	50				37	$text .= "\n" unless $text =~ /\n$/;
161
162	13					75	return $text;
163							}
164
165							1;
166							__END__