line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Sentence; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
392
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
4
|
1
|
|
|
1
|
|
2
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
17
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
2
|
use Carp (); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
8
|
|
7
|
1
|
|
|
1
|
|
509
|
use File::ShareDir (); |
|
1
|
|
|
|
|
4801
|
|
|
1
|
|
|
|
|
16
|
|
8
|
1
|
|
|
1
|
|
5
|
use File::Spec (); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
9
|
|
9
|
1
|
|
|
1
|
|
718
|
use Path::Tiny (); |
|
1
|
|
|
|
|
9383
|
|
|
1
|
|
|
|
|
417
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '1.100'; |
12
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Construct a sentence splitter for one language.
#
# Arguments:
#   $class       - class to bless the new object into.
#   $lang_id     - two-letter language id (letters a-z, either case).
#   $prefix_file - optional path to a non-breaking prefix file. When it is
#                  omitted or does not exist, the language-specific file in
#                  the distribution's share directory is used, and failing
#                  that the English one.
#
# Returns a blessed hashref with the keys LangID (the id as given) and
# Nonbreaking (prefix => 1 for ordinary prefixes, prefix => 2 for prefixes
# flagged #NUMERIC_ONLY#).
#
# Croaks when the language id is invalid; dies when the prefix file finally
# selected is not a regular file.
sub new {
    my ($class, $lang_id, $prefix_file) = @_;

    # \z instead of $ so that an id with a trailing newline ("en\n") is
    # rejected, and an explicit defined() so that an undef id does not raise
    # an "uninitialized value" warning while the message is being built.
    Carp::croak('Invalid language id: ' . (defined $lang_id ? $lang_id : ''))
        unless defined $lang_id && $lang_id =~ /\A[a-z][a-z]\z/i;

    my $dir = File::ShareDir::dist_dir('Lingua-Sentence');
    my $fallback
        = File::Spec->catfile($dir, 'nonbreaking_prefix.' . $lang_id);
    my $fallback_en = File::Spec->catfile($dir, 'nonbreaking_prefix.en');

    # The id check above is case-insensitive, but file lookup may not be.
    # If the id as typed has no file, retry with the lower-cased id so that
    # 'EN' finds 'nonbreaking_prefix.en' on case-sensitive filesystems.
    # An exact-case match still wins, so existing setups are unaffected.
    if (!-e $fallback) {
        my $lower_case
            = File::Spec->catfile($dir, 'nonbreaking_prefix.' . lc($lang_id));
        $fallback = $lower_case if -e $lower_case;
    }

    # Prefer the prefix file named in the constructor call, if it exists.
    if (defined($prefix_file)) {
        unless (-e $prefix_file) {
            warn
                "WARNING: Specified prefix file '$prefix_file' does not exist, attempting fall-back to $lang_id version...\n";
            $prefix_file = $fallback;
        }
    }
    else {
        $prefix_file = $fallback;
    }

    # Default back to English if there is no language-specific prefix file.
    unless (-e $prefix_file) {
        $prefix_file = $fallback_en;
        warn
            "WARNING: No known abbreviations for language '$lang_id', attempting fall-back to English version...\n";
    }

    # Read all non-breaking prefixes into a hashref.
    my $nb_prefix = {};
    my $pt        = Path::Tiny::path($prefix_file);
    if ($pt->is_file) {
        for my $line ($pt->lines_utf8({chomp => 1})) {

            # Surrounding whitespace can never be part of a token that
            # _preprocess looks up, so an untrimmed entry would be dead.
            $line =~ s/\A\s+//;
            $line =~ s/\s+\z//;

            next if $line eq '';
            next if substr($line, 0, 1) eq '#';    # comment line

            if ($line =~ /^(.*?)\s+#NUMERIC_ONLY#/) {

                # Only non-breaking when the following token is a number.
                $nb_prefix->{$1} = 2;
            }
            else {
                $nb_prefix->{$line} = 1;
            }
        }
    }
    else {
        die("ERROR: No abbreviations files found in $dir\n");
    }

    return bless {LangID => $lang_id, Nonbreaking => $nb_prefix,}, $class;
}
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Split text into sentences, returned as one string with a newline after
# each sentence.
#
# Arguments:
#   $self - a Lingua::Sentence object.
#   $text - the text to split.
#
# Returns the string "Unnamed $self" when not invoked on a reference, the
# empty string when $text is undef or empty, and otherwise whatever
# _preprocess returns for the text.
sub split {
    my ($self, $text) = @_;

    return "Unnamed $self" if !ref $self;

    # Test for "no text" explicitly: a plain truth test would also discard
    # the valid input "0".
    return '' if !defined $text || $text eq '';

    return _preprocess($self, $text);
}
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Split text into sentences, returned as a list with one sentence per
# element.
#
# Arguments:
#   $self - a Lingua::Sentence object.
#   $text - the text to split.
#
# Returns the one-element list ("Unnamed $self") when not invoked on a
# reference, the empty list when $text is undef or empty, and otherwise the
# output of _preprocess broken up at newlines.
sub split_array {
    my ($self, $text) = @_;

    return "Unnamed $self" if !ref $self;

    # Test for "no text" explicitly: a plain truth test would also discard
    # the valid input "0".
    return if !defined $text || $text eq '';

    # split() discards trailing empty fields, so the newline that ends the
    # preprocessed text does not produce an empty last element.
    return split(/\n/, _preprocess($self, $text));
}
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Insert a newline after every sentence found in $text.
#
# Arguments:
#   $self - hashref whose Nonbreaking entry maps a prefix to 1 (never break
#           after it) or 2 (do not break after it when a number follows).
#   $text - the text to split; tokens are separated by spaces.
#
# Returns the text with one sentence per line, runs of spaces collapsed, no
# space next to a line break or at either end of the string, and a trailing
# newline.
sub _preprocess {
    my ($self, $text) = @_;

    ##### add sentence breaks as needed #####

    # Non-period end-of-sentence markers (?!) followed by sentence starters.
    $text =~ s/([?!]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # Multi-dots followed by sentence starters.
    $text
        =~ s/(\.[\.]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # Sentences that end with punctuation inside a quote or parenthetical
    # and are followed by optional sentence-starter punctuation and an
    # upper-case letter.
    $text
        =~ s/([?!\.][\ ]*['")\]\p{IsPf}]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # Sentences that end with punctuation and are followed by
    # sentence-starter punctuation and an upper-case letter.
    $text
        =~ s/([?!\.]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # The special punctuation cases are covered; now check every remaining
    # period, token by token.
    my @words = split(/ +/, $text);

    # Empty or all-space input yields no tokens. Return the same "\n" the
    # code below would produce, without reading an undefined $words[0].
    return "\n" if !@words;

    # The last token is left out of the loop so that $words[$i + 1] always
    # exists for the look-ahead.
    for my $i (0 .. $#words - 1) {
        next
            unless $words[$i]
            =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/;

        # Copy the captures at once; the matches below overwrite $1 and $2.
        my $prefix        = $1;
        my $closing_punct = $2;

        # 0 = unknown prefix, 1 = never break, 2 = no break before a number.
        my $prefix_kind
            = $prefix ? ($self->{Nonbreaking}{$prefix} || 0) : 0;

        # Known non-breaking prefix with no closing punctuation: no break.
        next if $prefix_kind == 1 && !$closing_punct;

        # Upper-case acronym: no break.
        next if $words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/;

        # Break only when the next token starts with optional opening
        # punctuation followed by an upper-case letter or a digit.
        next
            unless $words[$i + 1]
            =~ /^([ ]*['"([\x{00bf}\x{00A1}\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/;

        # ...except for a numeric-only prefix directly followed by a number.
        next
            if $prefix_kind == 2
            && !$closing_punct
            && $words[$i + 1] =~ /^[0-9]+/;

        $words[$i] .= "\n";
    }

    $text = join(' ', @words);

    # Remove spaces next to line breaks and at either end of the text, and
    # collapse any double spacing.
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # Add the trailing break.
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
1; |
166
|
|
|
|
|
|
|
__END__ |