File Coverage

blib/lib/Lingua/Sentence.pm
Criterion Covered Total %
statement 77 84 91.6
branch 24 36 66.6
condition 11 24 45.8
subroutine 11 11 100.0
pod 3 3 100.0
total 126 158 79.7


line stmt bran cond sub pod time code
1             package Lingua::Sentence;
2              
3 1     1   392 use strict;
  1         1  
  1         21  
4 1     1   2 use warnings;
  1         1  
  1         17  
5              
6 1     1   2 use Carp ();
  1         1  
  1         8  
7 1     1   509 use File::ShareDir ();
  1         4801  
  1         16  
8 1     1   5 use File::Spec ();
  1         1  
  1         9  
9 1     1   718 use Path::Tiny ();
  1         9383  
  1         417  
10              
11             our $VERSION = '1.100';
12             $VERSION = eval $VERSION;
13              
14             sub new {
15 7     7 1 1043 my ($class, $lang_id, $prefix_file) = @_;
16 7 50 33     62 Carp::croak("Invalid language id: $lang_id")
17             unless ($lang_id && $lang_id =~ /^[a-z][a-z]$/i);
18              
19             # Try loading nonbreaking prefix file specified in constructor
20 7         35 my $dir = File::ShareDir::dist_dir('Lingua-Sentence');
21 7         936 my $fallback = File::Spec->catfile($dir, 'nonbreaking_prefix.' . $lang_id);
22 7         45 my $fallback_en = File::Spec->catfile($dir, 'nonbreaking_prefix.en');
23 7 100       21 if (defined($prefix_file)) {
24 1 50       17 unless (-e $prefix_file) {
25 0         0 warn
26             "WARNING: Specified prefix file '$prefix_file' does not exist, attempting fall-back to $lang_id version...\n";
27 0         0 $prefix_file = $fallback;
28             }
29             }
30             else {
31 6         8 $prefix_file = $fallback;
32             }
33              
34             #default back to English if we don't have a language-specific prefix file
35 7 100       216 unless (-e $prefix_file) {
36 1         3 $prefix_file = $fallback_en;
37 1         26 warn
38             "WARNING: No known abbreviations for language '$lang_id', attempting fall-back to English version...\n";
39             }
40              
41             # grab all non-breaking prefixes and store them in a hashref
42 7         13 my $nb_prefix = {};
43 7         34 my $pt = Path::Tiny::path($prefix_file);
44 7 50       208 if ($pt->is_file) {
45 7         193 for my $line ($pt->lines_utf8({chomp => 1})) {
46 2887 100       8137 next unless $line;
47 2856 100       4126 next if substr($line, 0, 1) eq '#';
48 2796 100       2359 if ($line =~ /^(.*?)\s+#NUMERIC_ONLY#/) {
49 10         20 $nb_prefix->{$1} = 2;
50             }
51             else {
52 2786         3274 $nb_prefix->{$line} = 1;
53             }
54             }
55             }
56             else {
57 0         0 die("ERROR: No abbreviations files found in $dir\n");
58             }
59              
60 7         363 return bless {LangID => $lang_id, Nonbreaking => $nb_prefix,}, $class;
61             }
62              
63             sub split {
64 8     8 1 2075 my $self = shift;
65 8 50       26 if (!ref $self) {
66 0         0 return "Unnamed $self";
67             }
68 8         14 my $text = shift;
69 8 50       14 if (!$text) {
70 0         0 return '';
71             }
72 8         18 return _preprocess($self, $text);
73             }
74              
75             sub split_array {
76 5     5 1 4400 my $self = shift;
77 5 50       17 if (!ref $self) {
78 0         0 return "Unnamed $self";
79             }
80 5         8 my $text = shift;
81 5 50       12 if (!$text) {
82 0         0 return ();
83             }
84 5         11 my $splittext = _preprocess($self, $text);
85 5         10 chomp $splittext;
86 5         28 return split(/\n/, $splittext);
87             }
88              
89             sub _preprocess {
90 13     13   22 my ($self, $text) = @_;
91              
92             #####add sentence breaks as needed#####
93              
94             #non-period end of sentence markers (?!) followed by sentence starters.
95 1     1   7 $text =~ s/([?!]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
  1         4  
  1         13  
  13         48  
96              
97             #multi-dots followed by sentence starters
98 13         38 $text
99             =~ s/(\.[\.]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
100              
101             # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
102 13         61 $text
103             =~ s/([?!\.][\ ]*['")\]\p{IsPf}]+) +(['"([\x{00bf}\x{00A1}\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
104              
105             # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
106 13         68 $text
107             =~ s/([?!\.]) +(['"([\x{00bf}\x{00A1}\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
108              
109             # special punctuation cases are covered. Check all remaining periods.
110 13         17 my $word;
111             my $i;
112 13         111 my @words = split(/ +/, $text);
113 13         20 $text = "";
114 13         40 for ($i = 0; $i < (scalar(@words) - 1); $i++) {
115 135 100       317 if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
116              
117             #check if $1 is a known honorific and $2 is empty, never break
118 13         27 my $prefix = $1;
119 13         20 my $starting_punct = $2;
120 13 100 66     146 if ( $prefix
    50 66        
    50 66        
121             && $self->{Nonbreaking}{$prefix}
122             && $self->{Nonbreaking}{$prefix} == 1
123             && !$starting_punct)
124             {
125             #not breaking;
126             }
127             elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
128              
129             #not breaking - upper case acronym
130             }
131             elsif ($words[$i + 1]
132             =~ /^([ ]*['"([\x{00bf}\x{00A1}\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/)
133             {
134             #the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
135             $words[$i] = $words[$i] . "\n"
136             unless ($prefix
137             && $self->{Nonbreaking}{$prefix}
138 11 0 66     71 && $self->{Nonbreaking}{$prefix} == 2
      33        
      33        
      0        
139             && !$starting_punct
140             && ($words[$i + 1] =~ /^[0-9]+/));
141              
142             #we always add a return for these unless we have a numeric non-breaker and a number start
143             }
144              
145             }
146 135         447 $text = $text . $words[$i] . " ";
147             }
148              
149             #we stopped one token from the end to allow for easy look-ahead. Append it now.
150 13         24 $text = $text . $words[$i];
151              
152             # clean up spaces at head and tail of each line as well as any double-spacing
153 13         141 $text =~ s/ +/ /g;
154 13         40 $text =~ s/\n /\n/g;
155 13         19 $text =~ s/ \n/\n/g;
156 13         17 $text =~ s/^ //g;
157 13         17 $text =~ s/ $//g;
158              
159             #add trailing break
160 13 50       37 $text .= "\n" unless $text =~ /\n$/;
161              
162 13         75 return $text;
163             }
164              
165             1;
166             __END__