File Coverage

blib/lib/Treex/Tool/Segment/RuleBased.pm
Criterion Covered Total %
statement 53 117 45.3
branch 7 36 19.4
condition 0 21 0.0
subroutine 14 20 70.0
pod 6 13 46.1
total 80 207 38.6


line stmt bran cond sub pod time code
1             package Treex::Tool::Segment::RuleBased;
2             $Treex::Tool::Segment::RuleBased::VERSION = '2.20151102';
3 2     2   20384 use strict;
  2         4  
  2         55  
4 2     2   8 use warnings;
  2         4  
  2         56  
5 2     2   843 use utf8;
  2         13  
  2         12  
6 2     2   836 use Moose;
  2         467198  
  2         16  
7 2     2   12545 use Treex::Core::Common;
  2         599247  
  2         16  
8              
9             has use_paragraphs => (
10             is => 'ro',
11             isa => 'Bool',
12             default => 1,
13             documentation =>
14             'Should paragraph boundaries be preserved as sentence boundaries?'
15             . ' Paragraph boundary is defined as two or more consecutive newlines.',
16             );
17              
18             has use_lines => (
19             is => 'ro',
20             isa => 'Bool',
21             default => 0,
22             documentation =>
23             'Should newlines in the text be preserved as sentence boundaries?'
24             . '(But if you want to detect sentence boundaries just based on newlines'
25             . ' and nothing else, use rather W2A::SegmentOnNewlines.)',
26             );
27              
28             has limit_words => (
29             is => 'ro',
30             isa => 'Int',
31             default => 250,
32             documentation =>
33             'Should very long segments (longer than the given number of words) be split?'
34             . 'The number of words is only approximate; detected by counting whitespace only,'
35             . 'not by full tokenization. Set to zero to disable this function completely.',
36             );
37              
38             has detect_lists => (
39             is => 'ro',
40             isa => 'Int',
41             default => 100,
42             documentation =>
43             'Minimum (approx.) number of words to toggle list detection, 0 = never, 1 = always.'
44             );
45              
46             # Tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
47             # * single uppercase letters serve usually as first name initials
48             # * in langauge-specific descendants consider adding
49             # * period-ending items that never indicate sentence breaks
50             # * titles before names of persons etc.
51             #
52             # Note, that we cannot write
53             # sub get_unbreakers { return qr{...}; }
54             # because we want the regex to be compiled just once, not on every method call.
55 2     2   11743 my $UNBREAKERS = qr{\p{Upper}};
  2         5  
  2         27  
56              
57             sub unbreakers {
58 1     1 1 3 return $UNBREAKERS;
59             }
60              
61             # Characters that can appear after period (or other end-sentence symbol)
62             sub closings {
63 1     1 1 18 return '"”»)';
64             }
65              
66             # Characters that can appear before the first word of a sentence
67             sub openings {
68 1     1 1 5 return '"“«(';
69             }
70              
71             # Contextual rules for "un-breaking" (to be overridden)
72             sub apply_contextual_rules {
73 1     1 1 3 my ($self, $text) = @_;
74 1         2 return $text;
75             }
76              
77             sub get_segments {
78 1     1 1 2494 my ( $self, $text ) = @_;
79              
80             # Pre-processing
81 1         7 $text = $self->apply_contextual_rules($text);
82              
83 1         5 my $unbreakers = $self->unbreakers;
84 1         35 $text =~ s/\b($unbreakers)\./$1<<<DOT>>>/g;
85              
86             # two newlines usually separate paragraphs
87 1 50       188 if ( $self->use_paragraphs ) {
88 1         3 $text =~ s/([^.!?])\n\n+/$1<<<SEP>>>/gsm;
89             }
90              
91 1 50       40 if ( $self->use_lines ) {
92 0         0 $text =~ s/\n/<<<SEP>>>/gsm;
93             }
94              
95             # Normalize whitespaces
96 1         9 $text =~ s/\s+/ /gsm;
97              
98             # This is the main work
99 1         5 $text = $self->split_at_terminal_punctuation($text);
100              
101             # Post-processing
102 1         4 $text =~ s/<<<SEP>>>/\n/gsmx;
103 1         3 $text =~ s/<<<DOT>>>/./gsxm;
104 1         5 $text =~ s/\s+$//gsxm;
105 1         4 $text =~ s/^\s+//gsxm;
106              
107             # try to separate various list items (e.g. TV programmes, calendars)
108 1         4 my @segs = map { $self->split_at_list_items($_) } split /\n/, $text;
  2         7  
109              
110             # handle segments that are too long
111 1 50       3 return map { $self->segment_too_long($_) ? $self->handle_long_segment($_) : $_ } @segs;
  2         7  
112             }
113              
114             sub split_at_terminal_punctuation {
115 1     1 1 3 my ( $self, $text ) = @_;
116 1         5 my ( $openings, $closings ) = ( $self->openings, $self->closings );
117 1         35 $text =~ s{
118             ([.?!]) # $1 = end-sentence punctuation
119             ([$closings]?) # $2 = optional closing quote/bracket
120             \s # space
121             ([$openings]?\p{Upper}) # $3 = uppercase letter (optionally preceded by opening quote)
122             }{$1$2\n$3}gsxm;
123 1         94 return $text;
124             }
125              
126             sub handle_long_segment {
127 0     0 0 0 my ( $self, $seg ) = @_;
128              
129             # split at some other dividing punctuation characters (poems, unending speech)
130 0 0       0 my @split = map { $self->segment_too_long($_) ? $self->split_at_dividing_punctuation($_) : $_ } $seg;
  0         0  
131              
132             # split at any punctuation
133 0 0       0 @split = map { $self->segment_too_long($_) ? $self->split_at_any_punctuation($_) : $_ } @split;
  0         0  
134              
135             # split hard if still too long
136 0 0       0 return map { $self->segment_too_long($_) ? $self->split_hard($_) : $_ } @split;
  0         0  
137             }
138              
139             # Return 1 if the segment is too long
140             sub segment_too_long {
141 2     2 0 4 my ( $self, $seg ) = @_;
142              
143             # skip everything if the limit is infinity
144 2 50       77 return 0 if ( $self->limit_words == 0 );
145              
146             # return 1 if the number of space-separated segments exceeds the limit
147 2         11 my $wc = () = $seg =~ m/\s+/g;
148 2 50       77 return 1 if ( $wc >= $self->limit_words );
149 2         71 return 0;
150             }
151              
152             # "Non-final" punctuation that could divide segments (NB: single dot excluded due to abbreviations)
153             my $DIV_PUNCT = qr{(!|\.\.+|\?|\*|[–—-](\s*[–—-])+|;)};
154              
155             sub split_at_dividing_punctuation {
156 0     0 0 0 my ( $self, $text ) = @_;
157              
158 0         0 my $closings = $self->closings;
159 0         0 $text =~ s/($DIV_PUNCT\s*[$closings]?,?)/$1\n/g;
160              
161 0         0 return split /\n/, $self->_join_too_short_segments($text);
162             }
163              
164             # Universal list types (currently only semicolon-separated lists, to be overridden in language-specific blocks)
165             my $LIST_TYPES = [
166             {
167             name => ';', # a label for the list type (just for debugging)
168             sep => ';\h+', # separator regexp
169             sel_sep => undef, # separator regexp used only for the selection of this list (sep used if not set)
170             type => 'e', # type of separator (ending: e / staring: s)
171             max => 400, # maximum average list-item length (overrides the default)
172             min => 30, # minimum average list-item length (overrides the default)
173             # negative pre-context, not used if not set (here: skip semicolons separating just numbers)
174             neg_pre => '[0-9]\h*(?=;\h*[0-9]+(?:[^\.0-9]|\.[0-9]|$))',
175             },
176             ];
177              
178             # Language-specific blocks should override this method and provide usual list types for the given language
179             sub list_types {
180 0     0 0 0 return @{$LIST_TYPES};
  0         0  
181             }
182              
183             my $MAX_AVG_ITEM_LEN = 400; # default maximum average list item length, in characters
184             my $MIN_AVG_ITEM_LEN = 30; # default minimum average list item length, in characters
185             my $MIN_LIST_ITEMS = 3; # minimum number of items in a list
186             my $PRIORITY = 2.5; # multiple of list items a lower-rank list type must have over a higher-rank type
187              
188             sub split_at_list_items {
189              
190 2     2 0 4 my ( $self, $text ) = @_;
191              
192             # skip this if list detection is turned off
193 2 50       81 return $text if ( $self->detect_lists == 0 );
194              
195             # skip too short lines
196 2         11 my $wc = () = $text =~ m/\s+/g;
197 2 50       78 return $text if ( $self->detect_lists > $wc );
198              
199 0           my @list_types = $self->list_types;
200 0           my $sel_list_type;
201             my $sel_len;
202              
203             # find out which list type is the best for the given text
204 0           for ( my $i = 0; $i < @list_types; ++$i ) {
205              
206 0           my $cur_list_type = $list_types[$i];
207 0   0       my $sep = $cur_list_type->{sel_sep} || $cur_list_type->{sep};
208 0           my $neg = $cur_list_type->{neg_pre};
209 0   0       my $min = $cur_list_type->{min} || $MIN_AVG_ITEM_LEN;
210 0   0       my $max = $cur_list_type->{max} || $MAX_AVG_ITEM_LEN;
211              
212 0           my $items = () = $text =~ m/$sep/gi;
213              
214             # count number of items; exclude negative pre-context matches, if negative pre-context is specified
215 0           my $false = 0;
216 0 0         $false = () = $text =~ m/$neg(?=$sep)/gi if ($neg);
217 0           $items -= $false;
218              
219 0 0         my $len = $items > 0 ? ( length($text) / $items ) : 'NaN';
220              
221             # test if this type overrides the previously set one
222 0 0 0       if ( $items >= $MIN_LIST_ITEMS && $len < $max && $len > $min && ( !$sel_len || $len * $PRIORITY < $sel_len ) ) {
      0        
      0        
      0        
223 0           $sel_list_type = $cur_list_type;
224 0           $sel_len = $len;
225             }
226             }
227              
228             # return if no list type found
229 0 0         return $text if ( !$sel_list_type );
230              
231             # list type detected, split by the given list type
232 0           my $sep = $sel_list_type->{sep};
233 0           my $neg = $sel_list_type->{neg_pre};
234 0           my $name = $sel_list_type->{name};
235              
236             # protect negative pre-context, if any is specified
237 0 0         $text =~ s/($neg)(?=$sep)/$1<<<NEG>>>/gi if ($neg);
238              
239             # split at the given list type
240 0 0         if ( $sel_list_type->{type} eq 'e' ) {
241 0           $text =~ s/(?<!<<<NEG>>>)($sep)/$1\n/gi;
242             }
243             else {
244 0           $text =~ s/(?<!<<<NEG>>>)($sep)/\n$1/gi;
245             }
246              
247             # remove negative pre-context protection
248 0           $text =~ s/<<<NEG>>>//g;
249              
250             # delete too short splits
251 0           $text = $self->_join_too_short_segments($text);
252              
253             # return the split result
254 0           return split /\n/, $text;
255             }
256              
257             sub _join_too_short_segments {
258 0     0     my ( $self, $text ) = @_;
259              
260 0           $text =~ s/^\n//;
261 0           $text =~ s/\n$//;
262 0           $text =~ s/\n(?=\h*(\S+(\h+\S+){0,2})?\h*(\n|$))/ /g;
263 0           return $text;
264             }
265              
266             sub split_at_any_punctuation {
267 0     0 0   my ( $self, $text ) = @_;
268              
269 0           my $closings = $self->closings;
270              
271             # prefer punctuation followed by a letter
272 0           $text =~ s/([,;!?–—-]+\s*[$closings]?)\s+(\p{Alpha})/$1\n$2/g;
273              
274             # delete too short splits
275 0           $text = $self->_join_too_short_segments($text);
276              
277 0           my @split = split /\n/, $text;
278              
279             # split at any punctuation if the text is still too long
280             return map {
281 0 0         $_ =~ s/([,;!?–—-]+\s*[$closings]?)/$1\n/g if ( $self->segment_too_long($_) );
  0            
282 0           split /\n/, $self->_join_too_short_segments($_)
283             } @split;
284             }
285              
286             sub split_hard {
287 0     0 0   my ( $self, $text ) = @_;
288              
289 0           my @tokens = split /(\s+)/, $text;
290 0           my @result;
291 0           my $pos = 0;
292              
293 0           while ( $pos < @tokens ) {
294 0           my $limit = $pos + $self->limit_words * 2 - 1;
295 0 0         $limit = @tokens - 1 if ( $limit > @tokens - 1 );
296 0           push @result, join( '', @tokens[ $pos .. $limit ] );
297 0           $pos = $limit + 1;
298             }
299 0           return @result;
300             }
301              
302             1;
303              
304             __END__
305              
306             =encoding utf-8
307              
308             =head1 NAME
309              
310             Treex::Tool::Segment::RuleBased - Rule based pseudo language-independent sentence segmenter
311              
312             =head1 VERSION
313              
314             version 2.20151102
315              
316             =head1 DESCRIPTION
317              
318             Sentence boundaries are detected based on a regex rules
319             that detect end-sentence punctuation ([.?!]) followed by a uppercase letter.
320             This class is implemented in a pseudo language-independent way,
321             but it can be used as an ancestor for language-specific segmentation
322             by overriding the method C<segment_text>
323             (using C<around> see L<Moose::Manual::MethodModifiers>)
324             or just by overriding methods C<unbreakers>, C<openings> and C<closings>.
325              
326             See L<Treex::Block::W2A::EN::Segment>
327              
328             =head1 METHODS
329              
330             =over 4
331              
332             =item get_segments
333              
334             Returns list of sentences
335              
336             =back
337              
338             =head1 METHODS TO OVERRIDE
339              
340             =over 4
341              
342             =item segment_text
343              
344             Do the segmentation (handling C<use_paragraphs> and C<use_lines>)
345              
346             =item $text = split_at_terminal_punctuation($text)
347              
348             Adds newlines after terminal punctuation followed by an uppercase letter.
349              
350             =item $text = apply_contextual_rules($text)
351              
352             Add unbreakers (C<E<lt>E<lt>E<lt>DOTE<gt>E<gt>E<gt>>) and hard breaks (C<\n>) using the whole context, not
353             just a single word.
354              
355             =item unbreakers
356              
357             Returns regex that should match tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
358             * single uppercase letters serve usually as first name initials
359             * in language-specific descendants consider adding:
360             * period-ending items that never indicate sentence breaks
361             * titles before names of persons etc.
362              
363             =item openings
364              
365             Returns string with characters that can appear before the first word of a sentence
366              
367             =item closings
368              
369             Returns string with characters that can appear after period (or other end-sentence symbol)
370              
371             =back
372              
373             =head1 AUTHOR
374              
375             Martin Popel <popel@ufal.mff.cuni.cz>
376              
377             OndÅ™ej DuÅ¡ek <odusek@ufal.mff.cuni.cz>
378              
379             =head1 COPYRIGHT AND LICENSE
380              
381             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
382              
383             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.