File Coverage

blib/lib/Treex/Tool/Segment/RuleBased.pm

Criterion	Covered	Total	%
statement	5	7	71.4
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	8	10	80.0

line	stmt	sub	time	code
1				package Treex::Tool::Segment::RuleBased;
2				BEGIN {
3	1	1	24222	$Treex::Tool::Segment::RuleBased::VERSION = '0.08170';
4				}
5	1	1	1310	use utf8;
	1		11
	1		6
6	1	1	412	use Moose;
	0
	0
7				use Treex::Core::Common;
8
9				has use_paragraphs => (
10				is => 'ro',
11				isa => 'Bool',
12				default => 1,
13				documentation =>
14				'Should paragraph boundaries be preserved as sentence boundaries?'
15				. ' Paragraph boundary is defined as two or more consecutive newlines.',
16				);
17
18				has use_lines => (
19				is => 'ro',
20				isa => 'Bool',
21				default => 0,
22				documentation =>
23				'Should newlines in the text be preserved as sentence boundaries?'
24				. '(But if you want to detect sentence boundaries just based on newlines'
25				. ' and nothing else, use rather W2A::SegmentOnNewlines.)',
26				);
27
28				# Tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
29				# * single uppercase letters serve usually as first name initials
30				# * in language-specific descendants consider adding
31				# * period-ending items that never indicate sentence breaks
32				# * titles before names of persons etc.
33				#
34				# Note that we cannot write
35				# sub get_unbreakers { return qr{...}; }
36				# because we want the regex to be compiled just once, not on every method call.
37				my $UNBREAKERS = qr{\p{Upper}};
38
39				sub unbreakers {
40				return $UNBREAKERS;
41				}
42
43				# Characters that can appear after period (or other end-sentence symbol)
44				sub closings {
45				return '"”»)';
46				}
47
48				# Characters that can appear before the first word of a sentence
49				sub openings {
50				return '"“«(';
51				}
52
53				sub get_segments {
54				my ( $self, $text ) = @_;
55
56				# Pre-processing
57				my $unbreakers = $self->unbreakers;
58				$text =~ s/\b($unbreakers)\./$1<<<DOT>>>/g;
59
60				# two newlines usually separate paragraphs
61				if ( $self->use_paragraphs ) {
62				$text =~ s/([^.!?])\n\n+/$1<<<SEP>>>/gsm;
63				}
64
65				if ( $self->use_lines ) {
66				$text =~ s/\n/<<<SEP>>>/gsm;
67				}
68
69				# Normalize whitespaces
70				$text =~ s/\s+/ /gsm;
71
72				# This is the main work
73				$text = $self->split_at_terminal_punctuation($text);
74
75				# Post-processing
76				$text =~ s/<<<SEP>>>/\n/gsmx;
77				$text =~ s/<<<DOT>>>/./gsxm;
78				$text =~ s/\s+$//gsxm;
79				$text =~ s/^\s+//gsxm;
80
81				return split /\n/, $text;
82				}
83
84				sub split_at_terminal_punctuation {
85				my ( $self, $text ) = @_;
86				my ( $openings, $closings ) = ( $self->openings, $self->closings );
87				$text =~ s{
88				([.?!]) # $1 = end-sentence punctuation
89				([$closings]?) # $2 = optional closing quote/bracket
90				\s # space
91				([$openings]?\p{Upper}) # $3 = uppercase letter (optionally preceded by opening quote)
92				}{$1$2\n$3}gsxm;
93				return $text;
94				}
95
96				1;
97
98				__END__
99
100				=encoding utf-8
101
102				=head1 NAME
103
104				Treex::Tool::Segment::RuleBased - Rule based pseudo language-independent sentence segmenter
105
106				=head1 VERSION
107
108				version 0.08170
109
110				=head1 DESCRIPTION
111
112				Sentence boundaries are detected based on regex rules
113				that detect end-sentence punctuation ([.?!]) followed by an uppercase letter.
114				This class is implemented in a pseudo language-independent way,
115				but it can be used as an ancestor for language-specific segmentation
116				by overriding the method C<get_segments>
117				(using C<around> see L<Moose::Manual::MethodModifiers>)
118				or just by overriding methods C<unbreakers>, C<openings> and C<closings>.
119
120				See L<Treex::Block::W2A::EN::Segment>
121
122				=head1 METHODS
123
124				=over 4
125
126				=item get_segments
127
128				Returns list of sentences
129
130				=back
131
132				=head1 METHODS TO OVERRIDE
133
134				=over 4
135
136				=item get_segments
137
138				Do the segmentation (handling C<use_paragraphs> and C<use_lines>)
139
140				=item $text = split_at_terminal_punctuation($text)
141
142				Adds newlines after terminal punctuation followed by an uppercase letter.
143
144				=item unbreakers
145
146				Returns regex that should match tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
147				* single uppercase letters serve usually as first name initials
148				* in language-specific descendants consider adding
149				* period-ending items that never indicate sentence breaks
150				* titles before names of persons etc.
151
152				=item openings
153
154				Returns string with characters that can appear before the first word of a sentence
155
156				=item closings
157
158				Returns string with characters that can appear after period (or other end-sentence symbol)
159
160				=back
161
162				=head1 AUTHOR
163
164				Martin Popel <popel@ufal.mff.cuni.cz>
165
166				=head1 COPYRIGHT AND LICENSE
167
168				Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
169
170				This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.