File Coverage

blib/lib/Treex/Block/W2A/ResegmentSentences.pm

Criterion	Covered	Total	%
statement	39	100	39.0
branch	4	22	18.1
condition	0	5	0.0
subroutine	11	13	84.6
pod	1	1	100.0
total	55	141	39.0

line	stmt	bran	cond	sub	pod	time	code
1							package Treex::Block::W2A::ResegmentSentences;
2							$Treex::Block::W2A::ResegmentSentences::VERSION = '2.20151102';
3	1			1		20553	use strict;
	1					2
	1					25
4	1			1		5	use warnings;
	1					2
	1					25
5	1			1		810	use Moose;
	1					472353
	1					6
6	1			1		7791	use Treex::Core::Common;
	1					596312
	1					5
7							extends 'Treex::Core::Block';
8
9							has remove => (
10							is => 'ro',
11							isa => enum( [qw(no all diff)] ),
12							default => 'no',
13							documentation => 'remove=no ... Do not delete any bundles (default). '
14							. 'remove=all ... Delete bundles with multiple subsegments. '
15							. 'remove=diff ... Delete bundles with zones with different number of subsegments.',
16							);
17
18							has 'segmenters' => (
19							is => 'rw',
20							isa => 'HashRef[Treex::Tool::Segment::RuleBased]',
21							default => sub { return {} },
22							);
23
24							sub _get_segmenter {
25	9			9		5280	my $self = shift;
26	9					18	my $lang = uc shift;
27	9	100				400	if ( exists $self->segmenters->{$lang} ) {
28	6					245	return $self->segmenters->{$lang};
29							}
30	3					10	my $specific = "Treex::Tool::Segment::${lang}::RuleBased";
31	3					5	my $fallback = "Treex::Tool::Segment::RuleBased";
32	3					6	foreach my $class ( $specific, $fallback ) {
33	6					83	my $segmenter;
34							{
35	6					8	local $SIG{"__DIE__"};
	6					21
36	1			1		456	$segmenter = eval "use $class; $class->new()"; ##no critic (BuiltinFunctions::ProhibitStringyEval) We want to use it, it is simpler and we check result
	0			1		0
	0			1		0
	1			1		773
	1			1		2
	1			1		38
	1					433
	0					0
	0					0
	1					6
	1					2
	1					23
	1					374
	0					0
	0					0
	1					7
	1					2
	1					29
	6					498
37							}
38	6	100				5368	if ($segmenter) {
39	3					130	$self->segmenters->{$lang} = $segmenter;
40	3					17	return $segmenter;
41							}
42							else {
43	3					20	log_info("Failed during creating segmenter $class: $@");
44							}
45							}
46	0						log_fatal("Cannot create segmenter for $lang");
47							}
48
49							sub process_bundle {
50	0			0	1		my ( $self, $bundle ) = @_;
51
52	0		0				my $my_label = $self->zone_label \|\| '';
53	0						my %sentences;
54	0						my ( $my_segments, $max_segments ) = ( 0, 0 );
55	0						foreach my $zone ( $bundle->get_all_zones() ) {
56	0						my $lang = $zone->language;
57	0						my $label = $zone->get_label();
58	0						my $segmenter = $self->_get_segmenter($lang);
59	0						$sentences{$label} = [ $segmenter->get_segments( $zone->sentence ) ];
60	0						my $segments = @{ $sentences{$label} };
	0
61	0	0					if ( $segments > $max_segments ) { $max_segments = $segments; }
	0
62	0	0					if ( $label eq $my_label ) { $my_segments = $segments; }
	0
63							}
64
65							# If no language (and selector) were specified for this block
66							# resegment all zones
67	0	0					if ( $my_segments == 0 ) {
68	0						$my_segments = $max_segments;
69							}
70
71							# We are finished if
72							# the zone to be processed contains just one sentence.
73	0	0					return if $my_segments == 1;
74
75							# So we have more subsegments. Delete the bundle and exit if requested.
76	0	0					if ($self->remove eq 'all'){
77	0						$bundle->remove();
78	0						return;
79							}
80
81							# TODO: If a zone contains less subsegments (e.g. just 1) than $segments
82							# we can try to split it to equally long chunks regardless of the real
83							# sentence boundaries. Anyway, all evaluation blocks should join the
84							# segments together again before measuring BLEU etc.
85	0						my $doc = $bundle->get_document;
86	0						my $orig_id = $bundle->id;
87	0						my $last_bundle = $bundle;
88	0						my @labels = keys %sentences;
89
90							# If any zone has different number of subsegments than $my_segments
91							# and the user requested to delete such bundles, do it and exit.
92	0	0					if ($self->remove eq 'diff'){
93	0	0		0			if (any {$_ != $my_segments} map {scalar @{$sentences{$_}}} @labels) {
	0
	0
	0
94	0						$bundle->remove();
95	0						return;
96							}
97							}
98
99							# First subsegment will be saved into the original bundle (with renamed id)
100	0						$bundle->set_id("${orig_id}_1of$my_segments");
101	0						foreach my $zone ( $bundle->get_all_zones() ) {
102	0						my $label = $zone->get_label();
103	0						my $sent = shift @{ $sentences{$label} };
	0
104	0						$zone->set_sentence($sent);
105							}
106
107							# Other subsegments will be saved to new bundles
108	0						for my $i ( 2 .. $my_segments ) {
109	0						my $new_bundle = $doc->create_bundle( { after => $last_bundle } );
110	0						$last_bundle = $new_bundle;
111	0						$new_bundle->set_id("${orig_id}_${i}of$my_segments");
112	0						foreach my $label (@labels) {
113	0						my $sent = shift @{ $sentences{$label} };
	0
114	0	0					if ( !defined $sent ) { $sent = ' '; }
	0
115
116							# If some zone contains more segments than the "current" zone,
117							# the remaining segments will be joined to the last bundle.
118	0	0	0				if ( $i == $my_segments && $max_segments > $my_segments ) {
119	0						$sent .= ' ' . join( ' ', @{ $sentences{$label} } );
	0
120							}
121	0						my ( $lang, $selector ) = split /_/, $label;
122	0						my $new_zone = $new_bundle->create_zone( $lang, $selector );
123	0						$new_zone->set_sentence($sent);
124							}
125							}
126
127	0						return;
128							}
129
130							1;
131
132							__END__
133
134							=encoding utf-8
135
136							=head1 NAME
137
138							Treex::Block::W2A::ResegmentSentences - split bundles which contain more sentences
139
140							=head1 VERSION
141
142							version 2.20151102
143
144							=head1 MOTIVATION
145
146							Some resources (most notably WMT newstest) are segmented to chunks of text
147							which mostly correspond to sentences, but sometimes contain more than one sentence.
148							Sometimes we want to process such documents in Treex and output (Write::*)
149							the result in a format where one output segement correspond to one input segement.
150							(So e.g. for "one-sentence-per-line writers", we have the same number of input and output lines.)
151
152							However, most Treex blocks expect exactly one (linguistic) sentence in each bundle.
153							The solution is to use block C<W2A::ResegmentSentences> after the reader
154							and C<Misc::JoinBundles> before the writer.
155
156							=head1 DESCRIPTION
157
158							If the sentence segmenter says that the current sentence is
159							actually composed of two or more sentences, then new bundles
160							are inserted after the current bundle, each containing just
161							one piece of the resegmented original sentence.
162
163							This block should be executed before tokenization (and tagging etc).
164							It deals only with the (string) attribute C<sentence> in each zone,
165							it does not process any trees.
166
167							All zones are processed.
168							The number of bundles created is determined by the number of subsegments
169							in the "current" zone (specified by the parameters C<language> and C<selector>).
170							If a zone contains less subsegments than the current one,
171							the remaining bundles will contain empty sentence.
172							If a zone contains more subsegments than the current one,
173							the remaining subsegments will be joined in the last bundle.
174
175							In other words, it is granted that the current zone,
176							will not contain empty sentences.
177
178							As a special case if parameters C<language> and C<selector> define a zone
179							which is not present in a bundle (this holds also for language=all),
180							the "current" zone is the one with most subsegments, i.e. no subsegments are joined.
181
182							=head1 PARAMETERS
183
184							=head2 remove (no\|all\|diff)
185							By setting parameter C<remove> you can delete some bundles.
186							Default is remove=no.
187							Setting remove=all will delete all bundles with more than one subsegments in the current zone.
188							Setting remove=diff will delete all bundles that have (at least) two zones with different number of subsegments.
189
190							=head1 SEE ALSO
191
192							L<Treex::Block::Misc::JoinBundles>
193
194							=head1 AUTHOR
195
196							ZdenÄ›k Å½abokrtskÃ½ <zabokrtsky@ufal.mff.cuni.cz>
197
198							Martin Popel <popel@ufal.mff.cuni.cz>
199
200							=head1 COPYRIGHT AND LICENSE
201
202							Copyright Â© 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
203
204							This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.