File Coverage

blib/lib/Treex/Block/W2A/ResegmentSentences.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package Treex::Block::W2A::ResegmentSentences;
2             $Treex::Block::W2A::ResegmentSentences::VERSION = '0.13095';
3 1     1   13147 use strict;
  1         2  
  1         25  
4 1     1   7 use warnings;
  1         1  
  1         19  
5 1     1   736 use Moose;
  0            
  0            
6             use Treex::Core::Common;
7             extends 'Treex::Core::Block';
8              
9             has remove => (
10             is => 'ro',
11             isa => enum( [qw(no all diff)] ),
12             default => 'no',
13             documentation => 'remove=no ... Do not delete any bundles (default). '
14             . 'remove=all ... Delete bundles with multiple subsegments. '
15             . 'remove=diff ... Delete bundles with zones with different number of subsegments.',
16             );
17              
18             has 'segmenters' => (
19             is => 'rw',
20             isa => 'HashRef[Treex::Tool::Segment::RuleBased]',
21             default => sub { return {} },
22             );
23              
24             sub _get_segmenter {
25             my $self = shift;
26             my $lang = uc shift;
27             if ( exists $self->segmenters->{$lang} ) {
28             return $self->segmenters->{$lang};
29             }
30             my $specific = "Treex::Tool::Segment::${lang}::RuleBased";
31             my $fallback = "Treex::Tool::Segment::RuleBased";
32             foreach my $class ( $specific, $fallback ) {
33             my $segmenter = eval "use $class; $class->new()"; ##no critic (BuiltinFunctions::ProhibitStringyEval) We want to use it, it is simpler and we check result
34             if ($segmenter) {
35             $self->segmenters->{$lang} = $segmenter;
36             return $segmenter;
37             }
38             else {
39             log_info("Failed during creating segmenter $class: $@");
40             }
41             }
42             log_fatal("Cannot create segmenter for $lang");
43             }
44              
45             sub process_bundle {
46             my ( $self, $bundle ) = @_;
47              
48             my $my_label = $self->zone_label || '';
49             my %sentences;
50             my ( $my_segments, $max_segments ) = ( 0, 0 );
51             foreach my $zone ( $bundle->get_all_zones() ) {
52             my $lang = $zone->language;
53             my $label = $zone->get_label();
54             my $segmenter = $self->_get_segmenter($lang);
55             $sentences{$label} = [ $segmenter->get_segments( $zone->sentence ) ];
56             my $segments = @{ $sentences{$label} };
57             if ( $segments > $max_segments ) { $max_segments = $segments; }
58             if ( $label eq $my_label ) { $my_segments = $segments; }
59             }
60              
61             # If no language (and selector) were specified for this block
62             # resegment all zones
63             if ( $my_segments == 0 ) {
64             $my_segments = $max_segments;
65             }
66              
67             # We are finished if
68             # the zone to be processed contains just one sentence.
69             return if $my_segments == 1;
70            
71             # So we have more subsegments. Delete the bundle and exit if requested.
72             if ($self->remove eq 'all'){
73             $bundle->remove();
74             return;
75             }
76              
77             # TODO: If a zone contains less subsegments (e.g. just 1) than $segments
78             # we can try to split it to equally long chunks regardless of the real
79             # sentence boundaries. Anyway, all evaluation blocks should join the
80             # segments together again before measuring BLEU etc.
81             my $doc = $bundle->get_document;
82             my $orig_id = $bundle->id;
83             my $last_bundle = $bundle;
84             my @labels = keys %sentences;
85              
86             # If any zone has different number of subsegments than $my_segments
87             # and the user requested to delete such bundles, do it and exit.
88             if ($self->remove eq 'diff'){
89             if (any {$_ != $my_segments} map {scalar @{$sentences{$_}}} @labels) {
90             $bundle->remove();
91             return;
92             }
93             }
94              
95             # First subsegment will be saved into the original bundle (with renamed id)
96             $bundle->set_id("${orig_id}_1of$my_segments");
97             foreach my $zone ( $bundle->get_all_zones() ) {
98             my $label = $zone->get_label();
99             my $sent = shift @{ $sentences{$label} };
100             $zone->set_sentence($sent);
101             }
102              
103             # Other subsegments will be saved to new bundles
104             for my $i ( 2 .. $my_segments ) {
105             my $new_bundle = $doc->create_bundle( { after => $last_bundle } );
106             $last_bundle = $new_bundle;
107             $new_bundle->set_id("${orig_id}_${i}of$my_segments");
108             foreach my $label (@labels) {
109             my $sent = shift @{ $sentences{$label} };
110             if ( !defined $sent ) { $sent = ' '; }
111              
112             # If some zone contains more segments than the "current" zone,
113             # the remaining segments will be joined to the last bundle.
114             if ( $i == $my_segments && $max_segments > $my_segments ) {
115             $sent .= ' ' . join( ' ', @{ $sentences{$label} } );
116             }
117             my ( $lang, $selector ) = split /_/, $label;
118             my $new_zone = $new_bundle->create_zone( $lang, $selector );
119             $new_zone->set_sentence($sent);
120             }
121             }
122              
123             return;
124             }
125              
126             1;
127              
128             __END__