File Coverage

blib/lib/Treex/Core/Block.pm
Criterion Covered Total %
statement 49 213 23.0
branch 8 76 10.5
condition 0 18 0.0
subroutine 11 25 44.0
pod 7 12 58.3
total 75 344 21.8


line stmt bran cond sub pod time code
1             package Treex::Core::Block;
2             $Treex::Core::Block::VERSION = '2.20150928';
3 2     2   20727 use Moose;
  2         460569  
  2         17  
4 2     2   12171 use Treex::Core::Common;
  2         7  
  2         15  
5 2     2   10536 use Treex::Core::Resource;
  2         4  
  2         99  
6 2     2   10 use Digest::MD5 qw(md5_hex);
  2         4  
  2         100  
7 2     2   10 use Storable;
  2         5  
  2         113  
8 2     2   10 use Time::HiRes;
  2         4  
  2         22  
9 2     2   1755 use App::whichpm 'which_pm';
  2         1013  
  2         110  
10 2     2   10 use Readonly;
  2         4  
  2         92  
11 2     2   10 use List::MoreUtils qw(uniq);
  2         5  
  2         26  
12              
13             has selector => ( is => 'ro', isa => 'Str', default => '' );
14             has language => ( is => 'ro', isa => 'Str', default => 'all' );
15              
16             has scenario => (
17             is => 'ro',
18             isa => 'Treex::Core::Scenario',
19             writer => '_set_scenario',
20             weak_ref => 1,
21             );
22              
23             has select_bundles => (
24             is => 'ro',
25             default => 0,
26             documentation => 'apply process_bundle only on the specified bundles,'
27             . ' e.g. "1-4,6,8-12". The default is 0 which means all bundles. Useful for debugging.',
28             );
29              
30             has if_missing_zone => (
31             is => 'ro',
32             isa => enum( [qw(fatal warn ignore create)] ),
33             default => 'fatal',
34             documentation => 'What to do if process_zone is to be called on a zone'
35             . ' (specified by parameters language and selector) that is missing in a given bundle?',
36             );
37              
38             has if_missing_tree => (
39             is => 'ro',
40             isa => enum( [qw(fatal warn ignore create)] ),
41             default => 'fatal',
42             documentation => 'What to do if process_[atnp]tree is to be called on a tree'
43             . ' that is missing in a given zone?',
44             );
45              
46             has if_missing_bundles => (
47             is => 'ro',
48             isa => enum( [qw(fatal warn ignore)] ),
49             default => 'fatal',
50             documentation => 'What to do if process_document is to be called on a document'
51             . ' with no bundles?',
52             );
53              
54              
55              
56             has report_progress => (
57             is => 'ro',
58             isa => 'Str',
59             default => 0,
60             documentation => 'Report which bundle (TODO: zone,tree,node) is being processed via log_info. Useful for debugging.',
61             );
62              
63             has [qw(_is_bundle_selected _is_language_selected _is_selector_selected)] => ( is => 'rw' );
64              
65             has _hash => ( is => 'rw', isa => 'Str' );
66              
67             has is_started => ( is => 'ro', isa => 'Bool', writer => '_set_is_started', default => 0 );
68              
69             Readonly our $DOCUMENT_PROCESSED => 1;
70             Readonly our $DOCUMENT_FROM_CACHE => 2;
71              
72              
73             # For load_other_block() and get_or_load_other_block()
74             # TODO this could also be in Scenario instead of Block...
75              
76             # new other block of same name replaces old one here
77             has _loaded_other_blocks => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
78             # all loaded other blocks, no replacing
79             has _loaded_other_blocks_array => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
80              
81             sub zone_label {
82 0     0 1 0 my ($self) = @_;
83 0 0       0 my $label = $self->language or return;
84 0 0 0     0 if ( defined $self->selector && $self->selector ne '' ) {
85 0         0 $label .= '_' . $self->selector;
86             }
87 0         0 return $label;
88             }
89              
90             # TODO
91             # has robust => ( is=> 'ro', isa=>'Bool', default=>0,
92             # documentation=>'no fatal errors in robust mode');
93              
94             sub BUILD {
95 8     8 0 15155 my $self = shift;
96              
97 8 50       304 if ( $self->select_bundles ) {
98 0 0       0 log_fatal 'select_bundles=' . $self->select_bundles . ' does not match /^\d+(-\d+)?(,\d+(-\d+)?)*$/'
99             if $self->select_bundles !~ /^\d+(-\d+)?(,\d+(-\d+)?)*$/;
100 0         0 my %selected;
101 0         0 foreach my $span ( split /,/, $self->select_bundles ) {
102 0 0       0 if ( $span =~ /(\d+)-(\d+)/ ) {
103 0         0 @selected{ $1 .. $2 } = ( $1 .. $2 );
104             }
105             else {
106 0         0 $selected{$span} = 1;
107             }
108             }
109 0         0 $self->_set_is_bundle_selected( \%selected );
110             }
111              
112 8 100       268 if ( $self->language ne 'all' ) {
113 7         211 my @codes = split /,/, $self->language;
114 7         16 my %selected;
115 7         19 for my $code (@codes) {
116 7 50       49 log_fatal "'$code' is not a valid ISO 639-1 language code"
117             if !Treex::Core::Types::is_lang_code($code);
118 7         27 $selected{$code} = 1;
119             }
120 7         293 $self->_set_is_language_selected( \%selected );
121             }
122              
123 8 50       284 if ( $self->selector ne 'all' ) {
124 8 100       279 if ( $self->selector eq '' ) {
125 7         323 $self->_set_is_selector_selected( { q{} => 1 } );
126             }
127             else {
128 1         41 my @selectors = split /,/, $self->selector;
129 1         4 my %selected;
130 1         4 for my $selector (@selectors) {
131 1 50       9 log_fatal "'$selector' is not a valid selector name"
132             if $selector !~ /^[a-z\d]*$/i;
133 1         5 $selected{$selector} = 1;
134             }
135 1         50 $self->_set_is_selector_selected( \%selected );
136             }
137             }
138              
139 8         29 return;
140             }
141              
142             sub _compute_hash {
143 0     0     my $self = shift;
144              
145 0           my $md5 = Digest::MD5->new();
146              
147             # compute block parameters hash
148 0           my $params_str = "";
149             map {
150 0           $params_str .= $_ . "=" . $self->{$_};
151              
152             # log_warn("\t\t" . $_ . "=" . $self->{$_} . " - " . ref($self->{$_}));
153             }
154             sort # in canonical form
155 0           grep { !ref( $self->{$_} ) } # no references
156 0           grep { defined( $self->{$_} ) } # value has to be defined
157 0           grep { !/(scenario|block)/ }
158 0           keys %{$self};
  0            
159            
160             # Digest::MD5 cannot handle Unicode strings (it dies with "Wide character in subroutine entry")
161 2     2   3207 use Encode;
  2         5  
  2         5625  
162 0           $md5->add(Encode::encode_utf8($params_str));
163              
164             # compute block source code hash
165 0           my ( $block_filename, $block_version ) = which_pm( $self->get_block_name() );
166 0 0         open( my $block_fh, "<", $block_filename ) or log_fatal("Can't open '$block_filename': $!");
167 0           binmode($block_fh);
168 0           $md5->addfile($block_fh);
169 0           close($block_fh);
170              
171 0           $self->_set_hash( $md5->hexdigest );
172              
173 0           return;
174             }
175              
176             sub get_hash {
177 0     0 0   my $self = shift;
178 0 0         if (!$self->_hash){
179 0           $self->_compute_hash();
180             }
181 0           return $self->_hash;
182             }
183              
184             sub require_files_from_share {
185 0     0 1   my ( $self, @rel_paths ) = @_;
186 0           my $my_name = 'the block ' . $self->get_block_name();
187             return map {
188 0           log_info $self->get_block_name() . " requires file " . $_;
  0            
189 0           Treex::Core::Resource::require_file_from_share( $_, $my_name )
190             } @rel_paths;
191             }
192              
193             sub get_required_share_files {
194 0     0 1   my ($self) = @_;
195              
196             # By default there are no required share files.
197             # The purpose of this method is to be overriden if needed.
198 0           return ();
199             }
200              
201             sub process_document {
202 0     0 1   my $self = shift;
203 0           my ($document) = pos_validated_list(
204             \@_,
205             { isa => 'Treex::Core::Document' },
206             );
207              
208 0 0 0       if ( !$document->get_bundles() && $self->if_missing_bundles =~ /fatal|warn/){
209 0           my $message = "There are no bundles in the document and block " . $self->get_block_name() .
210             " doesn't override the method process_document. You can use prepend 'Util::SetGlobal if_missing_bundles=ignore' to allow processing empty documents. ";
211 0 0         log_fatal($message) if $self->if_missing_bundles eq 'fatal';
212 0           log_warn($message);
213             }
214              
215 0           my $bundleNo = 1;
216 0           foreach my $bundle ( $document->get_bundles() ) {
217 0 0 0       if ( !$self->select_bundles || $self->_is_bundle_selected->{$bundleNo} ) {
218 0           $self->process_bundle( $bundle, $bundleNo );
219             }
220 0           $bundleNo++;
221             }
222 0           return 1;
223             }
224              
225             sub _apply_function_on_each_zone {
226 0     0     my ($self, $doc, $function, @function_params) = @_;
227 0           my %zones;
228              
229             # When using "all", we must collect the zones used in the whole document.
230 0 0 0       if ($self->language eq 'all' || $self->selector eq 'all'){
231 0           foreach my $bundle ($doc->get_bundles){
232 0           foreach my $zone ($bundle->get_all_zones()){
233 0           $zones{$zone->get_label()} = 1;
234             }
235             }
236             }
237             # Otherwise, we can make a Cartesian product of lang(uage)s and sel(ector)s
238             else {
239 0           foreach my $lang (keys %{$self->_is_language_selected}){
  0            
240 0           foreach my $sel (keys %{$self->_is_selector_selected}){
  0            
241 0           $zones{$lang . '_' . $sel} = 1;
242             }
243             }
244             }
245              
246 0           my $orig_language = $self->language;
247 0           my $orig_selector = $self->selector;
248 0           foreach my $label (keys %zones){
249 0           my ($lang, $sel) = split /_/, $label;
250              
251             # pretend this block was called with only this one language and selector
252 0           $self->{language} = $lang;
253 0           $self->{selector} = $sel;
254 0           $function->(@function_params);
255             }
256 0           $self->{language} = $orig_language;
257 0           $self->{selector} = $orig_selector;
258 0           return;
259             }
260              
261             sub process_bundle {
262 0     0 1   my ( $self, $bundle, $bundleNo ) = @_;
263 0 0         if ($self->report_progress){
264 0           log_info "Processing bundle $bundleNo";
265             }
266              
267 0           my @zones = $bundle->get_all_zones();
268              
269 0 0         if ($self->if_missing_zone eq 'create') {
270 0           my (@langs, @sels);
271 0 0         if ($self->language eq 'all') {
272 0           @langs = uniq map{$_->language} @zones;
  0            
273             } else {
274 0           @langs = keys %{$self->_is_language_selected};
  0            
275             }
276 0 0         if ($self->selector eq 'all') {
277 0           @sels = uniq map{$_->selector} @zones;
  0            
278             } else {
279 0           @sels = keys %{$self->_is_selector_selected};
  0            
280             }
281            
282             # Cartesian product of lang(uage)s and sel(ector)s
283 0           @zones = map {my $l = $_; map{$bundle->get_or_create_zone($l, $_)} @sels} @langs;
  0            
  0            
  0            
284             } else {
285 0           @zones = $self->get_selected_zones(@zones);
286             }
287            
288 0 0 0       if (!@zones && $self->if_missing_zone =~ /fatal|warn/) {
289 0           my $message = "No zone (language="
290             . $self->language
291             . ", selector="
292             . $self->selector
293             . ") was found in a bundle and block " . $self->get_block_name()
294             . " doesn't override the method process_bundle";
295 0 0         log_fatal($message) if $self->if_missing_zone eq 'fatal';
296 0           log_warn($message);
297             }
298              
299 0           foreach my $zone (@zones) {
300 0           $self->process_zone( $zone, $bundleNo );
301             }
302 0           return;
303             }
304              
305             sub get_selected_zones {
306 0     0 0   my ( $self, @zones ) = @_;
307 0 0         if ( $self->language ne 'all') {
308 0           @zones = grep { $self->_is_language_selected->{ $_->language } } @zones;
  0            
309             }
310 0 0         if ( $self->selector ne 'all') {
311 0           @zones = grep { $self->_is_selector_selected->{ $_->selector } } @zones;
  0            
312             }
313              
314 0           return @zones;
315             }
316              
317             sub _try_process_layer {
318 0     0     my ( $self, $zone, $layer, $bundleNo ) = @_;
319 0           my $meta = $self->meta;
320              
321 0 0         if ( my $m = $meta->find_method_by_name("process_${layer}tree") ) {
322 0 0         if (!$zone->has_tree($layer)){
323 0 0         if ($self->if_missing_tree eq 'create'){
324 0           $zone->create_tree($layer);
325             } else {
326 0           return 0;
327             }
328             }
329            
330             #$self->process_atree($tree, $bundleNo);
331 0           $m->execute( $self, $zone->get_tree($layer), $bundleNo );
332 0           return 1;
333             }
334              
335 0 0         if ( my $m = $meta->find_method_by_name("process_${layer}node") ) {
336 0 0         if (!$zone->has_tree($layer)){
337 0 0         if ($self->if_missing_tree eq 'create'){
338 0           $zone->create_tree($layer);
339             } else {
340 0           return 0;
341             }
342             }
343 0           my $tree = $zone->get_tree($layer);
344            
345             # process_ptree should be executed also on the root node (usually the S phrase)
346 0 0         my @opts = $layer eq 'p' ? ( { add_self => 1 } ) : ();
347 0           foreach my $node ( $tree->get_descendants(@opts) ) {
348             # Skip nodes deleted by previous process_Xnode() call.
349 0 0         next if ref $node eq 'Treex::Core::Node::Deleted';
350              
351             #$self->process_anode($node, $bundleNo);
352 0           $m->execute( $self, $node, $bundleNo );
353             }
354 0           return 1;
355             }
356              
357 0           return 0;
358             }
359              
360             sub process_zone {
361 0     0 1   my ( $self, $zone, $bundleNo ) = @_;
362 0           my $overriden = 0;
363              
364 0           for my $layer (qw(a t n p)) {
365 0 0         if ($self->_try_process_layer( $zone, $layer, $bundleNo )){
366 0           $overriden++;
367             }
368             }
369            
370 0 0 0       if (!$overriden && $self->if_missing_tree =~ /fatal|warn/){
371             my $message = "At least one of the methods /process_(document|bundle|zone|[atnp](tree|node))/ "
372             . "must be overriden and the corresponding [atnp] trees must be present in bundles.\n"
373             . "The zone '" . $zone->get_label() . "' contains trees ( "
374 0           . ( join ',', map { $_->get_layer() } $zone->get_all_trees() ) . ").";
  0            
375 0 0         log_fatal($message) if $self->if_missing_tree eq 'fatal';
376 0           log_warn($message);
377             }
378              
379 0           return;
380             }
381              
382             sub process_start {
383             my ($self) = @_;
384              
385             $self->require_files_from_share( $self->get_required_share_files() );
386              
387             return;
388             }
389              
390             after 'process_start' => sub {
391             my ($self) = @_;
392             $self->_set_is_started(1);
393             };
394              
395             sub process_end {
396             my ($self) = @_;
397              
398             # default implementation is empty, but can be overriden
399             return;
400             }
401              
402             after 'process_end' => sub {
403             my ($self) = @_;
404             foreach my $other_block (@{$self->_loaded_other_blocks_array}) {
405             if ( $other_block->is_started ) {
406             $other_block->process_end();
407             }
408             }
409             $self->_set_is_started(0);
410             };
411              
412             sub get_block_name {
413 0     0 1   my $self = shift;
414 0           return ref($self);
415             }
416              
417             sub load_other_block {
418 0     0 0   my ($self, $other_block_name, $params_hash_ref) = @_;
419              
420 0           my $other_block_full_name = "Treex::Block::$other_block_name";
421              
422             # CONSTRUCT PARAMETERS HASH
423             # global params (TODO: do that?)
424 0           my %params = %{$self->scenario->_global_params};
  0            
425             # overridden by selected (TODO: all?) block params
426 0           $params{language} = $self->language;
427 0           $params{selector} = $self->selector;
428 0           $params{scenario} = $self->scenario;
429             # overridden by locally set params
430 0           @params{ keys %$params_hash_ref } = values %$params_hash_ref;
431              
432             # CREATE IT and start it
433 0 0         eval "use $other_block_full_name; 1;" or
434             log_fatal "Treex::Core::Block->get_other_block: " .
435             "Can't use block $other_block_name!\n$@\n";
436 0           my $other_block;
437 0 0         eval {
438 0           $other_block = $other_block_full_name->new( \%params );
439 0           1;
440             } or log_fatal "Treex::Core::Block->get_other_block: " .
441             "Can't initialize block $other_block_name!\n$@\n";
442 0           $other_block->process_start();
443              
444             # this may replace older block with same name
445 0           $self->_loaded_other_blocks->{$other_block_name} = $other_block;
446             # this not
447 0           push @{$self->_loaded_other_blocks_array}, $other_block;
  0            
448              
449 0           return $other_block;
450             }
451              
452             sub get_or_load_other_block {
453 0     0 0   my ($self, $other_block_name, $params_hash_ref) = @_;
454              
455             my $other_block =
456             exists ($self->_loaded_other_blocks->{$other_block_name})
457             ?
458 0 0         $self->_loaded_other_blocks->{$other_block_name}
459             :
460             $self->load_other_block($other_block_name, $params_hash_ref)
461             ;
462              
463 0           return $other_block;
464             }
465              
466             1;
467              
468             __END__
469              
470             =for Pod::Coverage BUILD build_language
471              
472             =encoding utf-8
473              
474             =head1 NAME
475              
476             Treex::Core::Block - the basic data-processing unit in the Treex framework
477              
478             =head1 VERSION
479              
480             version 2.20150928
481              
482             =head1 SYNOPSIS
483              
484             package Treex::Block::My::Block;
485             use Moose;
486             use Treex::Core::Common;
487             extends 'Treex::Core::Block';
488              
489             sub process_bundle {
490             my ( $self, $bundle) = @_;
491              
492             # bundle processing
493              
494             }
495              
496             =head1 DESCRIPTION
497              
498             C<Treex::Core::Block> is a base class serving as a common ancestor of
499             all Treex blocks.
500             C<Treex::Core::Block> can't be used directly in any scenario.
501             Use it's descendants which implement one of the methods
502             C<process_document()>, C<process_bundle()>, C<process_zone()>,
503             C<process_[atnp]tree()> or C<process_[atnp]node()>.
504              
505              
506             =head1 CONSTRUCTOR
507              
508             =over 4
509              
510             =item my $block = Treex::Block::My::Block->new();
511              
512             Instance of a block derived from C<Treex::Core::Block> can be created
513             by the constructor (optionally, a reference to a hash of block parameters
514             can be specified as the constructor's argument, see L</BLOCK PARAMETRIZATION>).
515             However, it is not likely to appear in your code since block initialization
516             is usually invoked automatically when initializing a scenario.
517              
518             =back
519              
520             =head1 METHODS FOR BLOCK EXECUTION
521              
522             You must override one of the following methods:
523              
524             =over 4
525              
526             =item $block->process_document($document);
527              
528             Applies the block instance on the given instance of
529             L<Treex::Core::Document>. The default implementation
530             iterates over all bundles in a document and calls C<process_bundle()>. So in
531             most cases you don't need to override this method.
532              
533             =item $block->process_bundle($bundle);
534              
535             Applies the block instance on the given bundle
536             (L<Treex::Core::Bundle>).
537              
538             =item $block->process_zone($zone);
539              
540             Applies the block instance on the given bundle zone
541             (L<Treex::Core::BundleZone>). Unlike
542             C<process_document> and C<process_bundle>, C<process_zone> requires block
543             attribute C<language> (and possibly also C<selector>) to be specified.
544              
545             =item $block->process_I<X>tree($tree);
546              
547             Here I<X> stands for a,t,n or p.
548             This method is executed on the root node of a tree on a given layer (a,t,n,p).
549              
550             =item $block->process_I<X>node($node);
551              
552             Here I<X> stands for a,t,n or p.
553             This method is executed on the every node of a tree on a given layer (a,t,n,p).
554             Note that for layers a, t, and n, this method is not executed on the root node
555             (because the root node is just a "technical" root without the attributes of regular nodes).
556             However, C<process_pnode> is executed also on the root node
557             (because its a regular non-terminal node with a phrase attribute, usually C<S>).
558              
559             =back
560              
561             =head2 $block->process_start();
562              
563             This method is called before all documents are processed.
564             This method is responsible for loading required models.
565              
566             =head2 $block->process_end();
567              
568             This method is called after all documents are processed.
569             The default implementation is empty, but derived classes can override it
570             to e.g. print some final summaries, statistics etc.
571             Overriding this method is preferable to both
572             standard Perl END blocks (where you cannot access C<$self> and instance attributes),
573             and DEMOLISH (which is not called in some cases, e.g. C<treex --watch>).
574              
575              
576              
577             =head1 BLOCK PARAMETRIZATION
578              
579             =over 4
580              
581             =item my $block = BlockGroup::My_Block->new({$name1=>$value1,$name2=>$value2...});
582              
583             Block instances can be parametrized by a hash containing parameter name/value
584             pairs.
585              
586             =item my $param_value = $block->get_parameter($param_name);
587              
588             Parameter values used in block construction can
589             be revealed by C<get_parameter> method (but cannot be changed).
590              
591             =back
592              
593             =head1 MISCEL
594              
595             =over 4
596              
597             =item my $langcode_selector = $block->zone_label();
598              
599             =item my $block_name = $block->get_block_name();
600              
601             It returns the name of the block module.
602              
603             =item my @needed_files = $block->get_required_share_files();
604              
605             If a block requires some files to be present in the shared part of Treex,
606             their list (with relative paths starting in
607             L<Treex::Core::Config-E<gt>share_dir|Treex::Core::Config/share_dir>) can be
608             specified by redefining by this method. By default, an empty list is returned.
609             Presence of the files is automatically checked in the block constructor. If
610             some of the required file is missing, the constructor tries to download it
611             from L<http://ufallab.ms.mff.cuni.cz>.
612              
613             This method should be used especially for downloading statistical models,
614             but not for installed tools or libraries.
615              
616             sub get_required_share_files {
617             my $self = shift;
618             return (
619             'data/models/mytool/'.$self->language.'/features.gz',
620             'data/models/mytool/'.$self->language.'/weights.tsv',
621             );
622             }
623              
624             =item require_files_from_share()
625              
626             This method checks existence of files given as parameters, it tries to download them if they are not present
627              
628             =back
629              
630             =head1 SEE ALSO
631              
632             L<Treex::Core::Node>,
633             L<Treex::Core::Bundle>,
634             L<Treex::Core::Document>,
635             L<Treex::Core::Scenario>,
636              
637             =head1 AUTHOR
638              
639             ZdenÄ›k Žabokrtský <zabokrtsky@ufal.mff.cuni.cz>
640              
641             Martin Popel <popel@ufal.mff.cuni.cz>
642              
643             =head1 COPYRIGHT AND LICENSE
644              
645             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
646              
647             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.