File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/Config.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				package Treex::Tool::Parser::MSTperl::Config;
2				{
3				$Treex::Tool::Parser::MSTperl::Config::VERSION = '0.11949';
4				}
5
6	1	1	50759	use Moose;
	0
	0
7				use autodie;
8				use Carp;
9				use File::Spec;
10
11				use Treex::Tool::Parser::MSTperl::FeaturesControl;
12				use Treex::Tool::Parser::MSTperl::ModelAdditional;
13
# Amount of debug output: 0 means none, 1 (the default) prints progress
# messages, and 2, 3, 4 and 5 print more and more debug info.
has 'DEBUG' => (
    isa     => 'Int',
    is      => 'rw',
    default => '1',
);

# ----- Viterbi settings -----

has 'SEQUENCE_BOUNDARY_LABEL' => (
    isa     => 'Str',
    is      => 'rw',
    default => '###',
);

has 'VITERBI_STATES_NUM_THRESHOLD' => (
    isa     => 'Int',
    is      => 'rw',
    default => 5,
);

# Stopping criterion of the EM algorithm: the algorithm stops as soon as
# the sum of changes of the smoothing parameters drops below this epsilon.
has 'EM_EPSILON' => (
    isa     => 'Num',
    is      => 'rw',
    default => 0.00001,
);

# Steepness of the sigmoid.
# (An earlier default was 0.0015, "probably good for data as they used to
# be :-)".)
has 'SIGM_LAMBDA' => (
    isa     => 'Num',
    is      => 'rw',
    default => 1,
);

# Disabled attribute (was added to emission probs to make them non-negative):
# has 'EMISSIONS_SHIFT' => (
#     is => 'rw',
#     isa => 'Int',
#     default => 500,
# );

# Position in the training data where the heldout data for the EM algorithm
# start, given as a number between 0 and 1 (e.g. 0.75 means that the first
# 75% of sentences are training data and the last 25% are heldout data).
has 'EM_heldout_data_at' => (
    isa     => 'Num',
    is      => 'rw',
    default => 0.9,
);

# Path of the configuration file; must be passed to the constructor.
has 'config_file' => (
    isa      => 'Str',
    is       => 'ro',
    required => '1',
);

has 'unlabelledFeaturesControl' => (
    is  => 'rw',
    isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
);

has 'labelledFeaturesControl' => (
    is  => 'rw',
    isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
);

# Disabled attribute:
# has 'imlabelledFeaturesControl' => (
#     isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
#     is => 'rw',
# );

# ----- CONFIGURATION -----

# Disabled attribute (only assigning is_member, as opposed to afun
# labelling):
# has 'is_member_labelling' => (
#     is => 'ro',
#     isa => 'Bool',
#     default => '0',
# );

# Training mode (true) or parsing mode (false); parsing mode is the default.
has 'training' => (
    isa     => 'Bool',
    is      => 'ro',
    default => '0',
);

# Disabled attribute:
# has 'ord_field_index' => (
#     is => 'rw',
#     isa => 'Int',
# );

# Only temporary, until it is found out which algorithm is the best one.
has 'labeller_algorithm' => (
    isa     => 'Int',
    is      => 'rw',
    default => '0',
);

# Setting parent_ord fires _parent_ord_set.
has 'parent_ord' => (
    isa     => 'Str',
    is      => 'rw',
    trigger => \&_parent_ord_set,
);
128
# Trigger of the 'parent_ord' attribute: converts the newly set field name
# to an index via field_name2index() and stores the result in
# parent_ord_field_index.
sub _parent_ord_set {
    my ( $self, $field_name ) = @_;

    $self->parent_ord_field_index( $self->field_name2index($field_name) );

    return;
}
139
# Filled in by the trigger of 'parent_ord'.
has 'parent_ord_field_index' => (
    isa => 'Int',
    is  => 'rw',
);

# Setting label fires _label_set.
has 'label' => (
    isa     => 'Str',
    is      => 'rw',
    trigger => \&_label_set,
);
150
# Trigger of the 'label' attribute: converts the newly set field name to an
# index via field_name2index() and stores the result in label_field_index.
sub _label_set {
    my ( $self, $field_name ) = @_;

    $self->label_field_index( $self->field_name2index($field_name) );

    return;
}
161
# Filled in by the trigger of 'label'; no default is declared
# (a former "default => 'undef'" is disabled).
has 'label_field_index' => (
    isa => 'Maybe[Int]',
    is  => 'rw',
);

# Disabled attribute with its trigger:
# has 'ismember' => (
#     is => 'rw',
#     isa => 'Str',
#     trigger => \&_ismember_set,
# );
#
# # sets ismember_field_index
# sub _ismember_set {
#     my ( $self, $ismember ) = @_;
#
#     # set index of ismember field
#     my $ismember_index = $self->field_name2index($ismember);
#     $self->ismember_field_index($ismember_index);
#
#     return;
# }
#
# has 'ismember_field_index' => (
#     is => 'rw',
#     isa => 'Maybe[Int]',
#
#     # default => 'undef',
# );

# Setting root_field_values fires _root_field_values_set.
has 'root_field_values' => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    trigger => \&_root_field_values_set,
    default => sub { [] },
);
199
# Trigger of the 'root_field_values' attribute: croaks unless the number of
# root field values equals field_names_count.
sub _root_field_values_set {
    my ($self) = @_;

    my $values_count = @{ $self->root_field_values };

    # nothing to complain about when the counts agree
    return if $values_count == $self->field_names_count;

    croak "MSTperl config file error: "
        . "Incorrect number of root field values ($values_count), "
        . "must be same as number of field names ("
        . $self->field_names_count . ")!";
}
215
has 'number_of_iterations' => (
    is      => 'rw',
    isa     => 'Int',
    default => 3,
);

has 'labeller_number_of_iterations' => (
    is      => 'rw',
    isa     => 'Int',
    default => 3,
);

# Disabled attribute:
# has 'imlabeller_number_of_iterations' => (
#     isa => 'Int',
#     is => 'rw',
#     default => 3,
# );

# The edge features caches are turned off by default to fit into RAM;
# turn them on if training with a lot of RAM or on small training data.
# They are turned off when parsing (they do not make any sense for parsing).
has 'use_edge_features_cache' => (
    isa     => 'Bool',
    is      => 'rw',
    default => '0',
);

has 'labeller_use_edge_features_cache' => (
    isa     => 'Bool',
    is      => 'rw',
    default => '0',
);

# Disabled attribute:
# has 'imlabeller_use_edge_features_cache' => (
#     is => 'rw',
#     isa => 'Bool',
#     default => '0',
# );

# ----- Distance buckets -----

# Setting distance_buckets fires _distance_buckets_set.
has 'distance_buckets' => (
    isa     => 'ArrayRef[Int]',
    is      => 'rw',
    trigger => \&_distance_buckets_set,
    default => sub { [] },
);
264
# Trigger of the 'distance_buckets' attribute: sets distance2bucket,
# maxBucket and minBucket.
#
# Parameters:
#   $self             - the config object
#   $distance_buckets - array ref of the bucket definitions
#
# Every bucket must be a positive integer, otherwise the sub croaks.
# A bucket defined more than once only causes a warning, and a missing
# bucket '1' is added with a warning. Each distance between two defined
# buckets is mapped to the highest lower bucket; negative distances are
# mapped symmetrically to the negated bucket.
sub _distance_buckets_set {
    my ( $self, $distance_buckets ) = @_;

    my %distance2bucket;

    # find maximal bucket & partly fill %distance2bucket
    my $maxBucket = 0;
    foreach my $bucket ( @{$distance_buckets} ) {

        # Validate first: the negated buckets are stored in the hash as well,
        # so testing for duplicates first would report a negative bucket
        # as a duplicate instead of rejecting it.
        if ( $bucket <= 0 ) {
            croak "MSTperl config file error: " .
                "Error on bucket '$bucket' - " .
                "buckets must be positive integers.";
        }

        if ( exists $distance2bucket{$bucket} ) {
            warn "Bucket '$bucket' is defined more than once; " .
                "disregarding its later definitions.\n";
            next;
        }

        $distance2bucket{$bucket} = $bucket;
        $distance2bucket{ -$bucket } = -$bucket;
        if ( $bucket > $maxBucket ) {
            $maxBucket = $bucket;
        }
    }

    if ( !exists $distance2bucket{1} ) {
        warn "Bucket '1' is not defined, which does not make any sense; " .
            "adding definition of bucket '1'.\n";
        $distance2bucket{1} = 1;
        $distance2bucket{-1} = -1;

        # with no bucket defined at all, the added bucket is the maximal one
        if ( $maxBucket < 1 ) {
            $maxBucket = 1;
        }
    }

    # set maxBucket and minBucket
    $self->maxBucket($maxBucket);
    $self->minBucket( -$maxBucket );

    # fill %distance2bucket between bucket 1 and maxBucket
    # (both of them are already defined at this point)
    my $lastBucket = 1;
    foreach my $distance ( 2 .. $maxBucket - 1 ) {
        if ( exists $distance2bucket{$distance} ) {

            # the distance defines a bucket
            $lastBucket = $distance2bucket{$distance};
        } else {

            # the distance falls into the highest lower bucket
            $distance2bucket{$distance} = $lastBucket;
            $distance2bucket{ -$distance } = -$lastBucket;
        }
    }
    $self->distance2bucket( \%distance2bucket );

    return;
}
319
# Mapping of a (signed) distance to its bucket, filled in by the trigger of
# 'distance_buckets'. If the mapping is not found in the hash, maxBucket or
# minBucket is used.
has 'distance2bucket' => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { {} },
);

# Any higher distance falls into this bucket.
has 'maxBucket' => (
    is      => 'rw',
    isa     => 'Int',
    default => '9',
);

# Any lower distance falls into this bucket;
# the distance is signed (ORD minus ord).
has 'minBucket' => (
    is      => 'rw',
    isa     => 'Int',
    default => '-9',
);

# ----- FIELDS -----

# Field names (for conversion of field index to field name).
# Setting field_names fires _field_names_set.
has 'field_names' => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    trigger => \&_field_names_set,
    default => sub { [] },
);
353
# Trigger of the 'field_names' attribute: checks the field names and sets
# field_names_count, field_names_hash and field_indexes.
#
# Parameters:
#   $self        - the config object
#   $field_names - array ref of the field names
#
# Croaks on a duplicate field name, on a field name that is not lowercase
# and on a field name that does not contain any character from [a-z].
sub _field_names_set {
    my ( $self, $field_names ) = @_;

    my %field_names_hash;
    my %field_indexes;
    foreach my $index ( 0 .. $#{$field_names} ) {
        my $field_name = $field_names->[$index];
        if ( $field_names_hash{$field_name} ) {
            croak "MSTperl config file error: " .
                "Duplicate field name '$field_name'!";
        } elsif ( $field_name ne lc($field_name) ) {
            croak "MSTperl config file error: " .
                "Field name '$field_name' is not lowercase!";
        } elsif ( $field_name !~ /[a-z]/ ) {

            # Written as "!$field_name =~ /a-z/" this test could never
            # succeed: "!" binds tighter than "=~", and /a-z/ matches the
            # literal text "a-z" rather than a character class.
            croak "MSTperl config file error: " .
                "Field name '$field_name' does not contain " .
                "any character from [a-z]!";
        } else {
            $field_names_hash{$field_name} = 1;
            $field_indexes{$field_name} = $index;
        }
    }

    $self->field_names_count( scalar( @{$field_names} ) );
    $self->field_names_hash( \%field_names_hash );
    $self->field_indexes( \%field_indexes );

    return;
}
384
# Number of field names, set by the trigger of 'field_names'.
has 'field_names_count' => (
    isa     => 'Int',
    is      => 'rw',
    default => '0',
);

# 1 for each field name to easily check if a field name exists.
has 'field_names_hash' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { {} },
);

# Index of each field name in field_names
# (for conversion of field name to field index).
has 'field_indexes' => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { {} },
);

has 'lossFunction' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

# ----- additional PMI model -----

has 'use_pmi' => (
    isa     => 'Bool',
    is      => 'rw',
    default => 0,
);

has 'pmi_model_file' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

has 'pmi_model_format' => (
    isa     => 'Str',
    is      => 'rw',
    default => 'tsv',
);

has 'pmi_buckets' => (
    isa     => 'Maybe[ArrayRef[Int]]',
    is      => 'rw',
    default => undef,
);

# ----- additional cprob model -----

has 'use_cprob' => (
    isa     => 'Bool',
    is      => 'rw',
    default => 0,
);

has 'cprob_model_file' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

has 'cprob_model_format' => (
    isa     => 'Str',
    is      => 'rw',
    default => 'tsv',
);

has 'cprob_buckets' => (
    isa     => 'Maybe[ArrayRef[Int]]',
    is      => 'rw',
    default => undef,
);
455
456				# METHODS
457
# Called when the object is constructed: reads the YAML config file given in
# config_file and applies its settings to the attributes (firing their
# triggers), then creates the features controls and, if requested, loads the
# additional PMI and cprob models.
#
# Croaks if the config file does not exist or cannot be read, if a required
# field (field_names, root_field_values, parent_ord, distance_buckets) is
# missing, or if neither 'features' nor 'labeller_features' are set.
sub BUILD {
    my ($self) = @_;

    if ( $self->DEBUG >= 1 ) {
        print "Processing config file " . $self->config_file . "...\n";
    }

    # check if file exists; if it does not, list the files that are present
    # in its directory to help the user to find the mistake
    unless ( -e $self->config_file ) {
        my ( $volume, $directory, $cfile ) =
            File::Spec->splitpath( $self->config_file );
        my $dir = File::Spec->catpath( $volume, $directory, '' );

        # a bare file name has an empty directory part, which cannot be
        # opened; the file is then looked for in the current directory
        if ( $dir eq '' ) {
            $dir = File::Spec->curdir();
        }

        opendir( my $dirhandle, $dir ) or croak $!;
        my @files = readdir($dirhandle);
        closedir($dirhandle);
        croak "The config file $cfile does not exists!\n" .
            "The directory $dir contains the following files: " .
            join ', ', @files;
    }
    use YAML::Tiny;
    my $config = YAML::Tiny->read( $self->config_file );

    if ( !$config ) {
        croak "MSTperl config file error: " . YAML::Tiny->errstr;
    }

    # the settings are taken from the first YAML document of the file
    my $settings = $config->[0] || {};

    # fields to set, in the order in which they are to be set
    # (the triggers of the later ones depend on the earlier ones)
    my @fields = (
        'field_names',
        'root_field_values',
        'parent_ord',
        'distance_buckets',
        'label',
        'lossFunction',
        'use_pmi',
        'pmi_model_file',
        'pmi_model_format',
        'pmi_buckets',
        'use_cprob',
        'cprob_model_file',
        'cprob_model_format',
        'cprob_buckets',
        'use_edge_features_cache',
        'labeller_use_edge_features_cache',
        'number_of_iterations',
        'labeller_number_of_iterations',
        'labeller_algorithm',
        'DEBUG',
        'SEQUENCE_BOUNDARY_LABEL',
        'VITERBI_STATES_NUM_THRESHOLD',
        'EM_EPSILON',
        'EM_heldout_data_at',
    );

    # name => required?
    my %required_fields = (
        'field_names'       => 1,
        'root_field_values' => 1,
        'parent_ord'        => 1,
        'distance_buckets'  => 1,
    );
    foreach my $field (@fields) {
        my $value = $settings->{$field};

        # A value of 0 (e.g. "DEBUG: 0") is a valid setting, so the value
        # must not be tested for truth; only a missing or empty value
        # means that the field is not set.
        if ( defined $value && length $value ) {
            $self->$field($value);
        } elsif ( $required_fields{$field} ) {

            # if required, then croak
            croak "MSTperl config file error:"
                . "Field $field must be set!";
        }

        # else OK (default value will be used)
    }

    # ignore some settings if in parsing-only mode
    if ( !$self->training ) {
        $self->use_edge_features_cache(0);
        $self->labeller_use_edge_features_cache(0);
    }

    # unlabelled features
    if ( $settings->{features} && @{ $settings->{features} } ) {
        $self->unlabelledFeaturesControl(
            Treex::Tool::Parser::MSTperl::FeaturesControl->new(
                'config'                    => $self,
                'feature_codes_from_config' => $settings->{features},
                'use_edge_features_cache'
                    => $self->use_edge_features_cache,
            )
        );

        if ( $self->use_pmi ) {
            my $pmi_model = Treex::Tool::Parser::MSTperl::ModelAdditional->new(
                config       => $self,
                model_file   => $self->pmi_model_file,
                model_format => $self->pmi_model_format,
                buckets      => $self->pmi_buckets,
            );

            # the model is attached only if load() reports success
            my $result = $pmi_model->load();
            if ($result) {
                $self->unlabelledFeaturesControl->pmi_model($pmi_model);
            }
        }

        if ( $self->use_cprob ) {
            my $cprob_model = Treex::Tool::Parser::MSTperl::ModelAdditional->new(
                config       => $self,
                model_file   => $self->cprob_model_file,
                model_format => $self->cprob_model_format,
                buckets      => $self->cprob_buckets,
            );

            # the model is attached only if load() reports success
            my $result = $cprob_model->load();
            if ($result) {
                $self->unlabelledFeaturesControl->cprob_model($cprob_model);
            }
        }
    }

    # labeller features
    if ( $settings->{labeller_features}
        && @{ $settings->{labeller_features} }
        )
    {
        $self->labelledFeaturesControl(
            Treex::Tool::Parser::MSTperl::FeaturesControl->new(
                'config' => $self,
                'feature_codes_from_config'
                    => $settings->{labeller_features},
                'use_edge_features_cache'
                    => $self->labeller_use_edge_features_cache,
            )
        );
    }

    # imlabeller features (disabled)
    # if ($config->[0]->{imlabeller_features}
    #     && @{ $config->[0]->{imlabeller_features} }
    #     )
    # {
    #     $self->imlabelledFeaturesControl(
    #         Treex::Tool::Parser::MSTperl::FeaturesControl->new(
    #             'config' => $self,
    #             'feature_codes_from_config'
    #                 => $config->[0]->{imlabeller_features},
    #             'use_edge_features_cache'
    #                 => $self->imlabeller_use_edge_features_cache,
    #         )
    #     );
    # }

    if ( !$self->unlabelledFeaturesControl
        && !$self->labelledFeaturesControl

        # && !$self->imlabelledFeaturesControl
        )
    {
        croak "MSTperl config file error: No features set!";
    }

    if ( $self->DEBUG >= 1 ) {
        print "Done." . "\n";
    }

    return;
}
633
# Converts a field name to the index of the field.
#
# Parameters:
#   $self       - the config object
#   $field_name - a field name, an integer argument, or an array ref of these
#
# Returns the index stored in field_indexes for a known field name, the
# argument itself if it is an (optionally negative) integer, and a new array
# ref of the converted members if the argument is an array ref.
# Croaks on anything else.
sub field_name2index {
    my ( $self, $field_name ) = @_;

    # multiarg feature -> convert each of the members
    if ( ref $field_name eq 'ARRAY' ) {
        return [ map { $self->field_name2index($_) } @{$field_name} ];
    }

    # a known field name -> return the index of the field
    return $self->field_indexes->{$field_name}
        if $self->field_names_hash->{$field_name};

    # not an actual field name but an integer argument -> keep it
    return $field_name
        if $field_name =~ /^-?[0-9]+$/;

    croak "Unknown field '$field_name', quiting.";
}
659
660				1;
661
662				__END__
663
664
665
666
667
668
669
670
671
672				=pod
673
674				=for Pod::Coverage BUILD
675
676				=encoding utf-8
677
678				=head1 NAME
679
680				Treex::Tool::Parser::MSTperl::Config
681
682				=head1 VERSION
683
684				version 0.11949
685
686				=head1 DESCRIPTION
687
688				Handles the configuration of the parser.
689
690				=head1 FIELDS
691
692				=head2 Data fields
693
694				Fields describing fields used with nodes, such as form, pos, lemma...
695
696				=over 4
697
698				=item field_names (ArrayRef[Str])
699
700				Field names (for conversion of field index to field name)
701
702				=item field_names_hash (HashRef[Str])
703
704				1 for each field name to easily check if a field name exists
705
706				=item field_indexes (HashRef[Str])
707
708				Index of each field name in field_names (for conversion of field name to field
709				index)
710
711				=back
712
713				=head2 Settings
714
715				Most of the settings are set by a config file in YAML format.
716				However, you do not have to understand YAML to be able to change the
717				settings provided that you keep things like formatting of the file unchanged
718				(some whitespaces are significant etc.). Actually only a subset of
719				all that YAML provides is used.
720
721				Contents of a line from the # character till the end of the line are comments
722				and are ignored (if you need to actually use the # sign, you can quote it -
723				eg. C<'#empty#'> is interpreted as C<#empty#>). Lines that contain only
724				whitespace chars or are empty are ignored as well.
725
726				Some of the settings are ignored when in parsing mode (i.e. not training).
727				These are use_edge_features_cache (turned off) and number_of_iterations
728				(irrelevant).
729
730				These are settings which are acquired from the configuration file:
731
732				=head3 Required Settings
733
734				=over 4
735
736				=item field_names
737
738				Lowercase names of fields in the input file
739				(the data fields are to be separated by tabs in the input file).
740				Use [a-z0-9_] only, using always at least one letter.
741				Use unique names, i.e. devise some names even for unused fields.
742
743				=item root_field_values
744
745				Field values to set for the (technical) root node.
746
747				=item parent_ord
748
749				Name of field containing ord of the parent of the node
750				(also called "head" or "governing node").
751
752				=item distance_buckets
753
754				Buckets to use for C<distance()> function (positive integers in any order).
755				Each distance gets bucketed in the highest lower bucket (absolute-value-wise).
756
757				Default:
758
759				distance_buckets:
760				- 1
761				- 2
762				- 3
763				- 4
764				- 5
765				- 11
766
767				=back
768
769				=head3 Features Settings
770
771				Features to be computed on data.
772
773				Features for the unlabelled parser are set under C<features>,
774				the labeller features under C<labeller_features>.
775
776				Use the (lowercase) input file field names (e.g. C<pos>)
777				to use the field of the (child) node,
778				uppercase them (e.g. C<POS>) to use the field of the parent,
779				joined together by the C<\|> sign to form the features (e.g. C<POS\|LEMMA>).
780
781				Prefix the field names by C<1.> or C<2.>
782				to use the field on the first or second node in the sentence - based on
783				their order in the sentence, regardless of which is parent and which is child
784				(e.g. C<1.pos> for pos of first of the nodes).
785
786				There are also several predefined functions that you can make use of.
787				Usually you can write the function name in lowercase to invoke them on the child
788				field, uppercase for parent, or prefixed by C<1.> or C<2.> for first or second
789				node (e.g. C<CHILDNO()> to get the number of parent node's children). The
790				parameter of a function must be a (child) field name, or an integer (as the
791				C<index> in C<equalspcat>).
792
793				=over 4
794
795				=item distance()
796
797				bucketed ord-wise distance of child and parent: C<ORD> minus C<ord>
798
799				=item attdir()
800
801				parent - child attachment direction: C<signum(ORD minus ord)>
802
803				=item preceding(field)
804
805				value of the specified field on the ord-wise preceding node
806				(use C<PRECEDING(field)> to get field on node preceding the PARENT)
807
808				=item following(field)
809
810				value of the specified field on the ord-wise following node
811
812				=item between(field)
813
814				value of the specified field for each node which is ord-wise between the child
815				node and the parent node
816
817				=item equals(field1,field2)
818
819				Returns C<1> if the value of C<field1> is the same as
820				the value of C<field2>. For fields with multiple values,
821				it has the meaning of an "exists" operator: it returns
822				C<1> if there is at least one pair of values of each field that are
823				the same.
824
825				Returns C<0> if the values don't match.
826
827				Returns C<-1> if (at least) one of the values is
828				C<undef> (may be also represented by an empty string)
829
830				=item equalspc(field1,field2)
831
832				like C<equals> but C<field1> is taken from parent node
833				and C<field2> from child node
834
835				=item equalspcat(field,position)
836
837				like C<equalspc> but looks at the given position (1 character)
838				in the given field
839
840				=item substr(field,start,length)
841
842				substring of field value beginning at given
843				start position (0-based) of given length; standard substr behaviour,
844				i.e. both start and length can be negative and length can be omitted,
845				feature function to be then written as C<substr(field,start)>
846
847				=item arrayat(array_field,index_field)
848
849				array_field's value is an array of values
850				separated by single spaces (' '), index_field's value is a zero-based
851				index of a value in the array to be returned (used e.g. for tree distance)
852
853				=item isfirst()
854
855				returns 1 if node is the first in the sentence, 0 otherwise
856
857				=item islast()
858
859				returns C<1> if node is the last in the sentence, C<0> otherwise
860
861				=item isfirstchild()
862
863				returns C<1> if node is the first child of its parent, C<0> otherwise
864
865				=item islastchild()
866
867				returns C<1> if node is the last child of its parent, C<0> otherwise
868
869				=item childno()
870
871				returns number of node's children
872
873				=item islastleftchild()
874
875				is the rightmost of all left children of its parent
876
877				=item isfirstrightchild()
878
879				is the leftmost of all right children of its parent
880
881				=item LABEL()
882
883				label of parent (to be used only in labeller features);
884				label is somewhat special, it cannot be used as C<label>, C<LABEL> or C<label()>
885
886				Features containing the C<LABEL()> function are dynamic, i.e. they cannot be
887				precomputed and are always computed just at the time they are needed.
888
889				=item prevlabel()
890
891				label of previous sibling (to be used only in labeller features);
892				prevlabel is somewhat special, it cannot be used as
893				C<prevlabel>, C<PREVLABEL> or C<PREVLABEL()>
894
895				Features containing the C<prevlabel()> function are dynamic, i.e. they cannot be
896				precomputed and are always computed just at the time they are needed.
897
898				=back
899
900				See also L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
901
902				=head3 Internal technical settings
903
904				These settings are probably better left as they are, but it might be
905				advantageous to have the ability of changing them sometimes, especially when
906				experimenting.
907
908				You can set the values in various ways. The order of priorities is:
909
910				=over 4
911
912				=item 1 set in runtime
913
914				i.e. set after having created a new Config object:
915
916				my $config = Treex::Tool::Parser::MSTperl::Config->new(
917				config_file => 'my_config.config');
918				$config->DEBUG(4);
919
920				The value is only valid from the time of setting.
921
922				=item 2 set in config file
923
924				in my_config.config:
925
926				DEBUG: 4
927
928				in the perl script:
929
930				my $config = Treex::Tool::Parser::MSTperl::Config->new(
931				config_file => 'my_config.config');
932
933				=item 3 set in the constructor
934
935				i.e. set while creating a new Config object:
936
937				in my_config.config:
938
939				# DEBUG: 0
940
941				in the perl script:
942
943				my $config = Treex::Tool::Parser::MSTperl::Config->new(
944				config_file => 'my_config.config',
945				DEBUG => 4 );
946
947				For the setting to take effect, you must not set another value in the config
948				file (you can comment out setting it with '#').
949
950				=item 4 the default value
951
952				Used if the value is not set in runtime, in constructor or in the config file.
953
954				=back
955
956				Please note that setting some of the values at runtime might not be a good idea.
957
958				The options are listed here together with their defaults.
959
960				=over 4
961
962				=item DEBUG: 1
963
964				An integer specifying how much debug information you will be getting while
965				running the program, ranging from 0 (no debug info)
966				through 1 (progress messages)
967				through 2, 3 and 4 to 5 (more and more debug info).
968
969				If you set this value to something higher than 1, you should always redirect
970				the output to a file as printing it to the console is very very slow
971				(and there is so much info that you wouldn't be able to
972				read anything anyway).
973
974				The possibility
975				to change the value
976				while running the program
977				might be beneficial
978				e.g. if you want to debug only a particular
979				part of the program.
980
981				=item number_of_iterations: 3, labeller_number_of_iterations: 3
982
983				How many times the trainer (Tagger::MSTperl::Trainer) should go through
984				all the training data.
985
986				=item use_edge_features_cache: 0, labeller_use_edge_features_cache: 0
987
988				Currently deprecated, unmaintained and probably to be removed.
989
990				Turns on and off using the C<edge_features_cache>.
991
992				Using cache should be turned on (C<1>) if training with a lot of RAM or on small
993				training data, as it uses a lot of memory but speeds up the training greatly
994				(approx. by 30% to 50%). If you need to save RAM, turn it off (C<0>).
995
996				=item labeller_algorithm: 0
997
998				Algorithm used for Viterbi labelling as well as for training. Several
999				possibilities were tried out,
1000				especially regarding the emission probabilities used in the Viterbi algorithm;
1001				this is for development purposes only, preferably do not use.
1002
1003				=over
1004
1005				=item (0) MIRA-trained weights
1006
1007				recomputed by +abs(min) and converted to probs,
1008				transitions by MLE on labels
1009
1010				=item (1) dtto, NOT converted to probs
1011
1012				should be same as 0
1013
1014				=item (2) dtto, sum in Viterbi instead of product
1015
1016				new_prob = old_prob + emiss*trans
1017
1018				=item (3) dtto, no recomputation
1019
1020				just strip <= 0
1021
1022				=item (4) basic MLE
1023
1024				no MIRA, no smoothing, uniform feature weights
1025				blind (unigram) transition backoff,
1026				blind emission backoff (but should not be necessary)
1027
1028				=item (5) full Viterbi
1029
1030				dtto, transition probs lambda smoothing by EM
1031
1032				=item (8) MIRA for all
1033
1034				completely new, based on reading, no MLE, MIRA for all,
1035				same features for label unigrams and label bigrams
1036
1037				=item (9) dtto, initialize emissions and transitions by MLE
1038
1039				=item (10) 0 + fixed best state selection
1040
1041				=item (11) 10 + tries to use all possible labels
1042
1043				=item (12) 10 + EM for smoothing of transitions
1044
1045				=item (13) 11 + EM for smoothing of transitions
1046
1047				=item (14) 10 + update uses transition probs as well
1048
1049				=item (15) 12 + update uses transition probs as well
1050
1051				=item (16) 8 + transitions by MLE & EM on label pairs
1052
1053				multiplied with emission score in Viterbi and added to last state score
1054
1055				=item (17) dtto, different transition computation for negative scores
1056
1057				=item (18) 16 + no Viterbi summing
1058
1059				=item (19) 16, better formula for combining emissions and transitions
1060
1061				=item (20) MIRA for all
1062
1063				=item (21) MIRA for all, with Viterbi
1064
1065				=item (22) MIRA for all, sentence = one sequence (disregarding tree structure)
1066
1067				=back
1068
1069				=item SEQUENCE_BOUNDARY_LABEL: '###'
1070
1071				This is only a technical thing; a label must be assigned to the (basically
1072				virtual) boundary of a sequence, different from any label used in the data.
1073				The default value is '###', so if you use this exact label as a valid label in
1074				your data, change the setting to something else. If nothing goes wrong, you
1075				should never see this label in the output; however, it is contained in the
1076				model and used for "transition scores" to score the "transition" between the
1077				sequence boundary and the first/last node (i.e. it determines the scores of
1078				labels used as the first or last label in the sequence where no actual
1079				transition takes place and the transition scores would otherwise get ignored).
1080
1081				=item VITERBI_STATES_NUM_THRESHOLD: 5
1082
1083				Number of states to keep when pruning. The pruning takes place after each
1084				Viterbi step (i.e. after each computation of possible labels and their scores
1085				for one edge). For more details see the C<prune> subroutine.
1086
1087				=item EM_EPSILON: 0.00001
1088
1089				Stopping criterion of EM algorithm which is used to compute smoothing
1090				parameters for linear combination smoothing of transition probabilities
1091				in some variants of the Labeller.
1092				(when the sum of change of smoothing
1093				parameters is lower than the epsilon, the algorithm stops).
1094
1095				=item EM_heldout_data_at: 0.9
1096
1097				A number between 0 and 1 specifying
1098				where in training data do heldout data for EM algorithm start
1099				(eg. 0.75 means that first 75% of sentences
1100				are training data and the last 25% are heldout data).
1101
1102				The training/heldout data division only affects computation of transition
1103				probabilities by MLE, it does not affect MIRA training or MLE for emission
1104				probabilities.
1105
1106				If EM is not used for smoothing, all data are used as training data.
1107
1108				=back
1109
1110				=head2 Technical fields
1111
1112				Provide access to things needed in more than one of the other packages.
1113
1114				=over 4
1115
1116				=item unlabelledFeaturesControl
1117
1118				Provides access to unlabelled features, especially enabling their computation.
1119				Instance of L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
1120
1121				=item labelledFeaturesControl
1122
1123				Provides access to labeller features, especially enabling their computation.
1124				Instance of L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
1125
1126				=back
1127
1128				=head1 METHODS
1129
1130				=head2 Settings
1131
1132				The best source of information about all the possible settings is the
1133				configuration file itself (usually called C<config.txt>), as it is richly
1134				commented and accompanied by real examples at the same time.
1135
1136				=over 4
1137
1138				=item my $config =
1139				Treex::Tool::Parser::MSTperl::Config->new(config_file => 'file.config')
1140
1141				Reads the configuration file (in YAML format) and applies the settings.
1142
1143				See file C<samples/sample.config>.
1144
1145				=item field_name2index ($field_name)
1146
1147				Fields are referred to by names in the config files but by indexes in the
1148				code. Therefore this conversion function is necessary; the other direction of
1149				the conversion is ensured by the C<field_names> field.
1150
1151				=back
1152
1153
1154				=head1 AUTHORS
1155
1156				Rudolf Rosa <rosa@ufal.mff.cuni.cz>
1157
1158				=head1 COPYRIGHT AND LICENSE
1159
1160				Copyright © 2011 by Institute of Formal and Applied Linguistics,
1161				Charles University in Prague
1162
1163				This module is free software;
1164				you can redistribute it and/or modify it under the same terms as Perl itself.