File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/TrainerBase.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				package Treex::Tool::Parser::MSTperl::TrainerBase;
2				{
3				$Treex::Tool::Parser::MSTperl::TrainerBase::VERSION = '0.11949';
4				}
5
6	1	1	2324	use Moose;
	0
	0
7				use Carp;
8
9				has config => (
10				isa => 'Treex::Tool::Parser::MSTperl::Config',
11				is => 'ro',
12				required => '1',
13				);
14
15				# to be filled in extending packages!
16				has model => (
17				isa => 'Treex::Tool::Parser::MSTperl::ModelBase',
18				is => 'rw',
19				);
20
21				# to be filled in extending packages!
22				has featuresControl => (
23				isa => 'Treex::Tool::Parser::MSTperl::FeaturesControl',
24				is => 'rw',
25				);
26
27				# to be filled in extending packages!
28				has number_of_iterations => (
29				isa => 'Int',
30				is => 'rw',
31				);
32
33				has number_of_inner_iterations => (
34				isa => 'Int',
35				is => 'rw',
36				);
37
38				has skip_scores_averaging => (
39				is => 'rw',
40				isa => 'Bool',
41				default => 0
42				);
43
44				# TRAINING COMMON SUBS
45
46				sub train_dev {
47				my ( $self, $training_data, $dev_data ) = @_;
48
49				$self->train( $training_data, 0 );
50				my $feature_count = $self->train( $dev_data, 1 );
51
52				return $feature_count;
53				}
54
55				sub train_2parts {
56				my ( $self, $training_data, $dev_data ) = @_;
57
58				$self->train( $training_data, 0 );
59				my $feature_count = $self->train( $dev_data, 0 );
60
61				return $feature_count;
62				}
63
64				sub train {
65
66				# (ArrayRef[Treex::Tool::Parser::MSTperl::Sentence] $training_data
67				# Bool $forbid_new_features)
68				# Training data: T = {(x_t, y_t)} t=1..T
69				my ( $self, $training_data, $forbid_new_features ) = @_;
70
71				# number of sentences in training data
72				my $sentence_count = scalar( @{$training_data} );
73
74				# how many times $self->update() will be called
75				$self->number_of_inner_iterations(
76				$self->number_of_iterations * $sentence_count
77				);
78
79				# only progress and/or debug info
80				if ( $self->config->DEBUG >= 1 ) {
81				print "Going to train on $sentence_count sentences with "
82				. $self->number_of_iterations . " iterations.\n";
83				}
84
85				# precompute features of sentences in training data
86				# in labelled parsing also gets the list of labels
87				# and computes the transition probs
88				$self->preprocess_sentences($training_data);
89
90				# do the training
91				if ( $self->config->DEBUG >= 1 ) {
92				print "Training the model...\n";
93				}
94				my $innerIteration = 0;
95
96				# for n : 1..N
97				for (
98				my $iteration = 1;
99				$iteration <= $self->number_of_iterations;
100				$iteration++
101				)
102				{
103				if ( $self->config->DEBUG >= 1 ) {
104				print " Iteration number $iteration of "
105				. $self->number_of_iterations . "...\n";
106				}
107				my $sentNo = 0;
108
109				# for t : 1..T # these are the inner iterations
110				foreach my $sentence_correct ( @{$training_data} ) {
111
112				# weight of weights/scores sum update <N*T .. 1>;
113				# $sumUpdateWeight denotes number of summands
114				# in which the new value would appear
115				# if it were computed according to the definition
116				my $sumUpdateWeight =
117				$self->number_of_inner_iterations - $innerIteration;
118
119				# update on this instance
120				$self->update( $sentence_correct, $sumUpdateWeight, $forbid_new_features );
121
122				# $innerIteration = ( $iteration - 1 ) * $sentence_count + $sentNo;
123				$innerIteration++;
124
125				# only progress and/or debug info
126				if ( $self->config->DEBUG >= 1 ) {
127				$sentNo++;
128				if ( $sentNo % 50 == 0 ) {
129				print " $sentNo/$sentence_count sentences processed " .
130				"(iteration $iteration/"
131				. $self->number_of_iterations
132				. ")\n";
133				}
134				}
135
136				} # end for inner iterations
137				} # end for $iteration
138
139				# only progress and/or debug info
140				if ( $self->config->DEBUG >= 1 ) {
141				print "Done.\n";
142				}
143				if ( $self->config->DEBUG >= 2 ) {
144				print "FINAL FEATURE WEIGTHS:\n";
145				}
146
147				if ( !$self->skip_scores_averaging ) {
148
149				# average the model (is said to help against overfitting)
150				$self->scores_averaging();
151				}
152
153				# only progress and/or debug info
154				my $feature_count = $self->model->get_feature_count();
155				if ( $self->config->DEBUG >= 1 ) {
156				print "Model trained with $feature_count features.\n";
157				}
158
159				return $feature_count;
160
161				} # end sub train
162
163				# precompute features of sentences in training data
164				sub preprocess_sentences {
165
166				# (ArrayRef[Treex::Tool::Parser::MSTperl::Sentence]
167				# $training_data)
168				my ( $self, $training_data ) = @_;
169
170				# only progress and/or debug info
171				if ( $self->config->DEBUG >= 1 ) {
172				print "Computing sentence features...\n";
173				}
174
175				my $sentence_count = scalar( @{$training_data} );
176				my $sentNo = 0;
177
178				foreach my $sentence_correct ( @{$training_data} ) {
179
180				# compute sentence features
181				# in labelled parsing also gets the list of labels
182				# and computes the transition probs
183				$sentNo++;
184				$self->preprocess_sentence(
185				$sentence_correct, $sentNo / $sentence_count
186				);
187
188				# only progress and/or debug info
189				if ( $self->config->DEBUG >= 1 ) {
190				if ( $sentNo % 50 == 0 ) {
191				print " $sentNo/$sentence_count sentences "
192				. "processed (computing features)\n";
193				}
194				}
195				if ( $self->config->DEBUG >= 3 ) {
196				print "SENTENCE FEATURES:\n";
197				foreach my $feature ( @{ $sentence_correct->features } ) {
198				print "$feature\n";
199				}
200				print "CORRECT EDGES:\n";
201				foreach my $edge ( @{ $sentence_correct->edges } ) {
202				print $edge->parent->ord . " -> " . $edge->child->ord . "\n";
203				}
204				print "CORRECT LABELS:\n";
205				foreach my $node ( @{ $sentence_correct->nodes_with_root } ) {
206				print $node->ord . "/" . $node->label . "\n";
207				}
208				}
209
210				}
211
212				$self->model->prepare_for_mira($self);
213
214				if ( $self->config->DEBUG >= 1 ) {
215				print "Done.\n";
216				}
217
218				return;
219				}
220
221				# ABSTRACT TRAINING SUB STUBS (TO BE REDEFINED IN DESCENDED PACKAGES)
222
223				# compute the features of the sentence
224				# in labelling also used to get the list of labels and of transition probs
225				sub preprocess_sentence {
226
227				# (Treex::Tool::Parser::MSTperl::Sentence $sentence, Num $progress)
228				# my ( $self, $sentence, $progress ) = @_;
229
230				croak 'TrainerBase::preprocess_sentence is an abstract method,'
231				. ' it must be called'
232				. ' either from TrainerUnlabelled or TrainerLabelling!';
233				}
234
235				sub update {
236
237				# (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct,
238				# Int $sumUpdateWeight, Bool $forbid_new_features)
239				# my ( $self, $sentence_correct, $sumUpdateWeight, $forbid_new_features ) = @_;
240
241				croak 'TrainerBase::update is an abstract method, it must be called'
242				. ' either from TrainerUnlabelled or TrainerLabelling!';
243				}
244
245				# sub mira_update {
246				#
247				# # (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct,
248				# # Treex::Tool::Parser::MSTperl::Sentence $sentence_best,
249				# # Int $sumUpdateWeight)
250				# # my ( $self, $sentence_correct, $sentence_best, $sumUpdateWeight ) = @_;
251				#
252				# croak 'TrainerBase::mira_update is an abstract method, it must be called'
253				# . ' either from TrainerUnlabelled or TrainerLabelling!';
254				# }
255
256				# recompute feature weights/scores as averages
257				sub scores_averaging {
258
259				# my ($self) = @_;
260
261				croak 'TrainerBase::scores_averaging is an abstract method, it '
262				. 'must be called either from TrainerUnlabelled or TrainerLabelling!';
263
264				}
265
266				# MODEL STORING
267
268				sub store_model {
269
270				my ( $self, $filename ) = @_;
271
272				$self->model->store($filename);
273
274				return;
275				}
276
277				sub store_model_tsv {
278
279				my ( $self, $filename ) = @_;
280
281				$self->model->store_tsv($filename);
282
283				return;
284				}
285
286				1;
287
288				__END__
289
290				=pod
291
292				=for Pod::Coverage BUILD
293
294				=encoding utf-8
295
296				=head1 NAME
297
298				Treex::Tool::Parser::MSTperl::TrainerBase
299
300				=head1 VERSION
301
302				version 0.11949
303
304				=head1 DESCRIPTION
305
306				Trains on correctly parsed sentences and so creates and tunes the model.
307				Uses single-best MIRA (McDonald et al., 2005, Proc. HLT/EMNLP).
308
309				=head1 FIELDS
310
311				=over 4
312
313				=item config
314
315				Reference to the instance of L<Treex::Tool::Parser::MSTperl::Config>.
316
317				=back
318
319				=head1 METHODS
320
321				=over 4
322
323				=item TODO
324
325				=back
326
327				=head1 AUTHORS
328
329				Rudolf Rosa <rosa@ufal.mff.cuni.cz>
330
331				=head1 COPYRIGHT AND LICENSE
332
333				Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles
334				University in Prague
335
336				This module is free software; you can redistribute it and/or modify it under
337				the same terms as Perl itself.