File Coverage

blib/lib/Statistics/MaxEntropy.pm

Criterion	Covered	Total	%
statement	472	589	80.1
branch	113	184	61.4
condition	25	42	59.5
subroutine	40	48	83.3
pod	8	40	20.0
total	658	903	72.8

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::MaxEntropy;
2
3							##---------------------------------------------------------------------------##
4							## Author:
5							## Hugo WL ter Doest terdoest@cs.utwente.nl
6							## Description:
7							## Object-oriented implementation of
8							## Generalised Iterative Scaling algorithm,
9							## Improved Iterative Scaling algorithm, and
10							## Feature Induction algorithm
11							## for inducing maximum entropy probability distributions
12							## Keywords:
13							## Maximum Entropy Modeling
14							## Kullback-Leibler Divergence
15							## Exponential models
16							##
17							##---------------------------------------------------------------------------##
18							## Copyright (C) 1998, 1999 Hugo WL ter Doest terdoest@cs.utwente.nl
19							##
20							## This library is free software; you can redistribute it and/or modify
21							## it under the terms of the GNU General Public License as published by
22							## the Free Software Foundation; either version 2 of the License, or
23							## (at your option) any later version.
24							##
25							## This library is distributed in the hope that it will be useful,
26							## but WITHOUT ANY WARRANTY; without even the implied warranty of
27							## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28							## GNU General Public License for more details.
29							##
30							## You should have received a copy of the GNU Library General Public
31							## License along with this program; if not, write to the Free Software
32							## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33							##---------------------------------------------------------------------------##
34
35
36							##---------------------------------------------------------------------------##
37							## Globals
38							##---------------------------------------------------------------------------##
39	3					408	use vars qw($VERSION
40							@ISA
41							@EXPORT
42							$VECTOR_PACKAGE
43
44							$debug
45							$SAMPLE_size
46							$NEWTON_max_it
47							$KL_max_it
48							$KL_min
49							$NEWTON_min
50							$cntrl_c_pressed
51							$cntrl_backslash_pressed
52	3			3		2027	);
	3					5
53
54
55							##---------------------------------------------------------------------------##
56							## Require libraries
57							##---------------------------------------------------------------------------##
58	3			3		15	use strict;
	3					6
	3					74
59	3			3		9048	use diagnostics -verbose;
	3					683256
	3					45
60	3			3		3384	use Statistics::SparseVector;
	3					8
	3					202
61							$VECTOR_PACKAGE = "Statistics::SparseVector";
62	3			3		2189	use POSIX;
	3					19543
	3					22
63	3			3		9449	use Carp;
	3					6
	3					152
64	3			3		2830	use Data::Dumper;
	3					36106
	3					20846
65							require Exporter;
66							require AutoLoader;
67
68							@ISA = qw(Exporter AutoLoader);
69							# Items to export into callers namespace by default. Note: do not export
70							# names by default without a very good reason. Use EXPORT_OK instead.
71							# Do not simply export all your public functions/methods/constants.
72							@EXPORT = qw($KL_min
73							$NEWTON_min
74							$debug
75							$nr_add
76							$KL_max_it
77							$NEWTON_max_it
78							$SAMPLE_size
79							);
80
81							$VERSION = '1.0';
82
83
84							# default values for some configurable parameters
85							$NEWTON_max_it = 20;
86							$NEWTON_min = 0.001;
87							$KL_max_it = 100;
88							$KL_min = 0.001;
89							$debug = 0;
90							$SAMPLE_size = 250; # the size of MC samples
91							# binary or integer feature functions
92
93							# for catching interrupts
94							$cntrl_c_pressed = 0;
95							$cntrl_backslash_pressed = 0;
96							$SIG{INT} = \&catch_cntrl_c;
97							$SIG{QUIT} = \&catch_cntrl_backslash;
98
99
100							# checks floats
101							sub is_float {
102	59812			59812	0	79971	my($f) = @_;
103
104	59812					542843	return ($f =~ /^([+-]?)(?=\d\|\.\d)\d(\.\d)?([Ee]([+-]?\d+))?$/);
105							}
106
107
108							# interrrupt routine for control c
109							sub catch_cntrl_c {
110	0			0	0	0	my($signame) = shift;
111
112	0					0	$cntrl_c_pressed = 1;
113	0					0	die " pressed\n";
114							}
115
116
117							# interrrupt routine for control \ (originally core-dump request)
118							sub catch_cntrl_backslash {
119	0			0	0	0	my($signame) = shift;
120
121	0					0	$cntrl_backslash_pressed = 1;
122							}
123
124
125							# creates a new event space
126							# depending on the $arg parameter samples it or reads it from a file
127							sub new {
128	37			37	1	223	my($this, $vectype, $filename) = @_;
129
130							# for calling $self->new($someth):
131	37		66			185	my $class = ref($this) \|\| $this;
132	37					92	my $self = {};
133	37					107	bless $self, $class;
134	37					142	$self->{SCALER} = "gis"; # default
135	37					134	$self->{SAMPLING} = "corpus"; # default
136	37					107	$self->{NR_CLASSES} = 0;
137	37					96	$self->{NR_EVENTS} = 0;
138	37					94	$self->{NR_FEATURES} = 0;
139	37					101	$self->{VECTYPE} = $vectype;
140	37	100				96	if ($filename) { # hey a filename
141	4					18	$self->read($filename);
142							}
143	37					232	$self->{FEATURE_IGNORE} = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
144	37					96	return($self);
145							}
146
147
148							# decides how to sample, "enum", "mc", or "corpus"
149							sub sample {
150	332			332	0	669	my($self) = @_;
151
152	332					382	my($sample);
153
154	332	100				1450	if ($self->{SAMPLING} eq "mc") {
		100
155	16					72	$sample = $self->new();
156	16					39	$sample->{VECTYPE} = "binary";
157	16					46	$sample->{SCALER} = $self->{SCALER};
158	16					45	$sample->{NR_FEATURES} = $self->{NR_FEATURES};
159							# refer to the parameters of $self
160	16					66	$sample->{PARAMETERS} = $self->{PARAMETERS};
161	16					49	$sample->{NEED_CORRECTION_FEATURE} = 1;
162	16					48	$sample->{CORRECTION_PARAMETER} = $self->{CORRECTION_PARAMETER};
163	16					49	$sample->{E_REF} = $self->{E_REF};
164	16					28	$sample->{THIS_IS_A_SAMPLE} = 1;
165	16					63	$sample->mc($self);
166	16					40	$sample->{CLASSES_CHANGED} = 1;
167	16					73	$sample->prepare_model();
168							}
169							elsif ($self->{SAMPLING} eq "enum") {
170	17					76	$sample = $self->new();
171	17					51	$sample->{SCALER} = $self->{SCALER};
172	17					39	$sample->{SAMPLING} = "enum";
173	17					41	$sample->{NR_FEATURES} = $self->{NR_FEATURES};
174	17					66	$sample->{PARAMETERS} = $self->{PARAMETERS};
175	17					41	$sample->{NEED_CORRECTION_FEATURE} = 1;
176	17					49	$sample->{CORRECTION_PARAMETER} = $self->{CORRECTION_PARAMETER};
177	17					41	$sample->{E_REF} = $self->{E_REF};
178	17					30	$sample->{THIS_IS_A_SAMPLE} = 1;
179	17					42	$sample->{M} = $self->{NR_FEATURES};
180							}
181							else { # "corpus"
182	299					411	$sample = $self;
183							}
184	332					1025	$sample->prepare_sample();
185	332					795	return($sample);
186							}
187
188
189							# makes sure that when prepare_model is called, everything is recomputed
190							sub clear {
191	14			14	0	153	my($self) = @_;
192
193	14					37	undef $self->{PARAMETERS_INITIALISED};
194	14					34	$self->{PARAMETERS_CHANGED} = 1;
195	14					50	$self->{CLASSES_CHANGED} = 1;
196							}
197
198
199
200							sub DESTROY {
201	343			343		712	my($self) = @_;
202
203	343	50				1727	if ($cntrl_c_pressed) {
204	0					0	$self->dump();
205							}
206							}
207
208
209							# reads an events file, dies in case of inconsistent lines
210							# syntax first line: .....
211							# syntax other lines, binary functions:
212							# syntax other lines, integer functions:
213							# an intvector is a space separated list of integers
214							sub read {
215	4			4	0	8	my($self, $file) = @_;
216
217	4					7	my($features,
218							$feature_names,
219							@cols);
220
221	4					8	$feature_names = "";
222	4	50				129	open(EVENTS, $file) \|\|
223							$self->die("Could not open $file\n");
224	4					12	print "Opened $file\n";
225
226							# read the names of the features, skip comment
227							# note that feature name are in reverse order now
228	4					8	do {
229	4					88	$feature_names = ;
230							} until ($feature_names !~ /\#.*/);
231	4					11	chomp $feature_names;
232	4					34	$self->{FEATURE_NAMES} = [split(/\t/, $feature_names)];
233	4					10	$self->{NR_FEATURES} = $#{$self->{FEATURE_NAMES}} + 1;
	4					13
234
235							# read the bitvectors
236	4					19	while () {
237	400	50				910	if (!/\#.*/) { # skip comments
238	400					442	chomp;
239
240	400	50				1223	if (/\s*(\d+)\s+(.+)/) {
241	400					922	$self->{FREQ}[$self->{NR_CLASSES}] = $1;
242	400					686	$features = $2;
243							}
244	400	50				820	if ($self->{FREQ} == 0) {
245	0					0	$self->die("Class $self->{NR_CLASSES} has zero probability\n");
246							}
247	400					659	$self->{NR_EVENTS} += $self->{FREQ}[$self->{NR_CLASSES}];
248							$self->{CLASSES}[$self->{NR_CLASSES}] =
249							$VECTOR_PACKAGE->new_vec($self->{NR_FEATURES},
250	400					1211	$features, $self->{VECTYPE});
251	400					1561	$self->{NR_CLASSES}++;
252							}
253							}
254	4					31	close(EVENTS);
255
256	4					28	print "Read $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
257							"and $self->{NR_FEATURES} features\n";
258	4					13	print "Closed $file\n";
259
260	4					7	$self->{FILENAME} = $file;
261	4					11	$self->{CLASSES_CHANGED} = 1;
262	4					11	$self->{PARAMETERS_CHANGED} = 1;
263							}
264
265
266							# reads an initial distribution
267							# syntax: one parameter per line
268							sub read_parameters {
269	0			0	0	0	my($self, $file) = @_;
270
271	0					0	my($i);
272
273	0					0	$i = 0;
274	0	0				0	open(DISTR,$file) \|\|
275							$self->die("Could not open $file\n");
276	0					0	print "Opened $file\n";
277
278	0					0	while () {
279	0	0				0	if (!/\#.*/) {
280	0					0	chomp;
281	0					0	$self->{PARAMETERS}[$i++] = $_;
282							}
283							}
284
285	0					0	close(DISTR);
286	0	0				0	if ($i != $self->{NR_FEATURES}) {
287	0					0	$self->die("Initial distribution file corrupt\n");
288							}
289	0					0	print "Read $i parameters; closed $file\n";
290	0					0	$self->{PARAMETERS_CHANGED} = 1;
291							}
292
293
294							# writes the the current parameters
295							# syntax:
296							sub write_parameters {
297	1			1	1	8	my($self, $file) = @_;
298
299	1					3	my($i);
300
301	1	50				80	open(DISTR,">$file") \|\|
302							$self->die("Could not open $file\n");
303	1					5	print "Opened $file\n";
304
305	1					26	for ($i = 0; $i < $self->{NR_FEATURES}; $i++) {
306	18	50				48	if ($self->{FEATURE_IGNORE}->bit_test($i)) {
307	0					0	print DISTR "IGNORED\n";
308							}
309							else {
310	18					99	print DISTR "$self->{PARAMETERS}[$i]\n";
311							}
312							}
313
314	1					28	close(DISTR);
315	1					6	print "Closed $file\n";
316							}
317
318
319							# writes the the current features with their parameters
320							# syntax first line: <$nr_features>
321							# syntax last line:
322							# syntax other lines:
323							sub write_parameters_with_names {
324	0			0	1	0	my($self, $file) = @_;
325
326	0					0	my($x,
327							$bitmask);
328
329	0	0				0	open(DISTR,">$file") \|\|
330							$self->die("Could not open $file\n");
331	0					0	print "Opened $file\n";
332
333							# preamble
334	0					0	print DISTR "$self->{NR_FEATURES}\n";
335	0					0	print DISTR "$self->{SCALER}\n";
336	0	0				0	if ($self->{SCALER} eq "gis") {
337	0					0	print DISTR "$self->{M}\n";
338	0					0	print DISTR "$self->{CORRECTION_PARAMETER}\n";
339							}
340
341							# print feature names with parameters
342							# in the meanwhile build the bitmask
343	0					0	$bitmask = "";
344	0					0	for ($x = 0; $x < $self->{NR_FEATURES}; $x++) {
345	0					0	print DISTR "$self->{FEATURE_NAMES}[$x]\t$self->{PARAMETERS}[$x]\n";
346	0	0				0	if ($self->{FEATURE_IGNORE}->bit_test($x)) {
347	0					0	$bitmask = "0" . $bitmask;
348							}
349							else {
350	0					0	$bitmask = "1" . $bitmask;
351							}
352							}
353	0					0	print DISTR "$bitmask\n";
354
355	0					0	close(DISTR);
356	0					0	print "Closed $file\n";
357							}
358
359
360							# generate random parameters
361							sub random_parameters {
362	2			2	0	3	my($self) = @_;
363
364	2					9	my($f);
365
366	2					107	srand();
367	2					11	for ($f = 0; $f < $self->{NR_FEATURES}; $f++) {
368	20					54	$self->{PARAMETERS}[$f] = rand() + 1;
369							}
370	2	100				8	if ($self->{SCALER} eq "gis") {
371	1					5	$self->{CORRECTION_PARAMETER} = rand();
372							}
373	2					5	$self->{PARAMETERS_CHANGED} = 1;
374							}
375
376
377							# sets parameters to $val
378							sub set_parameters_to {
379	12			12	0	19	my($self, $val) = @_;
380
381	12					18	my($f);
382
383	12					66	for ($f = 0; $f < $self->{NR_FEATURES}; $f++) {
384	132					332	$self->{PARAMETERS}[$f] = $val;
385							}
386	12	100				47	if ($self->{SCALER} eq "gis") {
387	6					17	$self->{CORRECTION_PARAMETER} = $val;
388							}
389	12					36	$self->{PARAMETERS_CHANGED} = 1;
390							}
391
392
393							# initialise if !$self->{PARAMETERS_INITIALISED}; subsequent calls
394							# of scale (by fi) should not re-initialise parameters
395							sub init_parameters {
396	22			22	0	40	my($self) = @_;
397
398	22	100				80	if (!$self->{PARAMETERS_INITIALISED}) {
399	14	100				50	if ($self->{SAMPLING} eq "mc") {
400							# otherwise bits will be flipped with prob 1.
401	2					19	$self->random_parameters();
402							}
403							else {
404	12	100				52	if ($self->{SCALER} eq "gis") {
405	6					27	$self->set_parameters_to(0);
406							}
407							else {
408	6					32	$self->set_parameters_to(0);
409							}
410							}
411	14					30	$self->{PARAMETERS_INITIALISED} = 1;
412							}
413							}
414
415
416							# make sure \tilde{p} << q_0
417							# constant feature functions are forbidden: that is why
418							# we check whether for all features \sum_x f(x) > 0
419							# and \sum_x f(x) != $corpus_size
420							sub check {
421	420			420	0	619	my($self) = @_;
422
423	420					461	my ($x);
424
425	420					1291	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
426	41984	50				126301	if ($self->{CLASS_EXP_WEIGHTS}[$x] == 0) {
427	0					0	print "Initial distribution not ok; class $x\n";
428	0					0	print $self->{CLASS_EXP_WEIGHTS}[$x], "\t", $self->{CLASSES}[$x]->to_Bin(' '),"\n";
429							}
430							}
431							}
432
433
434							# writes events to a file
435							# usefull in case new features have been added
436							# syntax: same as input events file
437							sub write {
438	1			1	1	12	my($self, $file) = @_;
439
440	1					2	my($x, $f);
441
442							# prologue
443	1	50				157	open(EVENTS,">$file") \|\|
444							$self->die("Could not open $file\n");
445	1					4	print "Opened $file\n";
446
447							# write a line with the feature names
448	1					2	print EVENTS join("\t", @{$self->{FEATURE_NAMES}}), "\n";
	1					251
449							# write the events themselves
450	1					7	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
451	100					229	print EVENTS $self->{FREQ}[$x],"\t";
452	100					254	print EVENTS $self->{CLASSES}[$x]->to_Bin(' '), "\n";
453							}
454
455							# close the file and tell you did that
456	1					43	close EVENTS;
457	1					10	print "Wrote $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
458							"and $self->{NR_FEATURES} features\n";
459	1					6	print "Closed $file\n";
460							}
461
462
463							# reads a dump, and evaluates it into an object
464							sub undump {
465	0			0	1	0	my($class, $file) = @_;
466
467	0					0	my($x,
468							$VAR1);
469
470							# open, slurp, and close file
471	0	0				0	open(UNDUMP, "$file") \|\|
472							croak "Could not open $file\n";
473	0					0	print "Opened $file\n";
474	0					0	undef $/;
475	0					0	$x = ;
476	0					0	$/ = "\n";
477	0					0	close(UNDUMP);
478
479							# and undump
480	0					0	eval $x;
481	0					0	print "Undumped $VAR1->{NR_EVENTS} events, $VAR1->{NR_CLASSES} classes, " .
482							"and $VAR1->{NR_FEATURES} features\n";
483	0					0	print "Closed $file\n";
484	0					0	return($VAR1);
485							}
486
487
488							# makes dump of the event space using Data::Dumper
489							sub dump {
490	3			3	1	39	my($self, $file) = @_;
491
492	3					5	my(@bitvecs,
493							$dump,
494							%features,
495							$f);
496
497	3	50				13	if (!$file) {
498	0					0	$file = POSIX::tmpnam();
499							}
500	3	50				665	open(DUMP, ">$file") \|\|
501							croak "Could not open $file\n";
502	3					22	print "Opened $file\n";
503
504							# build something that we can sort
505							# ONLY FOR CORPUS!
506	3	50	33			34	if (!$self->{THIS_IS_A_SAMPLE} && $self->{PARAMETERS}) {
507	3					13	for ($f = 0; $f < $self->{NR_FEATURES}; $f++) {
508							$features{$self->{FEATURE_NAMES}[$f]} =
509	38					134	$self->{PARAMETERS}[$f];
510							}
511	3	100	66			28	if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
512							$features{"correction$self->{M}"} =
513	2					8	$self->{CORRECTION_PARAMETER};
514							}
515							# and print it into $self
516							$self->{FEATURE_SORTED} = join(' > ',
517							sort {
518	3	100				29	if ($features{$b} == $features{$a}) {
	89					201
519	1					3	return($b cmp $a)}
520							else {
521	88					138	return ($features{$b} <=> $features{$a})
522							}
523							}
524							keys(%features));
525							}
526
527	3					46	$dump = Data::Dumper->new([$self]);
528	3					159	print DUMP $dump->Dump();
529	3					20840	print "Dumped $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
530							"and $self->{NR_FEATURES} features\n";
531
532	3					129	close(DUMP);
533	3					1666	print "Closed $file\n";
534							}
535
536
537							# $msg is logged, the time is logged, a dump is created, and the
538							# program dies with $msg
539							sub die {
540	0			0	0	0	my($self, $msg) = @_;
541
542	0					0	$self->log_msg($msg);
543	0					0	$self->log_msg(time());
544	0					0	$self->dump();
545	0					0	croak $msg;
546							}
547
548
549							# prints a msg to STDOUT, and appends it to $self->{LOG}
550							# so an emergency dump will contain some history information
551							sub log_msg {
552	532			532	0	1142	my($self, $x) = @_;
553
554	532					3611	$self->{LOG} .= $x;
555	532					37785	print $x;
556							}
557
558
559							# computes f_# for alle events; results in @sample_nr_feats_on
560							# computes %$sample_m_feats_on; a HOL from m
561							sub active_features {
562	428			428	0	573	my($self) = @_;
563
564	428					580	my($i,
565							$j,
566							$sum);
567
568	428	100				1124	if ($self->{CLASSES_CHANGED}) {
569							# check for constant features
570	78					255	for ($i = 0; $i < $self->{NR_FEATURES}; $i++) {
571	984					1020	$sum = 0;
572	984					2333	for ($j = 0; $j < $self->{NR_CLASSES}; $j++) {
573	98240					248178	$sum += $self->{CLASSES}[$j]->bit_test($i);
574							}
575	984	50	33			5406	if (!$sum \|\| ($sum == $self->{NR_CLASSES})) {
576	0					0	print "Feature ", $i + 1, " is constant ($sum), and will be ignored\n";
577	0					0	$self->{FEATURE_IGNORE}->Bit_On($i);
578							}
579							}
580							# M is needed for both gis and iis
581							# NEED_CORRECTION_FEATURE is for gis only
582	78					186	$self->{M} = 0;
583	78					132	$self->{NEED_CORRECTION_FEATURE} = 0;
584	78					252	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
585	7784	100				19292	if ($self->{CLASSES}[$i]->Norm() > $self->{M}) {
586							# higher nr_features_active found
587	256					697	$self->{M} = $self->{CLASSES}[$i]->Norm();
588	256					675	$self->{NEED_CORRECTION_FEATURE} = 1;
589							}
590							}
591	78	50				182	if ($debug) {
592	0					0	print "M = $self->{M}\n";
593							}
594							# set up a hash from m to classes HOL; and the correction_feature
595							# CORRECTION_FEATURE FOR gis
596	78					152	undef $self->{M_FEATURES_ACTIVE};
597	78					232	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
598	7784	100				19011	if ($self->{SCALER} eq "gis") {
599							$self->{CORRECTION_FEATURE}[$i] =
600	3595					9091	$self->{M} - $self->{CLASSES}[$i]->Norm();
601							}
602							}
603	78	50				224	if ($debug) {
604	0					0	print "M = $self->{M}\n";
605							}
606							# observed feature expectations
607	78	100				196	if (!$self->{THIS_IS_A_SAMPLE}) {
608	62					218	$self->E_reference();
609							}
610	78					228	undef $self->{CLASSES_CHANGED};
611							}
612							}
613
614
615							# compute the class probabilities according to the parameters
616							sub prepare_model {
617	428			428	0	604	my($self) = @_;
618
619	428					546	my ($x,
620							$f);
621
622	428					1366	$self->active_features();
623	428	100				1074	if ($self->{PARAMETERS_CHANGED}) {
624	420					653	$self->{Z} = 0;
625	420					1214	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
626	41984					56577	$self->{CLASS_LOG_WEIGHTS}[$x] = 0;
627	41984					120520	for $f ($self->{CLASSES}[$x]->indices()) {
628							$self->{CLASS_LOG_WEIGHTS}[$x] += $self->{PARAMETERS}[$f] *
629	91446					290594	$self->{CLASSES}[$x]->weight($f);
630	91446	50				222340	if ($f >= $self->{NR_FEATURES}) {
631	0					0	print "alarm: wrong index: $f\n";
632							}
633							}
634	41984	100	66			200255	if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
635							$self->{CLASS_LOG_WEIGHTS}[$x] += $self->{CORRECTION_FEATURE}[$x] *
636	27695					53323	$self->{CORRECTION_PARAMETER};
637							}
638	41984					76949	$self->{CLASS_EXP_WEIGHTS}[$x] = exp($self->{CLASS_LOG_WEIGHTS}[$x]);
639	41984					115649	$self->{Z} += $self->{CLASS_EXP_WEIGHTS}[$x];
640							}
641							print "prepare_model: \$Z is not a number: $self->{Z}\n"
642	420	50				1237	unless is_float($self->{Z});
643
644	420	100				1175	if (!$self->{THIS_IS_A_SAMPLE}) {
645	404					1045	$self->entropies();
646							}
647	420					1286	$self->check();
648	420					1199	undef $self->{PARAMETERS_CHANGED};
649							}
650							}
651
652
653							sub prepare_sample {
654	332			332	0	399	my($self) = @_;
655
656							# expectations
657	332	100				706	if ($self->{SCALER} eq "gis") {
658	236					756	$self->E_loglinear();
659							}
660							else {
661							# A_{mj}
662	96					256	$self->A();
663							}
664							}
665
666
667							# feature expectations for the MaxEnt distribution
668							sub E_loglinear {
669	236			236	0	327	my($self) = @_;
670
671	236					302	my($x,
672							$f,
673							$vec,
674							$weight,
675							$Z);
676
677	236					578	undef $self->{E_LOGLIN};
678	236	100				959	if ($self->{SAMPLING} eq "enum") {
679	9					37	$vec = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
680	9					22	$self->{Z} = 0;
681	9					39	for ($x = 0; $x < 2 ** $self->{NR_FEATURES}; $x++) {
682	18432					32991	$weight = $self->weight($vec);
683	18432					48064	for $f ($vec->indices()) {
684	103424					280939	$self->{E_LOGLIN}[$f] += $weight * $vec->weight($f);
685							}
686							$self->{E_LOGLIN}[$self->{NR_FEATURES}] += $weight *
687	18432					65901	($self->{M} - $vec->Norm());
688	18432					25238	$self->{Z} += $weight;
689	18432					45217	$vec->increment();
690							}
691	9					40	for $f (0..$self->{NR_FEATURES}) {
692	106					227	$self->{E_LOGLIN}[$f] /= $self->{Z};
693							}
694							}
695							else { # either corpus or mc sample
696	227					854	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
697	22695					59286	for $f ($self->{CLASSES}[$x]->indices()) {
698							$self->{E_LOGLIN}[$f] += $self->{CLASS_EXP_WEIGHTS}[$x] *
699	48374					146590	$self->{CLASSES}[$x]->weight($f);
700							}
701	22695	50				58843	if ($self->{NEED_CORRECTION_FEATURE}) {
702							$self->{E_LOGLIN}[$self->{NR_FEATURES}] +=
703							$self->{CLASS_EXP_WEIGHTS}[$x] *
704	22695					71600	($self->{M} - $self->{CLASSES}[$x]->Norm());
705							}
706							}
707	227					561	for $f (0..$self->{NR_FEATURES}) {
708	2671					4020	$self->{E_LOGLIN}[$f] /= $self->{Z};
709							}
710							}
711							}
712
713
714							# observed feature expectations
715							sub E_reference {
716	62			62	0	101	my($self) = @_;
717
718	62					116	my($x,
719							$f,
720							@sum);
721
722	62					246	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
723	6200					15334	for $f ($self->{CLASSES}[$x]->indices()) {
724	11765					33716	$sum[$f] += $self->{FREQ}[$x] * $self->{CLASSES}[$x]->weight($f);
725							}
726	6200	100				21104	if ($self->{SCALER} eq "gis") {
727							$sum[$self->{NR_FEATURES}] += $self->{CORRECTION_FEATURE}[$x] *
728	3100					9133	$self->{FREQ}[$x];
729							}
730							}
731	62					174	for $f (0..$self->{NR_FEATURES}) {
732	886	100				1565	if ($sum[$f]) {
733	855					1550	$self->{E_REF}[$f] = $sum[$f] / $self->{NR_EVENTS};
734							}
735							}
736							}
737
738
739							# compute several entropies
740							sub entropies {
741	404			404	0	565	my($self) = @_;
742
743	404					549	my ($i,
744							$w,
745							$log_w,
746							$w_ref,
747							$log_w_ref);
748
749	404					609	$self->{H_p} = 0;
750	404					540	$self->{H_cross} = 0;
751	404					509	$self->{H_p_ref} = 0;
752	404					502	$self->{KL} = 0;
753	404					1134	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
754	40400					53151	$w = $self->{CLASS_EXP_WEIGHTS}[$i];
755							# we don't know whether $p > 0
756	40400					52602	$log_w = $self->{CLASS_LOG_WEIGHTS}[$i];
757	40400					57017	$w_ref = $self->{FREQ}[$i];
758							# we know that $p_ref > 0
759	40400					52474	$log_w_ref = log($w_ref);
760							# update the sums
761	40400					54828	$self->{H_p} -= $w * $log_w;
762	40400					50657	$self->{H_cross} -= $w_ref * $log_w;
763	40400					53466	$self->{KL} += $w_ref * ($log_w_ref - $log_w);
764	40400					53250	$self->{H_p_ref} -= $w_ref * $log_w_ref;
765	40400	50				121445	if ($w == 0) {
766	0					0	$self->log_msg("entropies: skipping event $i (p^n($i) = 0)\n");
767							}
768							}
769							# normalise
770	404					1026	$self->{H_p} = $self->{H_p} / $self->{Z} + log($self->{Z});
771	404					882	$self->{H_cross} = $self->{H_cross} / $self->{NR_EVENTS} + log($self->{Z});
772							$self->{KL} = $self->{KL} / $self->{NR_EVENTS} - log($self->{NR_EVENTS}) +
773	404					958	log($self->{Z});
774	404					800	$self->{H_p_ref} = $self->{H_p_ref} / $self->{NR_EVENTS} + log($self->{NR_EVENTS});
775	404					1361	$self->{L} = -$self->{H_cross};
776							}
777
778
779							# unnormalised p(x,y)
780							# $x is required, $y is optional
781							# $x->Size()+$y->Size() == $self->{NR_FEATURES}
782							sub weight {
783	77824			77824	0	129052	my($self, $x, $y) = @_;
784
785	77824					80687	my ($f,
786							$sum,
787							$norm);
788
789	77824					89352	$sum = 0;
790	77824					186595	for $f ($x->indices()) {
791	498688	50				1346787	if (!$self->{FEATURE_IGNORE}->bit_test($f)) {
792	498688					1454974	$sum += $self->{PARAMETERS}[$f] * $x->weight($f);
793	498688	50				1268936	if ($debug) {
794	0					0	print "Current weight: $sum, current feature: $f\n";
795							}
796							}
797							}
798	77824					253987	$norm = $x->Norm();
799							# if $y is defined,
800							# then $x->Size()+$y->Size() == $self->{NR_FEATURES} should hold:
801	77824	50	33			185428	if (defined($y) && (($x->Size() + $y->Size()) == $self->{NR_FEATURES})) {
802	0					0	for $f ($y->indices()) {
803	0	0				0	if (!$self->{FEATURE_IGNORE}->bit_test($f + $x->Size())) {
804	0					0	$sum += $self->{PARAMETERS}[$f + $x->Size()] *
805							$y->weight($f);
806	0	0				0	if ($debug) {
807	0					0	print "Current weight: $sum, current feature: $f\n";
808							}
809							}
810							}
811	0					0	$norm += $y->Norm();
812							}
813	77824	100	66			357723	if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
814	18432					32979	$sum += ($self->{M} - $norm) * $self->{CORRECTION_PARAMETER};
815							}
816	77824					168315	return(exp($sum));
817							}
818
819
820							# computes the `a' coefficients of
821							# \sum_{m=0}^{M} a_{m,j}^{(n)} e^{\alpha^{(n)}_j m}
822							# according to the current distribution
823							sub A {
824	96			96	0	123	my($self) = @_;
825
826	96					108	my($f,
827							$m,
828							$x,
829							$weight,
830							$vec,
831							$class);
832
833	96					937	undef $self->{A};
834	96					566	undef $self->{C};
835	96	100				213	if ($self->{SAMPLING} eq "enum") {
836	8					27	undef $self->{Z};
837	8					29	$vec = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
838	8					33	for ($x = 0; $x < 2 ** $self->{NR_FEATURES}; $x++) {
839	59392					121944	$weight = $self->weight($vec);
840	59392					159868	for $f ($vec->indices()) {
841	395264					1054831	$self->{A}{$vec->Norm()}{$f} += $weight * $vec->weight($f);
842	395264					1113796	$self->{C}{$vec->Norm()}{$f} += $vec->weight($f);
843							}
844	59392					138935	$self->{Z} += $weight;
845	59392	50				112051	print "Z = $self->{Z}" unless is_float($self->{Z});
846	59392					184110	$vec->increment();
847							}
848							}
849							else { # mc or corpus
850	88					276	for ($class = 0; $class < $self->{NR_CLASSES}; $class++) {
851	8789					25068	for $f ($self->{CLASSES}[$class]->indices()) {
852							$self->{A}{$self->{CLASSES}[$class]->Norm()}{$f} +=
853							$self->{CLASS_EXP_WEIGHTS}[$class] *
854	22878					66984	$self->{CLASSES}[$class]->weight($f);
855							$self->{C}{$self->{CLASSES}[$class]->Norm()}{$f} +=
856	22878					67247	$self->{CLASSES}[$class]->weight($f);
857							}
858							}
859							}
860							}
861
862
863							#
864							# Monte Carlo sampling with the Metropolis update
865							#
866
867							# returns heads up with probability $load
868							sub loaded_die {
869	4795			4795	0	5679	my($load) = @_;
870
871	4795	100				12232	(rand() <= $load) ? 1 : 0;
872							}
873
874
875							# samples from the probability distribution of $other to create $self
876							# we use the so-called Metropolis update R = h(new)/h(old)
877							# Metropolis algorithm \cite{neal:probabilistic}
878							sub mc {
879	16			16	0	30	my($self, $other, $type) = @_;
880
881	16					49	my($R,
882							$weight,
883							$state,
884							$old_weight,
885							$k,
886							%events
887							);
888
889	16					827	srand();
890							# take some class from the sample space as initial state
891	16					67	$state = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
892							# make sure there are no constant features!
893	16					79	$state->Fill();
894	16					79	$events{$state->to_Bin(' ')}++;
895	16					89	$state->Empty();
896	16					23	$weight = 0;
897							# iterate
898	16					20	$k = 0;
899
900							do {
901	4795					5862	$old_weight = $weight;
902	4795	100				11803	if ($state->bit_flip($k)) {
903	1205					2220	$weight += $self->{PARAMETERS}[$k];
904							}
905							else {
906	3590					5517	$weight -= $self->{PARAMETERS}[$k];
907							}
908	4795					6655	$R = exp($weight - $old_weight);
909	4795	100				12107	if (!loaded_die(1 < $R ? 1 : $R)) { # stay at the old state
		100
910	2511					5821	$state->bit_flip($k);
911	2511					3137	$weight = $old_weight;
912							}
913							else { # add state
914	2284					6085	$events{$state->to_Bin(' ')}++;
915							}
916	4795	50				10245	if ($debug) {
917	0					0	print $state->to_Bin(' '),"\t",scalar(keys(%events)),"\t$R\n";
918							}
919							# next component
920	4795					24514	$k = ($k + 1) % $self->{NR_FEATURES};
921							} until ((scalar(keys(%events)) == $SAMPLE_size) \|\|
922	16		66			24	(scalar(keys(%events)) == 2 ** $self->{NR_FEATURES}));
923
924	16					263	for (keys(%events)) {
925	1600					6703	push @{$self->{CLASSES}},
926	1600					1851	$VECTOR_PACKAGE->new_vec($self->{NR_FEATURES}, $_, $self->{VECTYPE});
927							}
928	16					139	$self->{NR_CLASSES} = scalar(keys(%events)) - 1;
929
930	16					49	$self->{CLASSES_CHANGED} = 1;
931	16					459	$self->{PARAMETERS_CHANGED} = 1;
932							}
933
934
935							#
936							# IIS
937							#
938
939							# Newton estimation according to (Abney 1997), Appendix B
940							sub C_func {
941	0			0	0	0	my($self, $j, $x) = @_;
942
943	0					0	my($m,
944							$s0,
945							$s1,
946							$a_x_m);
947
948	0					0	$s0 = - $self->{NR_EVENTS} * $self->{E_REF}[$j];
949	0					0	$s1 = 0;
950	0					0	for ($m = 1; $m <= $self->{M}; $m++) {
951	0	0				0	if ($self->{"C"}{$m}{$j}) {
952	0					0	$a_x_m = $self->{"C"}{$m}{$j} * exp($x * $m);
953	0					0	$s0 += $a_x_m;
954	0					0	$s1 += $m * $a_x_m;
955							}
956							}
957	0	0				0	print "sum_func not a number: $s0\n"
958							unless is_float($s0);
959	0	0				0	print "sum_deriv not a number: $s1\n"
960							unless is_float($s1);
961
962	0	0				0	if ($s1 == 0) {
963	0					0	return(0);
964							}
965							else {
966	0					0	return($s0 / $s1);
967							}
968							}
969
970
971							# Newton estimation according to (Della Pietra et al. 1997)
972							sub A_func {
973	3538			3538	0	7401	my($self, $j, $x) = @_;
974
975	3538					3826	my($m,
976							$sum_func,
977							$sum_deriv,
978							$a_x_m);
979
980	3538					4987	$sum_func = -$self->{E_REF}[$j] * $self->{Z};
981	3538					3432	$sum_deriv = 0;
982	3538					7952	for ($m = 1; $m <= $self->{M}; $m++) {
983	45279	100				126808	if ($self->{"A"}{$m}{$j}) {
984	21560					41114	$a_x_m = $self->{"A"}{$m}{$j} * exp($x * $m);
985	21560					23045	$sum_func += $a_x_m;
986	21560					50776	$sum_deriv += $m * $a_x_m;
987							}
988							}
989	3538	50				5733	if ($sum_deriv == 0) {
990	0					0	return(0);
991							}
992							else {
993	3538					17901	return($sum_func / $sum_deriv);
994							}
995							}
996
997
998							# solves \alpha from
999							# \sum_{m=0}^{M} a_{m,j}^{(n)} e^{\alpha^{(n)}_j m}=0
1000							sub iis_estimate_with_newton {
1001	1103			1103	0	1399	my($self, $i) = @_;
1002
1003	1103					1153	my($x,
1004							$old_x,
1005							$deriv_res,
1006							$func_res,
1007							$k);
1008
1009							# $x = log(0)
1010	1103					1136	$x = 0;
1011	1103					1118	$k = 0;
1012
1013							# do newton's method
1014	1103		66			1210	do {
1015							# save old x
1016	3538					4086	$old_x = $x;
1017							# compute new x
1018	3538	100				6560	if ($self->{SAMPLING} eq "enum") {
1019							# (DDL 1997)
1020	504					912	$x -= $self->A_func($i, $x);
1021							}
1022							else {
1023							# sample -> (Abney 1997)
1024	3034					6113	$x -= $self->A_func($i, $x);
1025							}
1026							} until ((abs($x - $old_x) <= $NEWTON_min) \|\|
1027							($k++ > $NEWTON_max_it));
1028	1103	50				2642	if ($debug) {
1029	0					0	print "Estimated gamma_$i with Newton's method: $x\n";
1030							}
1031	1103					3008	return($x);
1032							}
1033
1034
1035							# updates parameter $i
1036							sub gamma {
1037	332			332	0	497	my($self, $sample) = @_;
1038
1039	332					437	my($f);
1040
1041	332					865	for $f (0..$self->{NR_FEATURES} - 1) {
1042	3644	50				9846	if (!$self->{FEATURE_IGNORE}->bit_test($f)) {
1043	3644	100				6768	if ($self->{SCALER} eq "gis") {
1044							$self->{PARAMETERS}[$f] +=
1045	2541					7796	log($self->{E_REF}[$f] / $sample->{E_LOGLIN}[$f]) / $sample->{M};
1046							}
1047							else {
1048	1103					2405	$self->{PARAMETERS}[$f] +=
1049							$sample->iis_estimate_with_newton($f);
1050							}
1051							}
1052							}
1053
1054	332	50	66			1409	if (($self->{SCALER} eq "gis") && ($self->{NEED_CORRECTION_FEATURE})) {
1055							$self->{CORRECTION_PARAMETER} +=
1056							log($self->{E_REF}[$self->{NR_FEATURES}] /
1057	236					801	$sample->{E_LOGLIN}[$self->{NR_FEATURES}]) / $self->{M};
1058							}
1059							}
1060
1061
1062							# the iterative scaling algorithms
1063							sub scale {
1064	22			22	1	94	my($self, $sampling, $scaler) = @_;
1065
1066	22					40	my($k,
1067							$i,
1068							$kl,
1069							$old_kl,
1070							$diff,
1071							$sample,
1072							$old_correction_parameter,
1073							@old_parameters);
1074
1075	22	100				51	if ($sampling) {
1076	10					26	$self->{SAMPLING} = $sampling;
1077							}
1078	22	100				69	if ($scaler) {
1079	10					25	$self->{SCALER} = $scaler;
1080							}
1081	22	50	66			119	if (($self->{SAMPLING} eq "enum") && ($self->{VECTYPE} eq "integer")) {
1082	0					0	$self->die("Cannot enumerate integer vectors\n");
1083							}
1084	22	50	66			90	if (($self->{SAMPLING} eq "mc") && ($self->{VECTYPE} eq "integer")) {
1085	0					0	$self->die("Cannot sample from integer vector space\n");
1086							}
1087
1088	22					91	$self->init_parameters();
1089	22					81	$self->prepare_model();
1090	22					429	$self->log_msg("scale($self->{SCALER}, $self->{SAMPLING}, $self->{VECTYPE}): H(p_ref)=$self->{H_p_ref}\nit.\tD(p_ref\|\|p)\t\tH(p)\t\t\tL(p_ref,p)\t\ttime\n0\t$self->{KL}\t$self->{H_p}\t$self->{L}\t" . time() . "\n");
1091	22					35	$k = 0;
1092	22					43	$kl = 1e99;
1093	22		66			30	do {
			66
1094							# store parameters for reverting if converging stops
1095	332					436	@old_parameters = @{$self->{PARAMETERS}};
	332					2038
1096	332					650	$old_correction_parameter = $self->{CORRECTION_PARAMETER};
1097	332	100				720	if ($sample) {
1098	310					1018	$sample->DESTROY();
1099							}
1100	332					836	$sample = $self->sample();
1101	332					1027	$self->gamma($sample);
1102	332					582	$self->{PARAMETERS_CHANGED} = 1;
1103	332					788	$self->prepare_model();
1104	332					609	$diff = $kl - $self->{KL};
1105	332					498	$kl = $self->{KL};
1106
1107	332					414	$k++;
1108	332					3922	$self->log_msg("$k\t$self->{KL}\t$self->{H_p}\t$self->{L}\t" . time() . "\n");
1109	332	50				866	if ($debug) {
1110	0					0	$self->check();
1111							}
1112	332	100				726	if ($diff < 0) {
1113	10					31	$self->log_msg("Scaling is not converging (anymore); will revert parameters!\n");
1114							# restore old parameters
1115	10					35	$self->{PARAMETERS} = \@old_parameters;
1116	10					26	$self->{CORRECTION_PARAMETER} = $old_correction_parameter;
1117	10					18	$self->{PARAMETERS_CHANGED} = 1;
1118	10					27	$self->prepare_model();
1119							}
1120	332	50				9036	if ($cntrl_backslash_pressed) {
1121	0					0	$self->dump();
1122	0					0	$cntrl_backslash_pressed = 0;
1123							}
1124							} until ($diff <= $KL_min \|\| ($k > $KL_max_it) \|\| ($diff < 0));
1125							}
1126
1127
1128							#
1129							# Field Induction Algorithm
1130							#
1131
1132							# add feature $g to $self
1133							sub add_candidate {
1134	28			28	0	45	my($self, $candidates, $g) = @_;
1135
1136	28					39	my($i);
1137
1138	28					57	$self->{NR_FEATURES}++;
1139	28					102	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
1140							$self->{CLASSES}[$i]->insert_column($g,
1141	2800					7938	$candidates->{CANDIDATES}[$i]->weight($g));
1142							}
1143	28	100				90	if ($self->{SCALER} eq "gis") {
1144	14					44	$self->{PARAMETERS}[$self->{NR_FEATURES} - 1] = 1;
1145							}
1146							else {
1147	14					56	$self->{PARAMETERS}[$self->{NR_FEATURES} - 1] = $candidates->{ALPHA}[$g];
1148							}
1149	28					38	push @{$self->{FEATURE_NAMES}}, $candidates->{CANDIDATE_NAMES}[$g];
	28					134
1150	28					45	$self->{PARAMETERS_CHANGED} = 1;
1151	28					45	$self->{CLASSES_CHANGED} = 1;
1152	28					74	$self->prepare_model();
1153							}
1154
1155
1156							# remove the last column
1157							sub remove_candidate {
1158	20			20	0	138	my($self) = @_;
1159
1160	20					27	my($i);
1161
1162	20					61	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
1163							# substitute offset $g length 1 by nothing
1164	2000					5046	$self->{CLASSES}[$i]->delete_column($self->{NR_FEATURES}-1);
1165							}
1166	20					31	pop @{$self->{PARAMETERS}};
	20					43
1167	20					32	pop @{$self->{FEATURE_NAMES}};
	20					54
1168	20					45	$self->{NR_FEATURES}--;
1169	20					36	$self->{PARAMETERS_CHANGED} = 1;
1170	20					30	$self->{CLASSES_CHANGED} = 1;
1171	20					52	$self->prepare_model();
1172							}
1173
1174
1175							# checks for $event, if not there adds it, otherwise increases its {FREQ}
1176							sub add_event {
1177	0			0	0	0	my($self, $event) = @_;
1178
1179	0					0	my($i,
1180							$found);
1181
1182	0					0	$found = 0;
1183	0					0	for ($i = 0; $i < $self->{NR_CLASSES}; $i++) {
1184	0					0	$found = ($event->Compare($self->{CLASSES}[$i]) == 0);
1185	0	0				0	if ($found) {
1186	0					0	$self->{FREQ}[$i]++;
1187	0					0	last;
1188							}
1189							}
1190	0	0				0	if (!$found) {
1191	0					0	$self->{CLASSES}[$self->{NR_CLASSES}] = $event;
1192	0					0	$self->{FREQ}[$self->{NR_CLASSES}] = 1;
1193	0					0	$self->{NR_CLASSES}++;
1194							}
1195	0					0	$self->{NR_EVENTS}++;
1196							}
1197
1198
1199							# computes the gain for all $candidates
1200							sub gain {
1201	8			8	0	149	my($self, $candidates) = @_;
1202
1203	8					14	my($c,
1204							$x,
1205							$kl,
1206							$below,
1207							$above,
1208							$sum_p_ref,
1209							$sum_p);
1210
1211	8					27	$candidates->{MAX_GAIN} = 0;
1212	8					20	$candidates->{BEST_CAND} = 0;
1213	8					39	for ($c = 0; $c < $candidates->{NR_CANDIDATES}; $c++) {
1214	24	100				89	if (!$candidates->{ADDED}{$c}) {
1215	20					29	$sum_p_ref = 0;
1216	20					30	$sum_p = 0;
1217	20					60	for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
1218	2000	100				5854	if ($candidates->{CANDIDATES}[$x]->bit_test($c)) {
1219	238					324	$sum_p += $self->{CLASS_EXP_WEIGHTS}[$x];
1220	238					630	$sum_p_ref += $self->{FREQ}[$x];
1221							}
1222							}
1223	20					33	$sum_p /= $self->{Z};
1224	20					35	$sum_p_ref /= $self->{NR_EVENTS};
1225	20					37	$above = $sum_p_ref * (1 - $sum_p);
1226	20					223	$below = $sum_p * (1 - $sum_p_ref);
1227	20	50				68	if ($above * $below > 0) {
1228	20					75	$candidates->{ALPHA}[$c] = log($above / $below);
1229							}
1230							else {
1231	0					0	$self->die("Cannot take log of negative/zero value: $above / $below\n");
1232							}
1233							# temporarily add feature to classes and compute $gain
1234	20					37	$kl = $self->{KL};
1235	20					101	$self->add_candidate($candidates, $c);
1236	20					84	$candidates->{GAIN}[$c] = $kl - $self->{KL};
1237	20					254	$self->log_msg("G($c, $candidates->{ALPHA}[$c]) = $candidates->{GAIN}[$c]\n");
1238	20	100				79	if (($candidates->{MAX_GAIN} <= $candidates->{GAIN}[$c])) {
1239	5					14	$candidates->{MAX_GAIN} = $candidates->{GAIN}[$c];
1240	5					11	$candidates->{BEST_CAND} = $c;
1241							}
1242							# remove the feature
1243	20					73	$self->remove_candidate();
1244							}
1245							}
1246							}
1247
1248
1249							# adds the $n best candidates
1250							sub fi {
1251	4			4	1	27	my($self, $scaler, $candidates, $n, $sample) = @_;
1252
1253	4					5	my ($i,
1254							$kl);
1255
1256	4					29	$self->log_msg("fi($scaler, $sample, $n, $self->{VECTYPE})\n");
1257	4	50				13	if ($scaler) {
1258	4					48	$self->{SCALER} = $scaler;
1259							}
1260	4	50				14	if ($sample) {
1261	4					12	$self->{SAMPLING} = $sample;
1262							}
1263
1264	4	50				16	if ($self->{NR_CLASSES} != $candidates->{NR_CLASSES}) {
1265	0					0	$self->die("Candidates have the wrong number of events\n");
1266							}
1267	4					18	$self->scale();
1268	4					15	$kl = $self->{KL};
1269	4	50				20	$n = ($n > $candidates->{NR_CANDIDATES}) ? $candidates->{NR_CANDIDATES} : $n;
1270	4					14	for ($i = 0; $i < $n; $i++) {
1271	8					45	$self->gain($candidates);
1272	8					52	$self->add_candidate($candidates, $candidates->{BEST_CAND});
1273	8					39	$candidates->{ADDED}{$candidates->{BEST_CAND}} = 1;
1274	8					42	$self->log_msg("Adding candidate $candidates->{BEST_CAND}\n");
1275	8					37	$self->scale();
1276	8					75	$self->log_msg("Actual gain: " . ($self->{KL} - $kl) . "\n");
1277	8					37	$kl = $self->{KL};
1278							}
1279	4					22	return(1);
1280							}
1281
1282
1283							#
1284							# Routines for classification, only binary features!
1285							#
1286
1287							# context features are 0 .. $n-1
1288							# $x is a vector, $sampling
1289							sub classify {
1290	128			128	0	618	my($self, $x) = @_;
1291
1292	128					172	my($y,
1293							$sum,
1294							$i,
1295							$weight,
1296							$best_class,
1297							$best_weight);
1298
1299	128					448	$self->log_msg("classify(" . $x->to_Bin('') . ")\n");
1300	128					357	$sum = 0;
1301							# use every possible completion of $x to compute $sum
1302							# allocate a class vector
1303	128					666	$y = $VECTOR_PACKAGE->new($self->{NR_FEATURES} - $x->Size());
1304	128					201	$best_weight = 0;
1305							# for every possible $y
1306	128					406	for ($i = 0; $i < 2 ** $y->Size(); $i++) {
1307							# compute p(x,y) which proportional to p(y\|x) (I hope)
1308	0					0	$weight = $self->weight($x, $y);
1309	0	0				0	if ($weight > $best_weight) {
1310	0					0	$best_class = $y;
1311	0					0	$best_weight = $weight;
1312	0	0				0	if ($debug) {
1313	0					0	print "$i\t", $y->to_Bin(''), "\t$weight\n";
1314							}
1315							}
1316	0					0	$y->increment();
1317							}
1318	128					492	return($best_class, $best_weight);
1319							}
1320
1321
1322							1;
1323
1324							__END__