File Coverage

blib/lib/PDL/HMM.pm

Criterion	Covered	Total	%
statement	30	30	100.0
branch	8	16	50.0
condition			n/a
subroutine	5	5	100.0
pod	2	2	100.0
total	45	53	84.9

line	stmt	bran	sub	pod	time	code
1
2						#
3						# GENERATED WITH PDL::PP! Don't modify!
4						#
5						package PDL::HMM;
6
7						@EXPORT_OK = qw( PDL::PP logzero PDL::PP logadd PDL::PP logdiff PDL::PP logsumover PDL::PP hmmfw hmmalpha PDL::PP hmmfwq hmmalphaq PDL::PP hmmbw hmmbeta PDL::PP hmmbwq hmmbetaq hmmexpect0 PDL::PP hmmexpect PDL::PP hmmexpectq hmmmaximize PDL::PP hmmviterbi PDL::PP hmmviterbiq PDL::PP hmmpath PDL::PP hmmpathq );
8						%EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9
10	5		5		1029782	use PDL::Core;
	5				14
	5				29
11	5		5		1103	use PDL::Exporter;
	5				10
	5				27
12	5		5		141	use DynaLoader;
	5				12
	5				2859
13
14
15
16						$PDL::HMM::VERSION = 0.06005;
17						@ISA = ( 'PDL::Exporter','DynaLoader' );
18						push @PDL::Core::PP, __PACKAGE__;
19						bootstrap PDL::HMM $VERSION;
20
21
22
23
24						=pod
25
26						=head1 NAME
27
28						PDL::HMM - Hidden Markov Model utilities in PDL
29
30						=head1 SYNOPSIS
31
32						use PDL::HMM;
33
34						##-----------------------------------------------------
35						## Dimensions
36
37						$N = $number_of_states;
38						$M = $number_of_symbols;
39						$T = $length_of_input;
40
41						$A = $maximum_ambiguity;
42
43						##-----------------------------------------------------
44						## Parameters
45
46						$af = log(random($N,$N));
47						$bf = log(random($N,$M));
48						$pif = log(random($N));
49						$omegaf = log(random($N));
50
51						@theta = ($a,$b,$pi,$omega) = hmmmaximize($af,$bf,$pif,$omegaf);
52
53						$o = long(rint($M*random($T)));
54
55						maximum_n_ind(dice_axis($a->logsumover+$pi+$b, 1,$o),
56						($oq=zeroes(long,$A,$T))); ##-- for constrained variants
57
58						##-----------------------------------------------------
59						## Log arithmetic
60
61						$log0 = logzero;
62						$logz = logadd(log($x),log($y));
63						$logz = logdiff(log($x),log($y));
64						$logz = logsumover(log($x));
65
66						##-----------------------------------------------------
67						## Sequence Probability
68
69						$alpha = hmmfw ($a,$b,$pi, $o ); ##-- forward (full)
70						$alphaq = hmmfwq($a,$b,$pi, $o, $oq); ##-- forward (constrained)
71
72						$beta = hmmbw ($a,$b,$omega, $o ); ##-- backward (full)
73						$betaq = hmmbwq($a,$b,$omega, $o,$oq); ##-- backward (constrained)
74
75						##-----------------------------------------------------
76						## Parameter Estimation
77
78						@expect = ($ea,$eb,$epi,$eomega) = hmmexpect0(@theta); ##-- initialize
79
80						hmmexpect (@theta, $o, $alpha, $beta, $ea,$eb,$epi); ##-- expect (full)
81						hmmexpectq(@theta, $o,$oq, $alphaq,$betaq, $ea,$eb,$epi); ##-- expect (constrained)
82
83						($a,$b,$pi,$omega) = hmmmaximize($ea,$eb,$epi,$eomega); ##-- maximize
84
85						##-----------------------------------------------------
86						## Sequence Analysis
87
88						($delta,$psi) = hmmviterbi ($a,$b,$pi, $o); ##-- trellis (full)
89						($deltaq,$psiq) = hmmviterbiq($a,$b,$pi, $o,$oq); ##-- trellis (constrained)
90
91						$paths = hmmpath ( $psi, sequence($N)); ##-- backtrace (full)
92						$pathsq = hmmpathq($oq, $psiq, sequence($A)); ##-- backtrace (constrained)
93
94						=cut
95
96
97
98
99
100
101
102						=head1 FUNCTIONS
103
104
105
106						=cut
107
108
109
110
111						=pod
112
113						=head1 Log Arithmetic
114
115						=cut
116
117
118
119
120						=head2 logzero
121
122						=for sig
123
124						Signature: (float+ [o]a())
125
126						=for ref
127
128						Approximates $a() = log(0), avoids nan.
129
130						=for bad
131
132						logzero() handles bad values. The state of the output PDL is always good.
133
134						=cut
135
136
137
138
139
140
141						*logzero = \&PDL::logzero;
142
143
144
145
146
147						=head2 logadd
148
149						=for sig
150
151						Signature: (a(); b(); [o]c())
152
153						=for ref
154
155						Computes $c() = log(exp($a()) + exp($b())), should be more stable.
156
157						=for bad
158
159						logadd does not process bad values.
160						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
161
162
163						=cut
164
165
166
167
168
169
170						*logadd = \&PDL::logadd;
171
172
173
174
175
176						=head2 logdiff
177
178						=for sig
179
180						Signature: (a(); b(); [o]c())
181
182						=for ref
183
184						Computes log symmetric difference c = log(exp(max(a,b)) - exp(min(a,b))), may be more stable.
185
186						=for bad
187
188						logdiff does not process bad values.
189						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
190
191
192						=cut
193
194
195
196
197
198
199						*logdiff = \&PDL::logdiff;
200
201
202
203
204
205						=head2 logsumover
206
207						=for sig
208
209						Signature: (a(n); [o]b())
210
211						=for ref
212
213						Computes $b() = log(sumover(exp($a()))), should be more stable.
214
215						=for bad
216
217						logsumover does not process bad values.
218						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
219
220
221						=cut
222
223
224
225
226
227
228						*logsumover = \&PDL::logsumover;
229
230
231
232						=pod
233
234						=head1 Sequence Probability
235
236						=cut
237
238
239
240
241						=head2 hmmfw
242
243						=for sig
244
245						Signature: (a(N,N); b(N,M); pi(N); o(T); [o]alpha(N,T))
246
247						Compute forward probability (alpha) matrix
248						for input $o given model parameters
249						@theta = ($a, $b, $pi, $omega).
250
251						Output (pseudocode) for all 0<=i
252
253						$alpha(i,t) = log P( $o(0:t), q(t)==i \| @theta )
254
255						Note that the final-state probability vector $omega() is neither
256						passed to this function nor used in the computation, but
257						can be used to compute the final sequence probability for $o as:
258
259						log P( $o \| @theta ) = logsumover( $omega() + $alpha(:,t-1) )
260
261
262
263						=for bad
264
265						hmmfw does not process bad values.
266						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
267
268
269						=cut
270
271
272
273
274
275
276						*hmmfw = \&PDL::hmmfw;
277
278
279
280						*hmmalpha = \&hmmfw;
281
282
283
284						=head2 hmmfwq
285
286						=for sig
287
288						Signature: (a(N,N); b(N,M); pi(N); o(T); oq(Q,T); [o]alphaq(Q,T))
289
290						Compute constrained forward probability (alphaq) matrix
291						for input $o given model parameters
292						@theta = ($a, $b, $pi, $omega),
293						considering only the initial
294						non-negative state indices in $oq(:,t) for each observation $o(t).
295
296						Output (pseudocode) for all 0<=qi
297
298						$alphaq(qi,t) = log P( $o(0:t), q(t)==$oq(qi,t) \| @theta )
299
300						Note that the final-state probability vector $omega() is neither
301						passed to this function nor used in the computation, but
302						can be used to compute the final sequence probability for $o as:
303
304						log P( $o \| @theta ) = logsumover( $alphaq(:,t-1) + $omega($oqTi) )
305
306						where:
307
308						$oqTi = $oq(:,t-1)->where($oq(:,t-1)>=0)
309
310
311
312						=for bad
313
314						hmmfwq does not process bad values.
315						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
316
317
318						=cut
319
320
321
322
323
324
325						*hmmfwq = \&PDL::hmmfwq;
326
327
328
329						*hmmalphaq = \&hmmfwq;
330
331
332
333						=head2 hmmbw
334
335						=for sig
336
337						Signature: (a(N,N); b(N,M); omega(N); o(T); [o]beta(N,T))
338
339						Compute backward probability (beta) matrix
340						for input $o given model parameters
341						@theta = ($a, $b, $pi, $omega).
342
343						Output (pseudocode) for all 0<=i
344
345						$beta(i,t) = log P( $o(t+1:T-1) \| q(t)==i, @theta )
346
347						Note that the initial-state probability vector $pi() is neither
348						passed to this function nor used in the computation, but
349						can be used to compute the final sequence probability for $o as:
350
351						log P( $o \| @theta ) = logsumover( $pi() + $b(:,$o(0)) + $beta(:,0) )
352
353
354
355						=for bad
356
357						hmmbw does not process bad values.
358						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
359
360
361						=cut
362
363
364
365
366
367
368						*hmmbw = \&PDL::hmmbw;
369
370
371
372						*hmmbeta = \&hmmbw;
373
374
375
376						=head2 hmmbwq
377
378						=for sig
379
380						Signature: (a(N,N); b(N,M); omega(N); o(T); oq(Q,T); [o]betaq(Q,T))
381
382						Compute constrained backward probability (betaq) matrix
383						for input $o given model parameters
384						@theta = ($a, $b, $pi, $omega),
385						considering only the initial non-negative state indices in $oq(:,t) for
386						each observation $o(t).
387
388						Output (pseudocode) for all 0<=qi
389
390						$betaq(qi,t) = log P( $o(t+1:T-1) \| q(t)==$oq(qi,t), @theta )
391
392						Note that the initial-state probability vector $pi() is neither
393						passed to this function nor used in the computation, but
394						can be used to compute the final sequence probability for $o as:
395
396						log P( $o \| @theta ) = logsumover( $betaq(:,0) + $pi($oq0i) + $b($oq0i,$o(0)) )
397
398						where:
399
400						$oq0i = $oq(:,0)->where( $oq(:,0) >= 0 );
401
402
403
404						=for bad
405
406						hmmbwq does not process bad values.
407						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
408
409
410						=cut
411
412
413
414
415
416
417						*hmmbwq = \&PDL::hmmbwq;
418
419
420
421						*hmmbetaq = \&hmmbwq;
422
423						=pod
424
425						=head1 Parameter Estimation
426
427						=head2 hmmexpect0
428
429						=for sig
430
431						Signature: (a(N,N); b(N,M); pi(N); omega(N); [o]ea(N,N); [o]eb(N,M); [o]epi(N); [o]eomega(N))
432
433						Initializes expectation matrices $ea(), $eb() and $epi() to logzero().
434						For use with hmmexpect().
435
436						=cut
437
438						sub hmmexpect0 {
439	1		1	1	92909	my ($a,$b,$pi,$omega, $ea,$eb,$epi,$eomega) = @_;
440
441	1	50			24	$ea = zeroes($a->type, $a->dims) if (!defined($ea));
442	1	50			142	$eb = zeroes($b->type, $b->dims) if (!defined($eb));
443	1	50			96	$epi = zeroes($pi->type, $pi->dims) if (!defined($epi));
444	1	50			84	$eomega = zeroes($omega->type, $omega->dims) if (!defined($eomega));
445
446	1				84	$ea .= PDL::logzero();
447	1				20	$eb .= PDL::logzero();
448	1				19	$epi .= PDL::logzero();
449	1				18	$eomega .= PDL::logzero();
450
451	1				17	return ($ea,$eb,$epi,$eomega);
452						}
453
454
455
456
457
458
459						=head2 hmmexpect
460
461						=for sig
462
463						Signature: (a(N,N); b(N,M); pi(N); omega(N); o(T); alpha(N,T); beta(N,T); [o]ea(N,N); [o]eb(N,M); [o]epi(N); [o]eomega(N))
464
465						Compute partial Baum-Welch re-estimation of the model @theta = ($a, $b, $pi, $omega)
466						for the observation sequence $o() with forward- and backward-probability
467						matrices $alpha(), $beta(). Result is recorded as log pseudo-frequencies
468						in the expectation matrices $ea(), $eb(), $epi(), and $eomega(), which are required parameters,
469						and should have been initialized (e.g. by L()) before calling this function.
470
471						Can safely be called sequentially for incremental reestimation.
472
473
474						=for bad
475
476						hmmexpect does not process bad values.
477						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
478
479
480						=cut
481
482
483
484
485
486
487						*hmmexpect = \&PDL::hmmexpect;
488
489
490
491
492
493						=head2 hmmexpectq
494
495						=for sig
496
497						Signature: (a(N,N); b(N,M); pi(N); omega(N); o(T); oq(Q,T); alphaq(Q,T); betaq(Q,T); [o]ea(N,N); [o]eb(N,M); [o]epi(N); [o]eomega(N))
498
499						Compute constrained partial Baum-Welch re-estimation of the model @theta = ($a, $b, $pi, $omega)
500						for the observation sequence $o(),
501						with constrained forward- and backward-probability
502						matrices $alphaq(), $betaq(),
503						considering only the initial non-negative state
504						indices in $oq(:,t) for observation $o(t).
505						Result is recorded as log pseudo-frequencies
506						in the expectation matrices $ea(), $eb(), $epi(), and $eomega(), which are required parameters,
507						and should have been initialized (e.g. by L()) before calling this function.
508
509						Can safely be called sequentially for incremental reestimation.
510
511
512						=for bad
513
514						hmmexpectq does not process bad values.
515						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
516
517
518						=cut
519
520
521
522
523
524
525						*hmmexpectq = \&PDL::hmmexpectq;
526
527
528
529						=pod
530
531						=head2 hmmmaximize
532
533						=for sig
534
535						Signature: (Ea(N,N); Eb(N,M); Epi(N); Eomega(N); [o]ahat(N,N); [o]bhat(N,M); [o]pihat(N); [o]omegahat(N));
536
537						Maximizes expectation values from $Ea(), $Eb(), $Epi(), and $Eomega()
538						to log-probability matrices $ahat(), $bhat(), $pihat(), and $omegahat().
539						Can also be used to compile a maximum-likelihood model
540						from log-frequency matrices.
541
542						=cut
543
544						sub hmmmaximize {
545	1		1	1	636	my ($ea,$eb,$epi,$eomega, $ahat,$bhat,$pihat,$omegahat) = @_;
546
547	1	50			8	$ahat = zeroes($ea->type, $ea->dims) if (!defined($ahat));
548	1	50			101	$bhat = zeroes($eb->type, $eb->dims) if (!defined($bhat));
549	1	50			92	$pihat = zeroes($epi->type, $epi->dims) if (!defined($pihat));
550	1	50			158	$omegahat = zeroes($eomega->type, $eomega->dims) if (!defined($omegahat));
551
552	1				101	my $easumover = $ea->xchg(0,1)->logsumover->inplace->logadd($eomega);
553
554	1				32	$ahat .= $ea - $easumover;
555	1				27	$bhat .= $eb - $eb->xchg(0,1)->logsumover;
556	1				25	$pihat .= $epi - $epi->logsumover;
557	1				18	$omegahat .= $eomega - $easumover;
558
559	1				16	return ($ahat,$bhat,$pihat,$omegahat);
560						}
561
562
563
564						=pod
565
566						=head1 Sequence Analysis
567
568						=cut
569
570
571
572
573						=head2 hmmviterbi
574
575						=for sig
576
577						Signature: (a(N,N); b(N,M); pi(N); o(T); [o]delta(N,T); int [o]psi(N,T))
578
579						Computes Viterbi algorithm trellises $delta() and $psi() for the
580						observation sequence $o() given the model parameters @theta = ($a,$b,$pi,$omega).
581
582						Outputs:
583
584						Probability matrix $delta(): log probability of best path to state $j at time $t:
585
586						$delta(j,t) = max_{q(0:t)} log P( $o(0:t), q(0:t-1), $q(t)==j \| @theta )
587
588						Path backtrace matrix $psi(): best predecessor for state $j at time $t:
589
590						$psi(j,t) = arg_{q(t-1)} max_{q(0:t)} P( $o(0:t), q(0:t-1), $q(t)==j \| @theta )
591
592						Note that if you are using termination probabilities $omega(),
593						then in order to find the most likely final state, you need to
594						compute the contribution of $omega() yourself, which is easy
595						to do:
596
597						$best_final_q = maximum_ind($delta->slice(",-1") + $omega);
598
599
600
601						=for bad
602
603						hmmviterbi does not process bad values.
604						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
605
606
607						=cut
608
609
610
611
612
613
614						*hmmviterbi = \&PDL::hmmviterbi;
615
616
617
618
619
620						=head2 hmmviterbiq
621
622						=for sig
623
624						Signature: (a(N,N); b(N,M); pi(N); o(T); oq(Q,T); [o]deltaq(Q,T); int [o]psiq(Q,T))
625
626						Computes constrained Viterbi algorithm trellises $deltaq() and $psiq() for the
627						observation sequence $o() given the model parameters @theta = ($a,$b,$pi,$omega),
628						considering only the initial non-negative state indices $oq(:,t) for each
629						observarion $o(t).
630
631						Outputs:
632
633						Constrained probability matrix $deltaq(): log probability of best path to state $oq(j,t) at time $t:
634
635						$deltaq(j,t) = max_{j(0:t)} log P( $o(0:t), q(0:t-1)==$oq(:,j(0:t-1)), q(t)==$oq(j,t) \| @theta )
636
637						Constrained path backtrace matrix $psiq(): best predecessor index for state $oq(j,t) at time $t:
638
639						$psiq(j,t) = arg_{j(t-1)} max_{j(0:t)} P( $o(0:t), q(0:t-1)=$oq(:,j(0:t-1)), q(t)==$oq(j,t) \| @theta )
640
641						Note that if you are using termination probabilities $omega(),
642						then in order to find the most likely final state, you need to
643						compute the contribution of $omega() yourself, which is quite easy
644						to do:
645
646						$best_final_j = maximum_ind($deltaq->slice(",-1") + $omega->index($oq->slice(",(-1)")))
647
648
649
650						=for bad
651
652						hmmviterbiq does not process bad values.
653						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
654
655
656						=cut
657
658
659
660
661
662
663						*hmmviterbiq = \&PDL::hmmviterbiq;
664
665
666
667
668
669						=head2 hmmpath
670
671						=for sig
672
673						Signature: (psi(N,T); int qfinal(); int [o]path(T))
674
675						Computes best-path backtrace $path() for the final state $qfinal()
676						from completed Viterbi trellis $psi().
677
678						Outputs:
679
680						Path backtrace $path(): state (in best sequence) at time $t:
681
682						$path(t) = arg_{q(t)} max_{q(0:T-1)} log P( $o(), q(0:T-2), $q(T-1)==$qfinal() \| @theta )
683
684						This even threads over multiple final states, if specified,
685						so you can align paths to their final states just by calling:
686
687						$bestpaths = hmmpath($psi, sequence($N));
688
689						Note that $path(T-1) == $qfinal(): yes, this is redundant,
690						but also tends to be quite convenient.
691
692
693
694						=for bad
695
696						hmmpath does not process bad values.
697						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
698
699
700						=cut
701
702
703
704
705
706
707						*hmmpath = \&PDL::hmmpath;
708
709
710
711
712
713						=head2 hmmpathq
714
715						=for sig
716
717						Signature: (oq(Q,T); psiq(Q,T); int qfinalq(); int [o]path(T))
718
719						Computes constrained best-path backtrace $path() for the final state index $qfinalq()
720						from completed constrained Viterbi trellis $psiq().
721
722						Outputs:
723
724						Path backtrace $path(): state (in best sequence) at time $t:
725
726						$path(t) = arg_{q(t)} max_{q(0:T-1)} log P( $o(), q(0:T-2), $q(T-1)==$oq($qfinalq(),T-1) \| @theta )
727
728						This is really just a convenience method for dealing with constrained
729						lookup -- the same thing can be accomplished using hmmpath() and
730						some PDL index magic.
731
732
733
734						=for bad
735
736						hmmpathq does not process bad values.
737						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
738
739
740						=cut
741
742
743
744
745
746
747						*hmmpathq = \&PDL::hmmpathq;
748
749
750
751
752
753						##---------------------------------------------------------------------
754						=pod
755
756						=head1 COMMON PARAMETERS
757
758						HMMs are specified by parameters $a(N,N), $b(N,M), $pi(N), and $omega(N);
759						input sequences are represented as vectors $o(T) of integer values in the range [0..M-1],
760						where the following notational conventions are used:
761
762						=over 4
763
764
765						=item States:
766
767						The model has $N states, denoted $q,
768						0 <= $q < $N.
769
770
771						=item Alphabet:
772
773						The input- (aka "observation-") alphabet of the model has $M elements,
774						denoted $o(t), 0 <= $o(t) < $M.
775
776
777						=item Time indices:
778
779						Time indices are denoted $t,
780						1 <= $t < $T.
781
782
783						=item Input Sequences:
784
785						Input- (aka "observation-") sequences are represented as vectors of
786						of length $T whose component values are in the range [0..M-1],
787						i.e. alphabet indices.
788
789
790
791						=item Initial Probabilities:
792
793						The vector $pi(N) gives the (log) initial state probability distribution:
794
795						$pi(i) = log P( $q(0)==i )
796
797
798
799						=item Final Probabilities:
800
801						The vector $omega(N) gives the (log) final state probability distribution:
802
803						$omega(i) = log P( $q($T)==i )
804
805						This parameter is a nonstandard extension.
806						You can simulate the behavior of more traditional definitions
807						(such as that given in Rabiner (1989)) by setting:
808
809						$omega = zeroes($N);
810
811						wherever it is required.
812
813
814
815						=item Arc Probabilities:
816
817						The matrix $a(N,N) gives the (log) conditional state-transition probability distribution:
818
819						$a(i,j) = log P( $q(t+1)==j \| $q(t)==i )
820
821
822
823						=item Emission Probabilities:
824
825						The matrix $b(N,M) gives the (log) conditional symbol emission probability:
826
827						$b(j,o) = log P( $o(t)==o \| $q(t)==j )
828
829
830
831						=back
832
833						=cut
834
835						##---------------------------------------------------------------------
836						=pod
837
838						=head1 ACKNOWLEDGEMENTS
839
840						Perl by Larry Wall.
841
842						PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others.
843
844						Implementation based largely on the formulae in:
845						L. E. Rabiner, "A tutorial on Hidden Markov Models and selected
846						applications in speech recognition," Proceedings of the IEEE 77:2,
847						Februrary, 1989, pages 257--286.
848
849						=cut
850
851						##----------------------------------------------------------------------
852						=pod
853
854						=head1 KNOWN BUGS
855
856						Probably many.
857
858						=cut
859
860
861						##---------------------------------------------------------------------
862						=pod
863
864						=head1 AUTHOR
865
866						Bryan Jurish Emoocow@cpan.orgE
867
868						=head2 Copyright Policy
869
870						Copyright (C) 2005, 2006, 2008, 2011 by Bryan Jurish. All rights reserved.
871
872						This package is free software, and entirely without warranty.
873						You may redistribute it and/or modify it under the same terms
874						as Perl itself.
875
876						=head1 SEE ALSO
877
878						perl(1), PDL(3perl).
879
880						=cut
881
882
883
884						;
885
886
887
888						# Exit with OK status
889
890						1;
891
892