File Coverage

blib/lib/OpenMosix/HA.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package OpenMosix::HA;
2 15     15   118437 use strict;
  15         38  
  15         617  
3 15     15   93 use warnings;
  15         28  
  15         512  
4 15     15   35014 use Cluster::Init;
  0            
  0            
5             # use Event qw(one_event loop unloop);
6             use Time::HiRes qw(time);
7             use Data::Dump qw(dump);
8             use Sys::Syslog;
9              
10             BEGIN {
11             use Exporter ();
12             use vars qw (
13             $VERSION
14             @ISA
15             @EXPORT
16             @EXPORT_OK
17             %EXPORT_TAGS
18             $LOGOPEN
19             $PROGRAM
20             );
21             $VERSION = 0.555;
22             @ISA = qw (Exporter);
23             @EXPORT = qw ();
24             @EXPORT_OK = qw ();
25             %EXPORT_TAGS = ();
26             $LOGOPEN = 0;
27             $PROGRAM=$0; $PROGRAM =~ s/.*\///;
28             }
29              
30             # COMMON
31              
32             sub debug
33             {
34             my $debug = $ENV{DEBUG} || 0;
35             return unless $debug;
36             my ($package, $filename, $line, $subroutine, $hasargs, $wantarray, $evaltext, $is_require, $hints, $bitmask) = caller(1);
37             my $subline = (caller(0))[2];
38             my $msg = join(' ',@_);
39             $msg.="\n" unless $msg =~ /\n$/;
40             warn time()." $$ $subroutine,$subline: $msg" if $debug;
41             if ($debug > 1)
42             {
43             warn _stacktrace();
44             }
45             if ($debug > 2)
46             {
47             Event::Stats::collect(1);
48             warn sprintf("%d\n%-35s %3s %10s %4s %4s %4s %4s %7s\n", time,
49             "DESC", "PRI", "CBTIME", "PEND", "CARS", "RAN", "DIED", "ELAPSED");
50             for my $w (reverse all_watchers())
51             {
52             my @pending = $w->pending();
53             my $pending = @pending;
54             my $cars=sprintf("%01d%01d%01d%01d",
55             $w->is_cancelled,$w->is_active,$w->is_running,$w->is_suspended);
56             my ($ran,$died,$elapsed) = $w->stats(60);
57             warn sprintf("%-35s %3d %10d %4d %4s %4d %4d %7.3f\n",
58             $w->desc,
59             $w->prio,
60             $w->cbtime,
61             $pending,
62             $cars,
63             $ran,
64             $died,
65             $elapsed);
66             }
67             }
68             }
69              
70             sub logger
71             {
72             my $level=shift;
73             my $msg = join(' ',@_);
74             openlog($PROGRAM,,"daemon") unless $LOGOPEN;
75             $LOGOPEN=1;
76             debug $msg;
77             syslog($level,$msg);
78             }
79              
80             sub logcrit
81             {
82             my $msg = join(' ',@_);
83             logger "crit", $msg;
84             }
85              
86             sub logalert
87             {
88             my $msg = join(' ',@_);
89             logger "alert", $msg;
90             }
91              
92             sub loginfo
93             {
94             my $msg = join(' ',@_);
95             logger "info", $msg;
96             }
97              
98             sub logdebug
99             {
100             my $msg = join(' ',@_);
101             logger "debug", $msg;
102             }
103              
104             sub _stacktrace
105             {
106             my $out="";
107             for (my $i=1;;$i++)
108             {
109             my @frame = caller($i);
110             last unless @frame;
111             $out .= "$frame[3] $frame[1] line $frame[2]\n";
112             }
113             return $out;
114             }
115              
116             =head1 NAME
117              
118             OpenMosix::HA -- High Availability (HA) layer for an openMosix cluster
119              
120             =head1 SYNOPSIS
121              
122             use OpenMosix::HA;
123              
124             my $ha = new OpenMosix::HA;
125              
126             # start the monitor daemon
127             $ha->monitor;
128              
129             =head1 DESCRIPTION
130              
131             This module provides the basic functionality needed to manage resource
132             startup and restart across a cluster of openMosix machines.
133              
134             This gives you a high-availability cluster with low hardware overhead.
135             In contrast to traditional HA clusters, we use the openMosix cluster
136             membership facility, rather than hardware serial cables or extra
137             ethernet ports, to provide heartbeat and to detect network partitions.
138              
139             All you need to do is build a relatively conventional openMosix
140             cluster, install this module on each node, and configure it to start
141             and manage your HA processes. You do not need the relatively
142             high-end server machines which traditional HA requires. There is no
143             need for chained SCSI buses (though you can use them) -- you can
144             instead share disks among many nodes via any number of other current
145             technologies, including SAN, NAS, GFS, or Firewire (IEEE-1394).
146              
147             Commercial support is available for B<OpenMosix::HA> as well as for
148             openMosix clusters and related products and services: see L<"SUPPORT">.
149              
150             =head1 QUICK START
151              
152             See Infrastructures.Org (listed under L<"SEE ALSO">) for cluster management
153             techniques, including clean ways to install, replicate, and update
154             nodes.
155              
156             To use B<OpenMosix::HA> to provide high availability for
157             processes hosted on an B<openMosix> cluster:
158              
159             =over 4
160              
161             =item *
162              
163             Install B and B on each node.
164              
165             =item *
166              
167             Create L<"/var/mosix-ha/cltab"> on any node.
168              
169             =item *
170              
171             Create L<"/var/mosix-ha/hactl"> on any node.
172              
173             =item *
174              
175             Run 'C<mosha>' on each node. Putting this in F</etc/inittab> as a
176             "respawn" process would be a good idea (see the example after this list).
177              
178             =item *
179              
180             Check current status in L<"/var/mosix-ha/hastat"> on any node.
181              
182             =back
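
For example, an F</etc/inittab> entry along these lines would work (a
sketch only -- the 'ha' id field is arbitrary, and the installed path of
C<mosha> depends on your Perl configuration):

    # respawn the OpenMosix::HA monitor on every node
    ha:2345:respawn:/usr/bin/mosha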
183              
184             =head1 INSTALLATION
185              
186             Use Perl's normal sequence:
187              
188             perl Makefile.PL
189             make
190             make test
191             make install
192              
193             You'll need to install this module on each node in the cluster.
194              
195             This module includes a script, L<"mosha">, which will be installed when
196             you run 'make install'. See the output of C<make install> to
197             find out which directory the script is installed in.
198              
199             =head1 CONCEPTS
200              
201             See L<Cluster::Init> for more discussion of basic concepts
202             used here, such as I<resource group>, I<runlevel>,
203             and I<runmode>.
204              
205             Normally, a high-throughput cluster computing technology is orthogonal
206             to the intent of high availability, particularly if the cluster
207             supports process migration, as in openMosix. When ordinary openMosix
208             nodes die, any processes migrated to or spawned from those nodes will
209             also die. The higher the node count, the more frequently these
210             failures are likely to occur.
211              
212             If the goal is high availability, then node failure in an openMosix
213             cluster presents two problems: (1) All processes which had migrated to
214             a failed node will die; their stubs on the home node will receive a
215             SIGCHLD. (2) All processes which had the failed node as their home
216             node will die; their stubs will no longer exist, and the migrated
217             processes will receive SIGKILL.
218              
219             Dealing with (1) by itself might be easy; just use the native UNIX
220             init's "respawn" to start the process on the home node. Dealing with
221             (2) is harder; you need to detect death of the home node, then figure
222             out which processes were spawned from there, and restart them on a
223             secondary node, again with a "respawn". If you also lose the
224             secondary node, then you need to restart on a tertiary node, and so
225             on. And managing /etc/inittab on all of the nodes would be an issue;
226             it would likely need to be both dynamically generated and different on
227             each node.
228              
229             What's really needed is something like "init", but one that acts
230             cluster-wide, using one replicated configuration file, providing both
231             respawn for individual dead processes and migration of entire resource
232             groups from dead home nodes. That's what OpenMosix::HA does.
233              
234             If processes are started via OpenMosix::HA, any processes and resource
235             groups which fail due to node failure will automatically restart on
236             other nodes. OpenMosix::HA detects node failure, selects a new node
237             out of those currently available, and deconflicts the selection so
238             that two nodes don't restart the same process or resource group.
239              
240             There is no "head" or "supervisor" node in an OpenMosix::HA cluster --
241             there is no single point of failure. Each node makes its own
242             observations and decisions about the start or restart of processes and
243             resource groups.
244              
245             You can build OpenMosix::HA clusters of dissimilar machines -- any
246             given node only needs to provide the hardware and/or software to
247             support a subset of all resource groups. OpenMosix::HA is able to
248             test a node for eligibility before attempting to start a resource
249             group there -- resource groups will "seek" the nodes which can support
250             them.
251              
252             IO fencing (the art of making sure that a partially-dead node doesn't
253             continue to access shared disk or other resources) can be handled as
254             it is in conventional HA clusters, by a combination of exclusive
255             device logins when using Firewire, or distributed locks when using GFS
256             or SAN.
257              
258             In the Linux HA community, simpler, more brute-force methods for IO
259             fencing are also used, involving network-controlled powerstrips or X10
260             controllers. These methods are usually termed STOMITH or STONITH --
261             "shoot the other machine|node in the head". OpenMosix::HA provides a
262             "shoot the other machine/node in the head". OpenMosix::HA provides a
263             actions.
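
A hedged sketch of such a hook, passed via the C<stomith> parameter to
L<"new()"> (the external power-control command shown here is
hypothetical -- substitute whatever drives your powerstrip):

    use OpenMosix::HA;

    my $ha = OpenMosix::HA->new(
        stomith => sub {
            my $node = shift;   # node number passed in by OpenMosix::HA
            warn "STOMITH: power-cycling node $node\n";
            # hypothetical external command -- replace with your own controller
            system("/usr/local/sbin/cycle-outlet", $node) == 0
                or warn "could not power-cycle node $node\n";
        },
    );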
264              
265             =head2 RESOURCE GROUP LIFECYCLE
266              
267             Each OpenMosix::HA node acts independently, while watching the
268             activity of others. If any node sees that a resource group is not
269             running anywhere in the cluster, it attempts to start the resource
270             group locally by following the procedure described here. The
271             following discussion is from the perspective of that local node.
272              
273             The node watches all other nodes in the cluster by consolidating
274             each node's F</var/mosix-ha/clstat> (read via F</mfs>) into the local
275             L<"/var/mosix-ha/hastat">. It then ensures that each resource group
276             configured in L<"/var/mosix-ha/cltab"> is running somewhere in the
277             cluster, at the runlevel specified in L<"/var/mosix-ha/hactl">.
278              
279             If a resource group is found to be not running anywhere in the
280             cluster, then the local OpenMosix::HA will attempt to transition the
281             resource group through each of the following runlevels on the local
282             node, in this order:
283              
284             plan
285             test
286             start (or whatever is named in hactl)
287             stop (later, at shutdown)
288              
289             The following is a detailed discussion of each of these runlevels.
290              
291             =head3 plan
292              
293             Under normal circumstances, you should not create a 'plan' runlevel
294             entry in L<"/var/mosix-ha/cltab"> for any resource group. This is
295             because 'plan' is used as a collision detection phase, a NOOP;
296             anything you run at the 'plan' runlevel will be run on multiple nodes
297             simultaneously.
298              
299             When starting a resource group on the local node, OpenMosix::HA will
300             first attempt to run the resource group at the 'plan' runlevel. If
301             there is a 'plan' runlevel in L<"/var/mosix-ha/cltab"> for this
302             resource group, then OpenMosix::HA will execute it; otherwise, it will
303             just set the runlevel to 'plan' in its own copy of
304             L<"/var/mosix-ha/clstat">.
305              
306             After several seconds in 'plan' mode, OpenMosix::HA will check other
307             nodes, to see if they have also started 'plan' or other activity for
308             the same resource group.
309              
310             If any other node shows 'plan' or other activity for the same resource
311             group during that time, then OpenMosix::HA will conclude that there
312             has been a collision, L<"stop"> the resource group on the local node,
313             and pause for several seconds.
314              
315             The "several seconds" described here is dependent on the number of
316             nodes in the cluster and a collision-avoidance random backoff
317             calculation.
318              
319             =head3 test
320              
321             You should specify at least one 'test' runlevel, with runmode also set
322             to 'test', for each resource group in L<"/var/mosix-ha/cltab">. This
323             entry should test for prerequisites for the resource group, and its
324             command should exit with a non-zero return code if the test fails.
325              
326             For example, if F</usr/bin/foo> requires the 'modbar' kernel module,
327             then the following entries in L<"/var/mosix-ha/cltab"> will do the
328             job:
329              
330             foogrp:foo1:test:test:/sbin/modprobe modbar
331             foogrp:foo2:start:respawn:/usr/bin/foo
332              
333             ...in this example, C<modprobe> will exit with an error if 'modbar'
334             can't be loaded on this node.
335              
336             If a 'test' entry fails, then OpenMosix::HA will conclude that the
337             node is unusable for this resource group. It will discontinue
338             startup, and will clean up by executing the L<"stop"> entry for the
339             resource group.
340              
341             After a 'test' has failed and the resource group stopped, another node
342             will typically detect the stopped resource group within several
343             seconds, and execute L<"plan"> and L<"test"> again there. This
344             algorithm continues, repeating as needed, until a node is found that
345             is eligible to run the resource group. (For large clusters with small
346             groups of eligible nodes, this could take a while. I'm considering
347             adding a "preferred node" list in hactl to shorten the search time.)
348              
349             =head3 start
350              
351             After the 'test' runlevel passes, and if there are still no collisions
352             detected, then OpenMosix::HA will start the resource group, using the
353             runlevel specified in L<"/var/mosix-ha/hactl">.
354              
355             This runlevel is normally called 'start', but it can be any other
356             single-word string; you could use a numerical runlevel, a
357             product or project name, or whatever fits your needs. The only other
358             requirement is that the string you use must be the same as whatever
359             you used in L<"/var/mosix-ha/hactl">.
360              
361             =head3 stop
362              
363             When OpenMosix::HA shuts down (for example, when L<"monitor()"> catches
364             SIGTERM), it will transition all resource groups to the 'stop'
365             runlevel. If there is a 'stop' entry for the resource group in
366             L<"/var/mosix-ha/cltab">, then it will be executed.
367              
368             You do not need to specify a 'stop' entry in
369             L<"/var/mosix-ha/cltab">; you B<may> specify one if you'd like to do
370             any final cleanup, unmount filesystems, etc.
371              
372             =head1 METHODS
373              
374             =head2 new()
375              
376             Loads Cluster::Init, but does not start any resource groups.
377              
378             Accepts an optional parameter hash which you can use to override
379             module defaults. Defaults are set for a typical openMosix cluster
380             installation. Parameters you can override include:
381              
382             =over 4
383              
384             =item mfsbase
385              
386             MFS mount point. Defaults to C</mfs>.
387              
388             =item mynode
389              
390             Mosix node number of local machine. You should only override this for
391             testing purposes.
392              
393             =item varpath
394              
395             The local path under C</> where the module should look for the
396             C<hactl> and C<cltab> files, and where it should put clstat
397             and clinit.s; this is also the subpath where it should look for
398             these things on other machines, under C</mfs>. Defaults to
399             C<var/mosix-ha>.
400              
401             =item timeout
402              
403             The maximum age (in seconds) of any node's C<clstat> file, after which
404             the module considers that node to be stale, and calls for a STOMITH.
405             Defaults to 60 seconds.
406              
407             =item mwhois
408              
409             The command to execute to get the local node number. Defaults to
410             "mosctl whois". This command must print some sort of string on
411             STDOUT; a C<\d+> pattern will be used to extract the node number
412             from the string.
413              
414             =item stomith
415              
416             The *CODE callback to execute when a machine needs to be STOMITHed.
417             The node number will be passed as the first argument. Defaults to an
418             internal function which just prints "STOMITH node N" on STDERR.
419              
420             =back
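
For example, a minimal sketch overriding a couple of these defaults
(the values here are illustrative only):

    use OpenMosix::HA;

    # anything not listed keeps its default
    my $ha = OpenMosix::HA->new(
        mfsbase => "/mfs",   # the default MFS mount point, shown for clarity
        timeout => 120,      # treat a node's clstat as stale after two minutes
    );

    # start the monitor daemon; this call does not return
    $ha->monitor;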
421              
422             =cut
423              
424             sub new
425             {
426             my $class=shift;
427             my $self={@_};
428             bless $self, $class;
429             $self->{mfsbase} ||="/mfs";
430             $self->{hpcbase} ||="/proc/hpc";
431             $self->{mwhois} ||= "mosctl whois";
432             $self->{mynode} ||= $self->mosnode();
433             $self->{varpath} ||= "var/mosix-ha";
434             $self->{clinit_s} ||= "/".$self->{varpath}."/clinit.s";
435             $self->{timeout} ||= 60;
436             $self->{cycletime} ||= 1;
437             $self->{balance} ||= 1.5;
438             $self->{stomith} ||= sub{$self->stomith(@_)};
439             $self->{mybase} = $self->nodebase($self->{mynode});
440             $self->{hactl} = $self->{mybase}."/hactl";
441             $self->{cltab} = $self->{mybase}."/cltab";
442             $self->{clstat} = $self->{mybase}."/clstat";
443             $self->{hastat} = $self->{mybase}."/hastat";
444             unless (-d $self->{mybase})
445             {
446             mkdir $self->{mybase} or die $!;
447             }
448             return $self;
449             }
450              
451             sub clinit
452             {
453             my $self=shift;
454             my %parms = (
455             'clstat' => $self->{clstat},
456             'cltab' => $self->{cltab},
457             'socket' => $self->{clinit_s}
458             );
459             # start Cluster::Init daemon
460             unless (fork())
461             {
462             $0.=" [Cluster::Init->daemon]";
463             $self->cleanup;
464             $self->getcltab($self->nodes);
465             require Event;
466             import Event;
467             # noop; only -9 should be able to kill; we do orderly shutdown
468             # in monitor
469             Event->signal(signal=>"HUP" ,cb=>sub{1});
470             Event->signal(signal=>"INT" ,cb=>sub{1});
471             Event->signal(signal=>"QUIT",cb=>sub{1});
472             Event->signal(signal=>"TERM",cb=>sub{1});
473             my $clinit = Cluster::Init->daemon(%parms);
474             debug "daemon exiting";
475             exit;
476             }
477             sleep(1);
478             # initialize client
479             $self->{clinit} = Cluster::Init->client(%parms);
480             return $self->{clinit};
481             }
482              
483             ### MONITOR
484              
485             sub cleanexit
486             {
487             my $self=shift;
488             loginfo "calling haltwait";
489             $self->haltwait;
490             loginfo "calling shutdown";
491             $self->{clinit}->shutdown();
492             loginfo "calling cleanup";
493             $self->cleanup;
494             loginfo "exiting";
495             exit 0;
496             }
497              
498             sub cleanup
499             {
500             my $self=shift;
501             # unlink $self->{hastat};
502             unlink $self->{clstat};
503             }
504              
505             sub backoff
506             {
507             my $self=shift;
508             $self->{cycletime}+=rand(10);
509             }
510              
511             sub cycle_faster
512             {
513             my $self=shift;
514             $self->{cycletime}/=rand(.5)+.5;
515             # $self->{cycletime}=15 if $self->{cycletime} < 15;
516             }
517              
518             sub cycle_slower
519             {
520             my $self=shift;
521             $self->{cycletime}*=rand()+1;
522             }
523              
524             sub cycletime
525             {
526             my $self=shift;
527             my $time=shift;
528             if ($time)
529             {
530             my $ct = $self->{cycletime};
531             $ct = ($ct+$time)/2;
532             $self->{cycletime}=$ct;
533             }
534             return $self->{cycletime};
535             }
536              
537             sub compile_metrics
538             {
539             my $self=shift;
540             my $hastat=shift;
541             my $hactl=shift;
542             my $group=shift;
543             my $mynode=$self->{mynode};
544             my %metric;
545             # is group active somewhere?
546             if ($hastat->{$group})
547             {
548             $metric{isactive}=1;
549             # is group active on my node?
550             $metric{islocal}=1 if $hastat->{$group}{$mynode};
551             for my $node (keys %{$hastat->{$group}})
552             {
553             # is group active in multiple places?
554             $metric{instances}++;
555             }
556             }
557             if ($metric{islocal})
558             {
559             # run levels which must be defined in cltab: plan test stop
560             # ("start" or equivalent is defined in hactl)
561             my $level=$hastat->{$group}{$mynode}{level};
562             my $state=$hastat->{$group}{$mynode}{state};
563             debug "$group $level $state";
564             # is our local instance of group contested?
565             $metric{inconflict}=1 if $metric{instances} > 1;
566             # has group been planned here?
567             $metric{planned}=1 if $level eq "plan" && $state eq "DONE";
568             # did group pass or fail a test here?
569             $metric{passed}=1 if $level eq "test" && $state eq "PASSED";
570             $metric{failed}=1 if $level eq "test" && $state eq "FAILED";
571             # allow group to have no defined "test" runlevel -- default to pass
572             $metric{passed}=1 if $level eq "test" && $state eq "DONE";
573             # is group in transition?
574             $metric{intransition}=1 unless $state =~ /^(DONE|PASSED|FAILED)$/;
575             # is group in hactl?
576             if ($hactl->{$group})
577             {
578             # does group runlevel match what's in hactl?
579             $metric{chlevel}=1 if $level ne $hactl->{$group};
580             # do we want to plan to test and start group on this node?
581             unless ($hactl->{$group} eq "stop" || $metric{instances})
582             {
583             $metric{needplan}=1;
584             }
585             }
586             else
587             {
588             $metric{deleted}=1;
589             }
590             }
591             if ($hactl->{$group})
592             {
593             # do we want to plan to test and start group on this node?
594             unless ($hactl->{$group} eq "stop" || $metric{instances})
595             {
596             $metric{needplan}=1;
597             }
598             }
599             return %metric;
600             }
601              
602             # get latest hactl file
603             sub gethactl
604             {
605             my $self=shift;
606             my @node=@_;
607             $self->getlatest("hactl",@node);
608             # return the contents
609             my $hactl;
610             open(CONTROL,"<".$self->{hactl}) || die $!;
611             while(<CONTROL>)
612             {
613             next if /^\s*#/;
614             next if /^\s*$/;
615             chomp;
616             my ($group,$level)=split;
617             $hactl->{$group}=$level;
618             }
619             return $hactl;
620             }
621              
622             # get latest cltab file
623             sub getcltab
624             {
625             my $self=shift;
626             my @node=@_;
627             if ($self->getlatest("cltab",@node))
628             {
629             # reread cltab if it changed
630             # if $self->{clinit}
631             # XXX $self->tell("::ALL::","::REREAD::");
632             }
633             # return the contents
634             my $cltab;
635             open(CLTAB,"<".$self->{cltab}) || die $!;
636             while(<CLTAB>)
637             {
638             next if /^\s*#/;
639             next if /^\s*$/;
640             chomp;
641             my ($group,$tag,$level,$mode)=split(':');
642             next unless $group;
643             $cltab->{$group}=1;
644             }
645             return $cltab;
646             }
647              
648             # get the latest version of a file
649             sub getlatest
650             {
651             my $self=shift;
652             my $file=shift;
653             my @node=@_;
654             my $newfile;
655             # first we have to find it...
656             my $myfile;
657             for my $node (@node)
658             {
659             my $base=$self->nodebase($node);
660             my $ckfile="$base/$file";
661             $myfile=$ckfile if $node == $self->{mynode};
662             next unless -f $ckfile;
663             $newfile||=$ckfile;
664             if (-M $newfile > -M $ckfile)
665             {
666             debug "$ckfile is newer than $newfile";
667             $newfile=$ckfile;
668             }
669             }
670             # ...then get it...
671             if ($newfile && $myfile && $newfile ne $myfile)
672             {
673             if (-f $myfile && -M $myfile <= -M $newfile)
674             {
675             return 0;
676             }
677             sh("cp -p $newfile $myfile") || die $!;
678             return 1;
679             }
680             return 0;
681             }
682              
683             # halt all local resource groups
684             sub haltall
685             {
686             my $self=shift;
687             my ($hastat)=$self->hastat($self->{mynode});
688             debug dump $hastat;
689             for my $group (keys %$hastat)
690             {
691             debug "halting $group";
692             $self->tell($group,"stop");
693             }
694             }
695              
696             # halt all local resource groups and wait for them to complete
697             sub haltwait
698             {
699             my $self=shift;
700             my $hastat;
701             loginfo "shutting down resource groups";
702             my @group;
703             do
704             {
705             $self->haltall;
706             sleep(1);
707             ($hastat)=$self->hastat($self->{mynode});
708             @group=keys %$hastat;
709             loginfo "still active: @group";
710             for my $group (@group)
711             {
712             my $level=$hastat->{$group}{$self->{mynode}}{level};
713             my $state=$hastat->{$group}{$self->{mynode}}{state};
714             loginfo "$group: level=$level, state=$state";
715             }
716             } while (@group);
717             }
718              
719             # build consolidated clstat and STOMITH stale nodes
720             sub hastat
721             {
722             my $self=shift;
723             my @node=@_;
724             my $hastat;
725             my @stomlist;
726             for my $node (@node)
727             {
728             my $base=$self->nodebase($node);
729             my $file="$base/clstat";
730             next unless -f $file;
731             # STOMITH stale nodes
732             my $mtime = (stat($file))[9];
733             debug "$node age $mtime\n";
734             my $mintime = time - $self->{timeout};
735             debug "$file mtime $mtime mintime $mintime\n";
736             if ($mtime < $mintime)
737             {
738             debug "$node is old\n";
739             unless($node == $self->{mynode})
740             {
741             push @stomlist, $node;
742             }
743             }
744             open(CLSTAT,"<$file") || next;
745             while(<CLSTAT>)
746             {
747             chomp;
748             my ($class,$group,$level,$state) = split;
749             next unless $class eq "Cluster::Init::Group";
750             # ignore inactive groups
751             next if $state eq "CONFIGURED";
752             next if $level eq "stop" && $state eq "DONE";
753             $hastat->{$group}{$node}{level}=$level;
754             $hastat->{$group}{$node}{state}=$state;
755             }
756             }
757             # note that this file is not always populated with the entire node
758             # set -- depends on how hastat() was called!
759             open(HASTAT,">".$self->{hastat}."tmp") || die $!;
760             print HASTAT (dump $hastat);
761             close HASTAT;
762             rename($self->{hastat}."tmp", $self->{hastat}) || die $!;
763             return ($hastat,\@stomlist);
764             }
765              
766             =head2 monitor()
767              
768             Starts the monitor daemon. Does not return.
769              
770             The monitor does the real work for this module; it ensures the
771             resource groups in L<"/var/mosix-ha/cltab"> are each running
772             somewhere in the cluster, at the runlevels specified in
773             L<"/var/mosix-ha/hactl">. Any resource groups found not running are
774             candidates for a restart on the local node.
775              
776             Before restarting a resource group, the local monitor announces its
777             intentions in the local C<clstat> file, and observes C<clstat> on
778             other nodes. If the monitor on any other node also intends to start
779             the same resource group, then the local monitor will detect this and
780             cancel its own restart. The checks and restarts are staggered by
781             random times on various nodes to prevent oscillation.
782              
783             See L<"RESOURCE GROUP LIFECYCLE">.
784              
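Note that C<monitor()> also accepts an optional run time in seconds
(it defaults to a very large number), which can be handy when
experimenting -- a sketch:

    my $ha = OpenMosix::HA->new;

    # run the monitor loop for roughly an hour, then return
    $ha->monitor(3600);
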
785             =cut
786              
787             sub monitor
788             {
789             my $self=shift;
790             my $runtime=shift || 999999999;
791             my $start=time();
792             my $stop=$start + $runtime;
793             # Event->signal(signal=>"HUP" ,cb=>[$self,"cleanexit"]);
794             # Event->signal(signal=>"INT" ,cb=>[$self,"cleanexit"]);
795             # Event->signal(signal=>"QUIT",cb=>[$self,"cleanexit"]);
796             # Event->signal(signal=>"TERM",cb=>[$self,"cleanexit"]);
797             $SIG{HUP}=sub{$self->cleanexit};
798             $SIG{INT}=sub{$self->cleanexit};
799             $SIG{QUIT}=sub{$self->cleanexit};
800             $SIG{TERM}=sub{$self->cleanexit};
801             while(time < $stop)
802             {
803             my @node = $self->nodes();
804             unless($self->quorum(@node))
805             {
806             my $node = $self->{mynode};
807             logcrit "node $node: quorum lost: can only see nodes @node\n";
808             $self->haltwait;
809             sleep(30);
810             next;
811             }
812             # build consolidated clstat
813             my ($hastat,$stomlist)=$self->hastat(@node);
814             # STOMITH stale nodes
815             $self->stomscan($stomlist) if time > $start + 120;
816             # get and read latest hactl and cltab
817             my $hactl=$self->gethactl(@node);
818             my $cltab=$self->getcltab(@node);
819             $self->scangroups($hastat,$hactl,@node);
820             logdebug "node $self->{mynode} cycletime $self->{cycletime}\n";
821             sleep($self->cycletime) if $self->cycletime + time < $stop;
822             }
823             return 1;
824             }
825              
826             sub mosnode
827             {
828             my $self=shift;
829             my $whois=`$self->{mwhois}`;
830             # "This is MOSIX #32"
831             $whois =~ /(\d+)/;
832             my $node=$1;
833             die "can't figure out my openMosix node number" unless $node;
834             return $node;
835             }
836              
837             sub nodebase
838             {
839             my $self=shift;
840             my $node=shift;
841             my $base = join
842             (
843             "/",
844             $self->{mfsbase},
845             $node,
846             $self->{varpath}
847             );
848             return $base;
849             }
850              
851             # build list of nodes by looking in /proc/hpc/nodes
852             sub nodes
853             {
854             my $self=shift;
855             opendir(NODES,$self->{hpcbase}."/nodes") || die $!;
856             my @node = grep /^\d/, readdir(NODES);
857             closedir NODES;
858             my @upnode;
859             # check availability
860             for my $node (@node)
861             {
862             open(STATUS,$self->{hpcbase}."/nodes/$node/status") || next;
863             chomp(my $status=<STATUS>);
864             # XXX status bits mean what?
865             next unless $status & 2;
866             push @upnode, $node;
867             }
868             return @upnode;
869             }
870              
871             # detect if we've lost quorum
872             sub quorum
873             {
874             my ($self,@node)=@_;
875             $self->{quorum}||=0;
876             logdebug "quorum count: ".$self->{quorum}."\n";
877             if (@node < $self->{quorum} * .6)
878             {
879             return 0;
880             }
881             $self->{quorum}=@node;
882             return 1;
883             }
884              
885             sub runXXX
886             {
887             my $seconds=shift;
888             Event->timer(at=>time() + $seconds,cb=>sub{unloop()});
889             loop();
890             }
891              
892             # scan through all known groups, stopping or starting them according
893             # to directives in hactl and status of all nodes; the goal here is to
894             # make each group be at the runlevel shown in hactl
895             sub scangroups
896             {
897             my $self=shift;
898             my $hastat=shift;
899             my $hactl=shift;
900             my @node=@_;
901             my $clinit=$self->{clinit};
902             # for each group in hastat or hactl
903             for my $group (uniq(keys %$hastat, keys %$hactl))
904             {
905             my %metric = $self->compile_metrics($hastat,$hactl,$group);
906             debug "$group ", dump %metric;
907             # stop groups which have been deleted from hactl
908             if ($metric{deleted})
909             {
910             $self->tell($group,"stop");
911             $self->cycletime(5);
912             next;
913             }
914             # stop contested groups
915             if ($metric{inconflict})
916             {
917             $self->tell($group,"stop");
918             $self->backoff();
919             next;
920             }
921             # start groups which previously passed tests
922             if ($metric{passed})
923             {
924             $self->tell($group,$hactl->{$group});
925             $self->cycletime(5);
926             next;
927             }
928             # stop failed groups
929             if ($metric{failed})
930             {
931             $self->tell($group,"stop");
932             $self->cycletime(5);
933             next;
934             }
935             # start tests for all uncontested groups we planned
936             if ($metric{planned})
937             {
938             $self->tell($group,"test");
939             $self->cycletime(5);
940             next;
941             }
942             # notify world of groups we plan to test
943             if ($metric{needplan})
944             {
945             $self->cycletime(10);
946             # balance startup across all nodes
947             next if rand(scalar @node) > $self->{balance};
948             # start planning
949             $self->tell($group,"plan");
950             next;
951             }
952             # in transition -- don't do anything yet
953             if ($metric{intransition})
954             {
955             $self->cycletime(5);
956             next;
957             }
958             # whups -- level changed in hactl
959             if ($metric{chlevel})
960             {
961             $self->tell($group,$hactl->{$group});
962             $self->cycletime(5);
963             next;
964             }
965             # normal cycletime is such that one node in the cluster should
966             # wake up each second
967             # XXX this won't work with larger clusters -- too long to detect
968             # shutdown in hactl -- maybe need to go with event loop here?
969             $self->cycletime(scalar @node);
970             }
971             }
972              
973             sub sh
974             {
975             my @cmd=@_;
976             my $cmd=join(' ',@cmd);
977             debug "> $cmd\n";
978             my $res=`$cmd`;
979             my $rc= $? >> 8;
980             $!=$rc;
981             return ($rc,$res) if wantarray;
982             return undef if $rc;
983             return 1;
984             }
985              
986             sub stomith
987             {
988             my ($self,$node)=@_;
989             logalert "STOMITH node $node\n";
990             }
991              
992             sub stomscan
993             {
994             my $self=shift;
995             my $stomlist=shift;
996             for my $node (@$stomlist)
997             {
998             # warn "STOMITH $node\n";
999             &{$self->{stomith}}($node);
1000             }
1001             }
1002              
1003             sub tell
1004             {
1005             my $self=shift;
1006             my $group=shift;
1007             my $level=shift;
1008             debug "tell $group $level";
1009             $self->{clinit}->tell($group,$level);
1010             }
1011              
1012             sub uniq
1013             {
1014             my @in=@_;
1015             my @out;
1016             for my $in (@in)
1017             {
1018             push @out, $in unless grep /^$in$/, @out;
1019             }
1020             return @out;
1021             }
1022              
1023             =head1 UTILITIES
1024              
1025             =head2 mosha
1026              
1027             OpenMosix::HA includes B<mosha>, a script which is intended to be
1028             started as a "respawn" entry in each node's F</etc/inittab>. It
1029             requires no arguments.
1030              
1031             This is a simple script; all it does is create an OpenMosix::HA object
1032             and call the L<"monitor()"> method on that object.
1033              
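In other words, B<mosha> amounts to roughly this (a sketch, not the
literal script):

    #!/usr/bin/perl
    use strict;
    use warnings;
    use OpenMosix::HA;

    # create the HA object and run the monitor loop; monitor() does not return
    OpenMosix::HA->new->monitor;
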
1034             =head1 FILES
1035              
1036             =head2 /var/mosix-ha/cltab
1037              
1038             The main configuration file; describes the processes and resource
1039             groups you want to run in the cluster.
1040              
1041             See L<Cluster::Init> for the format of this file -- it's
1042             the same file; OpenMosix::HA tells Cluster::Init to place cltab under
1043             F</var/mosix-ha> instead of F</etc>. For a configured example, see
1044             the sample cltab shipped in the OpenMosix::HA
1045             distribution.
1046              
1047             See L<"RESOURCE GROUP LIFECYCLE"> for runmodes and entries you should
1048             specify in this file; specifically, you should set up at least one
1049             'test' entry and one 'start' entry for each resource group.
1050              
1051             You do B<not> need to replicate this file to any other node --
1052             B<OpenMosix::HA> will do it for you.
1053              
1054             =head2 /var/mosix-ha/hactl
1055              
1056             The HA control file; describes the resource groups you want to run,
1057             and the runlevels you want them to execute at. See the L<"start">
1058             paragraph under L<"RESOURCE GROUP LIFECYCLE">. A format
1059             example is shown below.
1060              
1061             You do B<not> need to replicate this file to any other node --
1062             B<OpenMosix::HA> will do it for you.
1063              
1064             Format is one resource group per line, whitespace delimited, '#' means
1065             comment:
1066              
1067             # resource_group runlevel
1068             mygroup start
1069             foogroup start
1070             bargroup 3
1071             bazgroup 2
1072             # missing or commented means 'stop' -- the following two
1073             # lines are equivalent:
1074             redgrp stop
1075             # redgrp start
1076              
1077             =head2 /var/mosix-ha/hastat
1078              
1079             The cluster status file. Rebuilt periodically on each node by
1080             consolidating every node's F</var/mosix-ha/clstat>. Each node's version of
1081             this file normally matches the others. Interesting to read; can be
1082             eval'd by other Perl processes for building automated monitoring
1083             tools.
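
For example, a minimal monitoring sketch (the fixed path and the
C<level>/C<state> fields are taken from the description above and from
the C<hastat()> code):

    #!/usr/bin/perl
    use strict;
    use warnings;

    # hastat holds a Perl dump of a hash: $hastat->{group}{node}{level,state}
    my $hastat = do "/var/mosix-ha/hastat"
        or die "can't read /var/mosix-ha/hastat: $! $@";

    for my $group (sort keys %$hastat) {
        for my $node (sort { $a <=> $b } keys %{ $hastat->{$group} }) {
            printf "%-12s node %-4s level=%-6s state=%s\n",
                $group, $node,
                $hastat->{$group}{$node}{level},
                $hastat->{$group}{$node}{state};
        }
    }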
1084              
1085             =head2 /var/mosix-ha/clstat
1086              
1087             The per-node status file; see
1088             L<Cluster::Init>. Not very interesting unless
1089             you're troubleshooting OpenMosix::HA itself -- see
1090             F</var/mosix-ha/hastat> instead.
1091              
1092             =head1 BUGS
1093              
1094             The underlying module, Cluster::Init, has a Perl 5.8 compatibility
1095             problem, documented there; fix targeted for next point release.
1096              
1097             Quorum counting accidentally counts nodes that are up but not running
1098             OpenMosix::HA; easy fix, to be done in next point release.
1099              
1100             This version currently spits out debug messages every few seconds.
1101              
1102             No test cases for monitor() yet.
1103              
1104             Right now we don't detect or act on errors in cltab.
1105              
1106             At this time, B is a very minimal script which just gets the
1107             job done, and probably will need some more work once we figure out
1108             what else it might need to do.
1109              
1110             =head1 SUPPORT
1111              
1112             Commercial support for B<OpenMosix::HA> is available; see the sites
1113             listed under L<"SEE ALSO">. There you'll also find
1114             pointers to the latest version, a community mailing list, and other
1115             cluster management software.
1116              
1117             You can also find help for general infrastructure (and cluster)
1118             administration at Infrastructures.Org.
1119              
1120             =head1 AUTHOR
1121              
1122             Steve Traugott
1123             CPAN ID: STEVEGT
1124             stevegt@TerraLuna.Org
1125             http://www.stevegt.com
1126              
1127             =head1 COPYRIGHT
1128              
1129             Copyright (c) 2003 Steve Traugott. All rights reserved.
1130             This program is free software; you can redistribute
1131             it and/or modify it under the same terms as Perl itself.
1132              
1133             The full text of the license can be found in the
1134             LICENSE file included with this module.
1135              
1136             =head1 SEE ALSO
1137              
1138             Cluster::Init, openMosix.Org, qlusters.com, Infrastructures.Org
1139              
1140             =cut
1141              
1142             1;
1143              
1144