File Coverage

blib/lib/MogileFS/Worker/Fsck.pm
Criterion Covered Total %
statement 87 323 26.9
branch 0 106 0.0
condition 0 31 0.0
subroutine 29 51 56.8
pod 0 13 0.0
total 116 524 22.1


line stmt bran cond sub pod time code
1             package MogileFS::Worker::Fsck;
2              
3 21     21   130 use strict;
  21         36  
  21         538  
4 21     21   85 use base 'MogileFS::Worker';
  21         35  
  21         2169  
5             use fields (
6 21         89 'opt_nostat', # bool: do we trust mogstoreds? skipping size stats?
7             'opt_checksum', # (class|off|MD5) checksum mode
8 21     21   109 );
  21         32  
9 21     21   1183 use MogileFS::Util qw(every error debug);
  21         44  
  21         900  
10 21     21   97 use MogileFS::Config;
  21         34  
  21         1741  
11 21     21   118 use MogileFS::Server;
  21         29  
  21         382  
12 21     21   78 use List::Util ();
  21         38  
  21         378  
13 21     21   97 use Time::HiRes ();
  21         29  
  21         469  
14              
15 21     21   88 use constant SUCCESS => 0;
  21         32  
  21         932  
16 21     21   113 use constant TEMPORARY => 1;
  21         32  
  21         829  
17 21     21   105 use constant PERMANENT => 2;
  21         48  
  21         884  
18 21     21   101 use constant REPLICATE => 3;
  21         27  
  21         975  
19              
20 21     21   99 use constant EV_NO_PATHS => "NOPA";
  21         37  
  21         806  
21 21     21   93 use constant EV_POLICY_VIOLATION => "POVI";
  21         39  
  21         787  
22 21     21   116 use constant EV_FILE_MISSING => "MISS";
  21         30  
  21         840  
23 21     21   109 use constant EV_BAD_LENGTH => "BLEN";
  21         32  
  21         768  
24 21     21   93 use constant EV_CANT_FIX => "GONE";
  21         33  
  21         804  
25 21     21   95 use constant EV_START_SEARCH => "SRCH";
  21         33  
  21         880  
26 21     21   99 use constant EV_FOUND_FID => "FOND";
  21         35  
  21         808  
27 21     21   93 use constant EV_RE_REPLICATE => "REPL";
  21         30  
  21         865  
28 21     21   105 use constant EV_BAD_COUNT => "BCNT";
  21         28  
  21         825  
29 21     21   101 use constant EV_BAD_CHECKSUM => "BSUM";
  21         40  
  21         953  
30 21     21   108 use constant EV_NO_CHECKSUM => "NSUM";
  21         30  
  21         832  
31 21     21   105 use constant EV_MULTI_CHECKSUM => "MSUM";
  21         29  
  21         783  
32 21     21   93 use constant EV_BAD_HASHTYPE => "BALG";
  21         29  
  21         712  
33              
34 21     21   104 use POSIX ();
  21         43  
  21         6748  
35              
36             my $nowish; # approximate unixtime, updated once per loop.
37              
38 0     0 0   sub watchdog_timeout { 120 }
39              
40             sub work {
41 0     0 0   my $self = shift;
42              
43             # this can be CPU-intensive. let's nice ourselves down.
44 0           POSIX::nice(10);
45              
46 0           my $sto = Mgd::get_store();
47 0           my $max_checked = 0;
48              
49             every(2.0, sub {
50 0     0     my $sleep_set = shift;
51 0           $nowish = time();
52 0           local $Mgd::nowish = $nowish;
53              
54 0           my $queue_todo = $self->queue_todo('fsck');
55             # This counts the same as a $self->still_alive;
56 0           $self->send_to_parent('worker_bored 50 fsck');
57 0 0         return unless @{$queue_todo};
  0            
58 0 0         return unless $self->validate_dbh;
59              
60 0           my @fids = ();
61 0           while (my $todo = shift @{$queue_todo}) {
  0            
62 0           my $fid = MogileFS::FID->new($todo->{fid});
63 0 0         if ($fid->exists) {
64 0           push(@fids, $fid);
65             } else {
66             # FID stopped existing before being checked.
67 0           $sto->delete_fid_from_file_to_queue($fid->id, FSCK_QUEUE);
68             }
69             }
70 0 0         return unless @fids;
71              
72 0   0       $self->{opt_nostat} = MogileFS::Config->server_setting('fsck_opt_policy_only') || 0;
73 0           my $alg = MogileFS::Config->server_setting_cached("fsck_checksum");
74 0 0 0       if (defined($alg) && $alg eq "off") {
75 0           $self->{opt_checksum} = "off";
76             } else {
77 0 0         $self->{opt_checksum} = MogileFS::Checksum->valid_alg($alg) ? $alg : 0;
78             }
79 0           MogileFS::FID->mass_load_devids(@fids);
80              
81             # don't sleep in loop, next round, since we found stuff to work on
82             # this round...
83 0           $sleep_set->(0);
84              
85 0           my $new_max;
86 0           my $hit_problem = 0;
87              
88 0           foreach my $fid (@fids) {
89 0 0         if (!$self->check_fid($fid)) {
90             # some connectivity problem... retry this fid later.
91             # (don't dequeue it)
92 0           $self->still_alive;
93 0           next;
94             }
95 0           $sto->delete_fid_from_file_to_queue($fid->id, FSCK_QUEUE);
96             }
97 0           });
98             }
99              
100             # given a $fid (MogileFS::FID, with pre-populated ->devids data)
101             # return STALLED (0) if there were reachability problems.
102             # return HANDLED (1) if fid was checked (regardless of there being problems or not)
103             # if no problems, no action.
104             # if problems, log & enqueue fixes
105 21     21   129 use constant STALLED => 0;
  21         32  
  21         902  
106 21     21   102 use constant HANDLED => 1;
  21         34  
  21         13830  
107             sub check_fid {
108 0     0 0   my ($self, $fid) = @_;
109              
110             my $fix = sub {
111 0     0     my ($reason, $recheck) = @_;
112 0           my $fixed;
113              
114             # we cached devids without locking for the fast path,
115             # ensure we get an up-to-date list in the slow path.
116 0           $fid->forget_cached_devids;
117              
118 0           my $sto = Mgd::get_store();
119 0 0         unless ($sto->should_begin_replicating_fidid($fid->id)) {
120 0           error("Fsck stalled for fid $fid: failed to acquire lock");
121 0           return STALLED;
122             }
123              
124 0 0         unless ($fid->exists) {
125             # FID stopped existing while doing (or waiting on)
126             # the fast check, give up on this fid
127 0           $sto->note_done_replicating($fid->id);
128 0           return HANDLED;
129             }
130              
131             # we may have a lockless check which failed, retry the check
132             # with the lock and see if it succeeds here:
133 0 0         if ($recheck) {
134 0           $fixed = $recheck->();
135 0 0         if (!$fixed) {
136 0           $fid->fsck_log($reason);
137             }
138             }
139              
140 0   0       $fixed ||= eval { $self->fix_fid($fid) };
  0            
141 0           my $err = $@;
142 0           $sto->note_done_replicating($fid->id);
143 0 0         if (! defined $fixed) {
144 0           error("Fsck stalled for fid $fid: $err");
145 0           return STALLED;
146             }
147 0 0         $fid->fsck_log(EV_CANT_FIX) if ! $fixed;
148              
149             # that might've all taken awhile, let's update our approximate time
150 0           $nowish = $self->still_alive;
151 0           return HANDLED;
152 0           };
153              
154             # first obvious fucked-up case: no devids even presumed to exist.
155 0 0         unless ($fid->devids) {
156             # weird, recheck with a lock and then log it if it fails
157             # and attempt a fix (which will do a search over all
158             # devices as a last-ditch effort to locate it)
159 0     0     return $fix->(EV_NO_PATHS, sub { $fid->devids });
  0            
160             }
161              
162             # first, see if the assumed devids meet the replication policy for
163             # the fid's class.
164 0 0         unless ($fid->devids_meet_policy) {
165             # recheck for policy violation under a lock, logging the violation
166             # if we failed.
167 0     0     return $fix->(EV_POLICY_VIOLATION, sub { $fid->devids_meet_policy });
  0            
168             }
169              
170             # This is a simple fixup case
171             # If we got here, we already know we have no policy violation and
172             # don't need to call $fix->() to just fix a devcount
173 0           $self->maybe_fix_devcount($fid);
174              
175             # missing checksum row
176 0 0 0       if ($fid->class->hashtype && ! $fid->checksum) {
177 0           return $fix->();
178             }
179              
180             # in the fast case, do nothing else (don't check if assumed file
181             # locations are actually there). in the fast case, all we do is
182             # check the replication policy, which is already done, so finish.
183 0 0         return HANDLED if $self->{opt_nostat};
184              
185 0 0 0       if ($self->{opt_checksum} && $self->{opt_checksum} ne "off") {
186 0           return $fix->();
187             }
188              
189             # stat each device to see if it's still there. on first problem,
190             # stop and go into the slow(er) fix function.
191 0           my $err;
192             my $rv = $self->parallel_check_sizes([ $fid->devfids ], sub {
193 0     0     my ($dfid, $disk_size) = @_;
194 0 0         if (! defined $disk_size) {
195 0           my $dev = $dfid->device;
196             # We end up checking is_perm_dead twice, but that's the way the
197             # flow goes...
198 0 0         if ($dev->dstate->is_perm_dead) {
199 0           $err = "needfix";
200 0           return 0;
201             }
202 0           error("Connectivity problem reaching device " . $dev->id . " on host " . $dev->host->ip . "\n");
203 0           $err = "stalled";
204 0           return 0;
205             }
206 0 0         return 1 if $disk_size == $fid->length;
207 0           $err = "needfix";
208             # Note: not doing fsck_log, as fix_fid will log status for each device.
209 0           return 0;
210 0           });
211              
212 0 0         if ($rv) {
    0          
    0          
213 0 0 0       return ($fid->class->hashtype && !($self->{opt_checksum} && $self->{opt_checksum} eq "off"))
214             ? $fix->() : HANDLED;
215             } elsif ($err eq "stalled") {
216 0           return STALLED;
217             } elsif ($err eq "needfix") {
218 0           return $fix->();
219             } else {
220 0           die "Unknown error checking fid sizes in parallel.\n";
221             }
222             }
223              
224             # returns true if all size checks succeeded, false otherwise
225             sub parallel_check_sizes {
226 0     0 0   my ($self, $dflist, $cb) = @_;
227 0           my $expect = scalar @$dflist;
228 0           my ($good, $done) = (0, 0);
229              
230 0           foreach my $df (@$dflist) {
231             $df->size_on_disk(sub {
232 0     0     my ($size) = @_;
233 0           $done++;
234 0 0         if ($cb->($df, $size)) {
235 0           $good++;
236             } else {
237             # use another timer to force PostLoopCallback to run
238 0           Danga::Socket->AddTimer(0, sub { $self->still_alive });
  0            
239             }
240 0           });
241             }
242              
243 0     0     Danga::Socket->SetPostLoopCallback(sub { $done != $expect });
  0            
244 0           Danga::Socket->EventLoop;
245              
246 0           return $good == $expect;
247             }
248              
249             # this is the slow path. if something above in check_fid finds
250             # something amiss in any way, we take the slow path on that fid and
251             # try really hard to fix the situation.
252             #
253             # return true if situation handled, 0 if nothing could be done.
254             # die on errors (like connectivity problems).
255 21     21   131 use constant CANT_FIX => 0;
  21         41  
  21         25868  
256             sub fix_fid {
257 0     0 0   my ($self, $fid) = @_;
258 0           debug(sprintf("Fixing FID %d", $fid->id));
259              
260             # make devfid objects from the devids that this fid is on,
261 0           my @dfids = map { MogileFS::DevFID->new($_, $fid) } $fid->devids;
  0            
262              
263             # track all known good copies (dev objects), as well as all bad
264             # copies (places it should've been, but isn't)
265 0           my @good_devs;
266             my @bad_devs;
267 0           my %already_checked; # devid -> 1.
268 0   0       my $alg = $fid->class->hashname || $self->{opt_checksum};
269 0           my $checksums = {};
270 0     0     my $ping_cb = sub { $self->still_alive };
  0            
271              
272             my $check_dfids = sub {
273 0     0     my $is_desperate_mode = shift;
274              
275             # stat all devices.
276 0           foreach my $dfid (@dfids) {
277 0           my $dev = $dfid->device;
278 0 0         next if $already_checked{$dev->id}++;
279              
280             # Got a dead link, but reaper hasn't cleared it yet?
281 0 0         if ($dev->dstate->is_perm_dead) {
282 0           push @bad_devs, $dev;
283 0           next;
284             }
285              
286 0           my $disk_size = $dfid->size_on_disk;
287 0 0         die "dev " . $dev->id . " unreachable" unless defined $disk_size;
288              
289 0 0         if ($disk_size == $fid->length) {
290 0 0 0       if ($alg && $alg ne "off") {
291 0           my $digest = $self->checksum_on_disk($dfid, $alg, $ping_cb);
292 0 0         unless (defined $digest) {
293 0           die "dev " . $dev->id . " unreachable";
294             }
295              
296             # DELETE could've hit right after size check
297 0 0         if ($digest eq "-1") {
298 0 0         unless ($is_desperate_mode) {
299 0           $fid->fsck_log(EV_FILE_MISSING, $dev);
300             }
301 0           push @bad_devs, $dfid->device;
302 0           next;
303             }
304 0   0       push @{$checksums->{$digest} ||= []}, $dfid->device;
  0            
305             }
306              
307 0           push @good_devs, $dfid->device;
308             # if we were doing a desperate search, one is enough, we can stop now!
309 0 0         return if $is_desperate_mode;
310 0           next;
311             }
312              
313             # don't log in desperate mode, as we'd have "file missing!" log entries
314             # for every device in the normal case, which is expected.
315 0 0         unless ($is_desperate_mode) {
316 0 0         if ($disk_size == -1) {
317 0           $fid->fsck_log(EV_FILE_MISSING, $dev);
318             } else {
319 0           $fid->fsck_log(EV_BAD_LENGTH, $dev);
320             }
321             }
322              
323 0           push @bad_devs, $dfid->device;
324             }
325 0           };
326              
327 0           $check_dfids->();
328              
329             # if we didn't find it anywhere, let's go do an exhaustive search over
330             # all devices, looking for it...
331 0 0         unless (@good_devs) {
332             # replace @dfids with list of all (alive) devices. dups will be ignored by
333             # check_dfids
334 0           $fid->fsck_log(EV_START_SEARCH);
335             @dfids = List::Util::shuffle(
336 0           map { MogileFS::DevFID->new($_, $fid) }
337 0           grep { $_->dstate->should_fsck_search_on }
  0            
338             Mgd::device_factory()->get_all
339             );
340 0           $check_dfids->("desperate");
341              
342             # still can't fix it?
343 0 0         unless (@good_devs) {
344 0           $self->forget_bad_devs($fid, @bad_devs);
345 0           $fid->update_devcount;
346 0           return CANT_FIX;
347             }
348              
349             # wow, we actually found it!
350 0           $fid->note_on_device($good_devs[0]); # at least one good one.
351 0           $fid->fsck_log(EV_FOUND_FID);
352              
353             # fall through to check policy (which will most likely be
354             # wrong, with only one file_on record...) and re-replicate
355             }
356              
357 0           $self->forget_bad_devs($fid, @bad_devs);
358             # in case the devcount or similar was fixed.
359 0           $fid->want_reload;
360              
361 0 0 0       $self->fix_checksums($fid, $alg, $checksums) if $alg && $alg ne "off";
362              
363             # Note: this will reload devids, if they called 'note_on_device'
364             # or 'forget_about_device'
365 0 0         unless ($fid->devids_meet_policy) {
366 0           $fid->enqueue_for_replication(in => 1);
367 0           $fid->fsck_log(EV_RE_REPLICATE);
368 0           return HANDLED;
369             }
370            
371             # Clean up the device count if it's wrong
372 0           $self->maybe_fix_devcount($fid);
373              
374 0           return HANDLED;
375             }
376              
377             sub forget_file_on_with_bad_checksums {
378 0     0 0   my ($self, $fid, $checksums) = @_;
379 0           foreach my $bdevs (values %$checksums) {
380 0           foreach my $bdev (@$bdevs) {
381 0           error("removing file_on mapping for fid=" . $fid->id . ", dev=" . $bdev->id);
382 0           $fid->forget_about_device($bdev);
383             }
384             }
385             }
386              
387             # returns -1 on missing,
388             # undef on connectivity error,
389             # else checksum of file on disk (after HTTP GET or mogstored read)
390             sub checksum_on_disk {
391 0     0 0   my ($self, $dfid, $alg, $ping_cb) = @_;
392 0           return $dfid->checksum_on_disk($alg, $ping_cb, "fsck");
393             }
394              
395             sub bad_checksums_errmsg {
396 0     0 0   my ($self, $alg, $checksums) = @_;
397 0           my @err;
398              
399 0           foreach my $checksum (keys %$checksums) {
400 0           my $bdevs = join(",", map { $_->id } @{$checksums->{$checksum}});
  0            
  0            
401 0           $checksum = unpack("H*", $checksum);
402 0           push @err, "$alg:$checksum on devids=[$bdevs]"
403             }
404              
405 0           return join('; ', @err);
406             }
407              
408             # we don't know what checksum the file is supposed to be, but some
409             # of the devices had checksums that didn't match the other(s).
410             sub auto_checksums_bad {
411 0     0 0   my ($self, $fid, $checksums) = @_;
412 0           my $alg = $self->{opt_checksum};
413 0           my $err = $self->bad_checksums_errmsg($alg, $checksums);
414              
415 0           error("$fid has multiple checksums: $err");
416 0           $fid->fsck_log(EV_MULTI_CHECKSUM);
417             }
418              
419             sub all_checksums_bad {
420 0     0 0   my ($self, $fid, $checksums) = @_;
421 0 0         my $alg = $fid->class->hashname or return; # class could've changed
422 0           my $cur_checksum = $fid->checksum;
423 0           my $err = $self->bad_checksums_errmsg($alg, $checksums);
424 0 0         my $cur = $cur_checksum ? "Expected: $cur_checksum"
425             : "No known valid checksum";
426 0           error("all checksums bad: $err. $cur");
427 0           $fid->fsck_log(EV_BAD_CHECKSUM);
428             }
429              
430             sub fix_checksums {
431 0     0 0   my ($self, $fid, $alg, $checksums) = @_;
432 0           my $cur_checksum = $fid->checksum;
433 0           my @all_checksums = keys(%$checksums);
434              
435 0 0         if (scalar(@all_checksums) == 1) { # all checksums match, good!
    0          
    0          
436 0           my $disk_checksum = $all_checksums[0];
437 0 0         if ($cur_checksum) {
438 0 0         if ($cur_checksum->{checksum} ne $disk_checksum) {
439 0           my $expect = $cur_checksum->info;
440 0           my $actual = "$alg:" . unpack("H*", $disk_checksum);
441 0           error("$cur_checksum does not match disk: $actual");
442 0 0         if ($alg ne $cur_checksum->hashname) {
443 0           $fid->fsck_log(EV_BAD_HASHTYPE);
444             } else {
445 0           $fid->fsck_log(EV_BAD_CHECKSUM);
446             }
447             }
448             } else { # fresh row to checksum
449 0           my $hashtype = $fid->class->hashtype;
450              
451             # we store this in the database
452 0 0         if ($hashtype) {
453 0           my %row = (
454             fid => $fid->id,
455             checksum => $disk_checksum,
456             hashtype => $hashtype,
457             );
458 0           my $new_checksum = MogileFS::Checksum->new(\%row);
459 0           debug("creating new checksum=$new_checksum");
460 0           $fid->fsck_log(EV_NO_CHECKSUM);
461 0           $new_checksum->save;
462             } else {
463 0           my $hex_checksum = unpack("H*", $disk_checksum);
464 0           my $alg = $self->{opt_checksum};
465 0           debug("fsck_checksum=auto good: $fid $alg:$hex_checksum");
466             }
467             }
468             } elsif ($cur_checksum) {
469 0           my $good = delete($checksums->{$cur_checksum->{checksum}});
470 0 0 0       if ($good && (scalar(@$good) > 0)) {
471 0           $self->forget_file_on_with_bad_checksums($fid, $checksums);
472             # will fail $fid->devids_meet_policy and re-replicate
473             } else {
474 0           $self->all_checksums_bad($fid, $checksums);
475             }
476             } elsif ($self->{opt_checksum}) {
477 0           $self->auto_checksums_bad($fid, $checksums);
478             } else {
479 0           $self->all_checksums_bad($fid, $checksums);
480             }
481             }
482              
483             # remove the file_on mappings for devices that were bogus/missing.
484             sub forget_bad_devs {
485 0     0 0   my ($self, $fid, @bad_devs) = @_;
486 0           foreach my $bdev (@bad_devs) {
487 0           error("removing file_on mapping for fid=" . $fid->id . ", dev=" . $bdev->id);
488 0           $fid->forget_about_device($bdev);
489             }
490             }
491              
492             sub maybe_fix_devcount {
493             # don't even log BCNT errors if skip_devcount is enabled
494 0 0   0 0   return if MogileFS::Config->server_setting_cached('skip_devcount');
495              
496 0           my ($self, $fid) = @_;
497 0 0         return if scalar($fid->devids) == $fid->devcount;
498             # log a bad count
499 0           $fid->fsck_log(EV_BAD_COUNT);
500 0           $fid->update_devcount();
501             }
502              
503             1;
504              
505             # Local Variables:
506             # mode: perl
507             # c-basic-indent: 4
508             # indent-tabs-mode: nil
509             # End: